From ca0d92227e3a5e5880dde67da9d96c6d06454328 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 25 Oct 2024 15:40:33 -0400 Subject: [PATCH 001/113] [Bugfix] Fix compressed_tensors_moe bad config.strategy (#9677) --- .../quantization/compressed_tensors/compressed_tensors_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 733eece4b5fa6..c21aaa40ff2cc 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -245,7 +245,7 @@ def __init__( config = self.quant_config.target_scheme_map["Linear"].get("weights") self.num_bits = config.num_bits self.packed_factor = 32 // config.num_bits - self.strategy = config.strategy.value + self.strategy = config.strategy self.group_size = config.group_size assert config.symmetric, ( "Only symmetric quantization is supported for MoE") From 228cfbd03fd1ad9b26001817a6d414cc9f2c22ae Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Fri, 25 Oct 2024 17:32:10 -0400 Subject: [PATCH 002/113] [Doc] Improve quickstart documentation (#9256) Signed-off-by: Rafael Vasquez --- docs/source/getting_started/quickstart.rst | 98 ++++++++++++---------- 1 file changed, 52 insertions(+), 46 deletions(-) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 80b19ac672936..f0e6cddf09ef7 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -1,38 +1,50 @@ .. _quickstart: +========== Quickstart ========== -This guide shows how to use vLLM to: +This guide will help you quickly get started with vLLM to: -* run offline batched inference on a dataset; -* build an API server for a large language model; -* start an OpenAI-compatible API server. +* :ref:`Run offline batched inference ` +* :ref:`Run OpenAI-compatible inference ` -Be sure to complete the :ref:`installation instructions ` before continuing with this guide. +Prerequisites +-------------- +- OS: Linux +- Python: 3.8 - 3.12 +- GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) -.. note:: +Installation +-------------- + +You can install vLLM using pip. It's recommended to use `conda `_ to create and manage Python environments. + +.. code-block:: console - By default, vLLM downloads model from `HuggingFace `_. If you would like to use models from `ModelScope `_ in the following examples, please set the environment variable: + $ conda create -n myenv python=3.10 -y + $ conda activate myenv + $ pip install vllm - .. code-block:: shell +Please refer to the :ref:`installation documentation ` for more details on installing vLLM. - export VLLM_USE_MODELSCOPE=True +.. _offline_batched_inference: Offline Batched Inference ------------------------- -We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. +With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). The example script for this section can be found `here `__. + +The first line of this example imports the classes :class:`~vllm.LLM` and :class:`~vllm.SamplingParams`: -Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM. -The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine. -The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process. +- :class:`~vllm.LLM` is the main class for running offline inference with vLLM engine. +- :class:`~vllm.SamplingParams` specifies the parameters for the sampling process. .. code-block:: python from vllm import LLM, SamplingParams -Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition `_. +The next section defines a list of input prompts and sampling parameters for text generation. The `sampling temperature `_ is set to ``0.8`` and the `nucleus sampling probability `_ is set to ``0.95``. You can find more information about the sampling parameters `here `__. .. code-block:: python @@ -44,46 +56,46 @@ Define the list of input prompts and the sampling parameters for generation. The ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. +The :class:`~vllm.LLM` class initializes vLLM's engine and the `OPT-125M model `_ for offline inference. The list of supported models can be found :ref:`here `. .. code-block:: python llm = LLM(model="facebook/opt-125m") -Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. +.. note:: + + By default, vLLM downloads models from `HuggingFace `_. If you would like to use models from `ModelScope `_, set the environment variable ``VLLM_USE_MODELSCOPE`` before initializing the engine. + +Now, the fun part! The outputs are generated using ``llm.generate``. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all of the output tokens. .. code-block:: python outputs = llm.generate(prompts, sampling_params) - # Print the outputs. for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - -The code example can also be found in `examples/offline_inference.py `_. +.. _openai_compatible_server: OpenAI-Compatible Server ------------------------ vLLM can be deployed as a server that implements the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. -By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the command below) and implements `list models `_, `create chat completion `_, and `create completion `_ endpoints. We are actively adding support for more endpoints. +By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time and implements endpoints such as `list models `_, `create chat completion `_, and `create completion `_ endpoints. -Start the server: +Run the following command to start the vLLM server with the `Qwen2.5-1.5B-Instruct `_ model: .. code-block:: console - $ vllm serve facebook/opt-125m + $ vllm serve Qwen/Qwen2.5-1.5B-Instruct -By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument: - -.. code-block:: console +.. note:: - $ vllm serve facebook/opt-125m --chat-template ./examples/template_chatml.jinja + By default, the server uses a predefined chat template stored in the tokenizer. You can learn about overriding it `here `__. -This server can be queried in the same format as OpenAI API. For example, list the models: +This server can be queried in the same format as OpenAI API. For example, to list the models: .. code-block:: console @@ -91,17 +103,17 @@ This server can be queried in the same format as OpenAI API. For example, list t You can pass in the argument ``--api-key`` or environment variable ``VLLM_API_KEY`` to enable the server to check for API key in the header. -Using OpenAI Completions API with vLLM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +OpenAI Completions API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Query the model with input prompts: +Once your server is started, you can query the model with input prompts: .. code-block:: console $ curl http://localhost:8000/v1/completions \ $ -H "Content-Type: application/json" \ $ -d '{ - $ "model": "facebook/opt-125m", + $ "model": "Qwen/Qwen2.5-1.5B-Instruct", $ "prompt": "San Francisco is a", $ "max_tokens": 7, $ "temperature": 0 @@ -120,36 +132,32 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep api_key=openai_api_key, base_url=openai_api_base, ) - completion = client.completions.create(model="facebook/opt-125m", + completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", prompt="San Francisco is a") print("Completion result:", completion) -For a more detailed client example, refer to `examples/openai_completion_client.py `_. - -Using OpenAI Chat API with vLLM -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +A more detailed client example can be found `here `__. -The vLLM server is designed to support the OpenAI Chat API, allowing you to engage in dynamic conversations with the model. The chat interface is a more interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. +OpenAI Chat API with vLLM +~~~~~~~~~~~~~~~~~~~~~~~~~~ -Querying the model using OpenAI Chat API: +vLLM is designed to also support the OpenAI Chat API. The chat interface is a more dynamic, interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations. -You can use the `create chat completion `_ endpoint to communicate with the model in a chat-like interface: +You can use the `create chat completion `_ endpoint to interact with the model: .. code-block:: console $ curl http://localhost:8000/v1/chat/completions \ $ -H "Content-Type: application/json" \ $ -d '{ - $ "model": "facebook/opt-125m", + $ "model": "Qwen/Qwen2.5-1.5B-Instruct", $ "messages": [ $ {"role": "system", "content": "You are a helpful assistant."}, $ {"role": "user", "content": "Who won the world series in 2020?"} $ ] $ }' -Python Client Example: - -Using the `openai` python package, you can also communicate with the model in a chat-like manner: +Alternatively, you can use the `openai` python package: .. code-block:: python @@ -164,12 +172,10 @@ Using the `openai` python package, you can also communicate with the model in a ) chat_response = client.chat.completions.create( - model="facebook/opt-125m", + model="Qwen/Qwen2.5-1.5B-Instruct", messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a joke."}, ] ) print("Chat response:", chat_response) - -For more in-depth examples and advanced features of the chat API, you can refer to the official OpenAI documentation. From 6567e13724110fac2042d06a9e4c01fd822e8909 Mon Sep 17 00:00:00 2001 From: Travis Johnson Date: Fri, 25 Oct 2024 16:42:56 -0600 Subject: [PATCH 003/113] [Bugfix] Fix crash with llama 3.2 vision models and guided decoding (#9631) Signed-off-by: Travis Johnson Co-authored-by: pavlo-ruban Co-authored-by: Nick Hill --- .../guided_decoding/outlines_logits_processors.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py index c28bd71c9f682..e1309c31f77e7 100644 --- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py +++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py @@ -15,11 +15,11 @@ # limitations under the License. import copy import json -import math from collections import defaultdict from functools import lru_cache from typing import Callable, DefaultDict, Dict, List, Union +import numpy as np import torch from lark import Lark from outlines import grammars @@ -77,9 +77,17 @@ def __call__(self, input_ids: List[int], f"Unsupported instruction type {type(instruction)}") mask = torch.full((scores.shape[-1], ), - -math.inf, + -torch.inf, device=scores.device) - mask[allowed_tokens] = 0 + # The tokenizer may support more token ids than the model can generate, + # eg. Llama 3.2 Vision models have an `<|image|>` token with id 128256 + # but scores.shape == torch.Size([128256]) + # Using NumPy is faster for filtering token ids + allowed_tokens = np.array(allowed_tokens, dtype=np.int64) + allowed_tokens = torch.tensor(allowed_tokens, device=scores.device) + allowed_tokens = allowed_tokens.masked_select( + allowed_tokens < scores.shape[-1]) + mask.index_fill_(0, allowed_tokens, 0) scores.add_(mask) return scores From 067e77f9a87c3466fce41c8fe8710fddc69ec26c Mon Sep 17 00:00:00 2001 From: Sam Stoelinga Date: Fri, 25 Oct 2024 22:05:47 -0700 Subject: [PATCH 004/113] [Bugfix] Steaming continuous_usage_stats default to False (#9709) Signed-off-by: Sam Stoelinga --- vllm/entrypoints/openai/protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 733decf80a711..a212c0d608ddb 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -127,7 +127,7 @@ class ResponseFormat(OpenAIBaseModel): class StreamOptions(OpenAIBaseModel): include_usage: Optional[bool] = True - continuous_usage_stats: Optional[bool] = True + continuous_usage_stats: Optional[bool] = False class FunctionDefinition(OpenAIBaseModel): From 5cbdccd151ef50e3fc040690248a8d86d3b93c2a Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Sat, 26 Oct 2024 18:59:06 +0800 Subject: [PATCH 005/113] [Hardware][openvino] is_openvino --> current_platform.is_openvino (#9716) --- tests/kernels/test_attention_selector.py | 3 +- vllm/attention/selector.py | 4 +-- vllm/config.py | 4 +-- vllm/executor/openvino_executor.py | 20 +++++-------- vllm/model_executor/model_loader/openvino.py | 4 +-- vllm/platforms/__init__.py | 10 +++++++ vllm/platforms/interface.py | 4 +++ vllm/platforms/openvino.py | 31 ++++++++++++++++++++ vllm/utils.py | 11 +------ vllm/worker/openvino_worker.py | 16 +++++----- 10 files changed, 69 insertions(+), 38 deletions(-) create mode 100644 vllm/platforms/openvino.py diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 8bcee98403775..df3e770e260e0 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -30,7 +30,8 @@ def test_env(name: str, device: str, monkeypatch): False) assert backend.name == "ROCM_FLASH" elif device == "openvino": - with patch("vllm.attention.selector.is_openvino", return_value=True): + with patch("vllm.attention.selector.current_platform.is_openvino", + return_value=True): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "OPENVINO" diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index cd3c642b8c8a2..10d4509b38279 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -10,7 +10,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino +from vllm.utils import STR_BACKEND_ENV_VAR, is_hip logger = init_logger(__name__) @@ -193,7 +193,7 @@ def which_attn_to_use( logger.info("Cannot use %s backend on CPU.", selected_backend) return _Backend.TORCH_SDPA - if is_openvino(): + if current_platform.is_openvino(): if selected_backend != _Backend.OPENVINO: logger.info("Cannot use %s backend on OpenVINO.", selected_backend) return _Backend.OPENVINO diff --git a/vllm/config.py b/vllm/config.py index 25f841231dedd..a1fba98233b80 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -17,7 +17,7 @@ get_hf_image_processor_config, get_hf_text_config) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - is_hip, is_openvino, print_warning_once) + is_hip, print_warning_once) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -1117,7 +1117,7 @@ def __init__(self, device: str = "auto") -> None: self.device_type = "cuda" elif current_platform.is_neuron(): self.device_type = "neuron" - elif is_openvino(): + elif current_platform.is_openvino(): self.device_type = "openvino" elif current_platform.is_tpu(): self.device_type = "tpu" diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py index 4a39839a03199..d0c0333854dae 100644 --- a/vllm/executor/openvino_executor.py +++ b/vllm/executor/openvino_executor.py @@ -10,6 +10,7 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest from vllm.utils import (GiB_bytes, get_distributed_init_method, get_ip, get_open_port, make_async) @@ -17,14 +18,6 @@ logger = init_logger(__name__) -def is_openvino_cpu() -> bool: - return "CPU" in envs.VLLM_OPENVINO_DEVICE - - -def is_openvino_gpu() -> bool: - return "GPU" in envs.VLLM_OPENVINO_DEVICE - - class OpenVINOExecutor(ExecutorBase): uses_ray: bool = False @@ -32,7 +25,8 @@ class OpenVINOExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "openvino" assert self.lora_config is None, "OpenVINO backend doesn't support LoRA" - assert is_openvino_cpu() or is_openvino_gpu(), \ + assert current_platform.is_openvino_cpu() or \ + current_platform.is_openvino_gpu(), \ "OpenVINO backend supports only CPU and GPU devices" self.ov_core = ov.Core() @@ -163,7 +157,7 @@ def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: def _verify_and_get_cache_config(ov_core: ov.Core, config: CacheConfig) -> CacheConfig: if envs.VLLM_OPENVINO_CPU_KV_CACHE_PRECISION == "u8": - if not is_openvino_cpu(): + if not current_platform.is_openvino_cpu(): logger.info("VLLM_OPENVINO_CPU_KV_CACHE_PRECISION is" "ignored for GPU, f16 data type will be used.") config.cache_dtype = ov.Type.f16 @@ -172,7 +166,7 @@ def _verify_and_get_cache_config(ov_core: ov.Core, "VLLM_OPENVINO_CPU_KV_CACHE_PRECISION env var.") config.cache_dtype = ov.Type.u8 else: - if is_openvino_cpu(): + if current_platform.is_openvino_cpu(): ov_device = envs.VLLM_OPENVINO_DEVICE inference_precision = ov_core.get_property( ov_device, hints.inference_precision) @@ -183,7 +177,7 @@ def _verify_and_get_cache_config(ov_core: ov.Core, else: config.cache_dtype = ov.Type.f16 - if is_openvino_cpu(): + if current_platform.is_openvino_cpu(): if config.block_size != 32: logger.info( f"OpenVINO CPU optimal block size is 32, overriding currently set {config.block_size}" # noqa: G004, E501 @@ -198,7 +192,7 @@ def _verify_and_get_cache_config(ov_core: ov.Core, kv_cache_space = envs.VLLM_OPENVINO_KVCACHE_SPACE if kv_cache_space >= 0: - if kv_cache_space == 0 and is_openvino_cpu(): + if kv_cache_space == 0 and current_platform.is_openvino_cpu(): config.openvino_kvcache_space_bytes = 4 * GiB_bytes # type: ignore logger.warning( "Environment variable VLLM_OPENVINO_KVCACHE_SPACE (GB) " diff --git a/vllm/model_executor/model_loader/openvino.py b/vllm/model_executor/model_loader/openvino.py index 88b7ac46e5541..8ada2210d0d51 100644 --- a/vllm/model_executor/model_loader/openvino.py +++ b/vllm/model_executor/model_loader/openvino.py @@ -12,12 +12,12 @@ import vllm.envs as envs from vllm.attention.backends.openvino import OpenVINOAttentionMetadata from vllm.config import DeviceConfig, ModelConfig -from vllm.executor.openvino_executor import is_openvino_cpu from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import (LogitsProcessor, _prune_hidden_states) from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform logger = init_logger(__name__) @@ -136,7 +136,7 @@ def __init__( ov_device = envs.VLLM_OPENVINO_DEVICE paged_attention_transformation(pt_model.model) _modify_cache_parameters(pt_model.model, kv_cache_dtype, - is_openvino_cpu()) + current_platform.is_openvino_cpu()) ov_compiled = ov_core.compile_model(pt_model.model, ov_device) self.ov_request = ov_compiled.create_infer_request() diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 58912158139bd..7e9f8b1297b80 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -65,6 +65,13 @@ except ImportError: pass +is_openvino = False +try: + from importlib.metadata import version + is_openvino = "openvino" in version("vllm") +except Exception: + pass + if is_tpu: # people might install pytorch built with cuda but run on tpu # so we need to check tpu first @@ -85,6 +92,9 @@ elif is_neuron: from .neuron import NeuronPlatform current_platform = NeuronPlatform() +elif is_openvino: + from .openvino import OpenVinoPlatform + current_platform = OpenVinoPlatform() else: current_platform = UnspecifiedPlatform() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index d36367f2bc9c1..7c933385d6ff6 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -11,6 +11,7 @@ class PlatformEnum(enum.Enum): XPU = enum.auto() CPU = enum.auto() NEURON = enum.auto() + OPENVINO = enum.auto() UNSPECIFIED = enum.auto() @@ -52,6 +53,9 @@ def is_cpu(self) -> bool: def is_neuron(self) -> bool: return self._enum == PlatformEnum.NEURON + def is_openvino(self) -> bool: + return self._enum == PlatformEnum.OPENVINO + def is_cuda_alike(self) -> bool: """Stateless version of :func:`torch.cuda.is_available`.""" return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM) diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py new file mode 100644 index 0000000000000..35dbe22abf7ff --- /dev/null +++ b/vllm/platforms/openvino.py @@ -0,0 +1,31 @@ +import torch + +import vllm.envs as envs +from vllm.utils import print_warning_once + +from .interface import Platform, PlatformEnum + + +class OpenVinoPlatform(Platform): + _enum = PlatformEnum.OPENVINO + + @classmethod + def get_device_name(self, device_id: int = 0) -> str: + return "openvino" + + @classmethod + def inference_mode(self): + return torch.inference_mode(mode=True) + + @classmethod + def is_openvino_cpu(self) -> bool: + return "CPU" in envs.VLLM_OPENVINO_DEVICE + + @classmethod + def is_openvino_gpu(self) -> bool: + return "GPU" in envs.VLLM_OPENVINO_DEVICE + + @classmethod + def is_pin_memory_available(self) -> bool: + print_warning_once("Pin memory is not supported on OpenViNO.") + return False diff --git a/vllm/utils.py b/vllm/utils.py index 0e9b241b6f9f6..fba9804289b94 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -318,15 +318,6 @@ def is_hip() -> bool: return torch.version.hip is not None -@lru_cache(maxsize=None) -def is_openvino() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - return "openvino" in version("vllm") - except PackageNotFoundError: - return False - - @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -757,7 +748,7 @@ def is_pin_memory_available() -> bool: elif current_platform.is_neuron(): print_warning_once("Pin memory is not supported on Neuron.") return False - elif current_platform.is_cpu() or is_openvino(): + elif current_platform.is_cpu() or current_platform.is_openvino(): return False return True diff --git a/vllm/worker/openvino_worker.py b/vllm/worker/openvino_worker.py index bc245d19663d6..a420d390c1ae4 100644 --- a/vllm/worker/openvino_worker.py +++ b/vllm/worker/openvino_worker.py @@ -13,12 +13,12 @@ from vllm.distributed import (broadcast_tensor_dict, ensure_model_parallel_initialized, init_distributed_environment) -from vllm.executor.openvino_executor import is_openvino_cpu from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.platforms import current_platform from vllm.sampling_params import SamplingParams from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata from vllm.worker.openvino_model_runner import OpenVINOModelRunner @@ -99,7 +99,7 @@ def _allocate_kv_cache( num_blocks, self.block_size, self.num_kv_heads, self.head_size)[1:] kv_cache: List[Tuple[ov.Tensor, ov.Tensor]] = [] - if is_openvino_cpu(): + if current_platform.is_openvino_cpu(): for _ in range(self.num_layers): key_blocks = ov.Tensor(self.cache_config.cache_dtype, k_block_shape) @@ -141,7 +141,7 @@ def _allocate_swap_cache( if num_blocks == 0: return swap_cache - assert not is_openvino_cpu(), \ + assert not current_platform.is_openvino_cpu(), \ "CPU device isn't supposed to have swap cache" # Update key_cache shape: @@ -285,7 +285,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: cache_block_size = self.get_cache_block_size_bytes() kvcache_space_bytes = self.cache_config.openvino_kvcache_space_bytes - if is_openvino_cpu(): + if current_platform.is_openvino_cpu(): num_device_blocks = int(kvcache_space_bytes // cache_block_size) num_swap_blocks = 0 else: @@ -322,7 +322,7 @@ def initialize_cache(self, num_gpu_blocks: int, num_device_blocks = num_gpu_blocks num_swap_blocks = num_cpu_blocks - if is_openvino_cpu(): + if current_platform.is_openvino_cpu(): assert (num_swap_blocks == 0 ), f"{type(self)} does not support swappable cache for CPU" @@ -366,7 +366,7 @@ def _init_cache_engine(self) -> None: assert self.kv_cache is not None # Populate the cache to warmup the memory - if is_openvino_cpu(): + if current_platform.is_openvino_cpu(): for key_cache, value_cache in self.kv_cache: key_cache.data[:] = 0 value_cache.data[:] = 0 @@ -414,7 +414,7 @@ def execute_model( blocks_to_swap_in = data["blocks_to_swap_in"] blocks_to_swap_out = data["blocks_to_swap_out"] - if is_openvino_cpu(): + if current_platform.is_openvino_cpu(): assert len(execute_model_req.blocks_to_swap_in) == 0 assert len(execute_model_req.blocks_to_swap_out) == 0 else: @@ -466,7 +466,7 @@ def get_cache_block_size_bytes(self) -> int: def profile_run(self) -> int: ov_device = envs.VLLM_OPENVINO_DEVICE - assert not is_openvino_cpu(), \ + assert not current_platform.is_openvino_cpu(), \ "CPU device isn't supposed to use profile run." import openvino.properties.device as device From 55137e8ee32509b2fa3b83d5caaee018a929f82d Mon Sep 17 00:00:00 2001 From: ErkinSagiroglu <52523336+MErkinSag@users.noreply.github.com> Date: Sat, 26 Oct 2024 13:12:57 +0100 Subject: [PATCH 006/113] Fix: MI100 Support By Bypassing Custom Paged Attention (#9560) --- vllm/attention/backends/rocm_flash_attn.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index c2aec4aaa74e7..30859dfa60634 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -21,7 +21,10 @@ logger = init_logger(__name__) _PARTITION_SIZE_ROCM = 512 -_ON_NAVI = "gfx1" in torch.cuda.get_device_properties("cuda").gcnArchName +_GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName +_ON_NAVI = "gfx1" in _GPU_ARCH +_ON_MI250_MI300 = any(arch in _GPU_ARCH + for arch in ["gfx90a", "gfx940", "gfx941", "gfx942"]) class ROCmFlashAttentionBackend(AttentionBackend): @@ -662,7 +665,8 @@ def _use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int, block_size: int, gqa_ratio: int, max_seq_len: int) -> bool: # rocm custom page attention not support on navi (gfx1*) - return (not _ON_NAVI and (qtype == torch.half or qtype == torch.bfloat16) + return (_ON_MI250_MI300 and not _ON_NAVI + and (qtype == torch.half or qtype == torch.bfloat16) and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) and (gqa_ratio >= 1 and gqa_ratio <= 16) and max_seq_len <= 32768) From 07e981fdf43bb7a7186c782a5ad6b99b36c2fc19 Mon Sep 17 00:00:00 2001 From: Vasiliy Alekseev Date: Sat, 26 Oct 2024 19:29:38 +0300 Subject: [PATCH 007/113] [Frontend] Bad words sampling parameter (#9717) Signed-off-by: Vasily Alexeev --- tests/samplers/test_no_bad_words.py | 185 ++++++++++++++++++ vllm/engine/llm_engine.py | 13 +- vllm/logits_process.py | 119 +++++++++++ .../guided_decoding/__init__.py | 3 +- .../lm_format_enforcer_decoding.py | 3 +- vllm/sampling_params.py | 32 +-- 6 files changed, 339 insertions(+), 16 deletions(-) create mode 100644 tests/samplers/test_no_bad_words.py create mode 100644 vllm/logits_process.py diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py new file mode 100644 index 0000000000000..4190cf7cd7664 --- /dev/null +++ b/tests/samplers/test_no_bad_words.py @@ -0,0 +1,185 @@ +"""Make sure bad_words works. + +Run `pytest tests/samplers/test_no_bad_words.py`. + +""" +from typing import List, Optional + +from transformers import AutoTokenizer + +from vllm import LLM, SamplingParams + + +def _generate( + model: LLM, + prompt: str, + num_prompt_tokens: int, + temperature: float = 0, + bad_words: Optional[List[str]] = None, +) -> List[int]: + sampling_params = SamplingParams( + temperature=temperature, + bad_words=bad_words, + ) + + # [([output_token_ids, ], [output_text, ]), ] + output = model.generate([prompt], sampling_params=sampling_params) + + output_token_ids = output[0][0][0][num_prompt_tokens:] + # [0] first (and only) request output + # [0] token_ids (not text) + # [0] first (and only) output completion + + return output_token_ids + + +class TestOneTokenBadWord: + MODEL = "TheBloke/Llama-2-7B-fp16" + + PROMPT = "Hi! How are" + TARGET_TOKEN = "you" + + def setup_method(self, method): + self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL, + add_prefix_space=True) + + self.num_prompt_tokens = len(self._encode(self.PROMPT)) + self.target_token_id = self._encode(self.TARGET_TOKEN, + add_special_tokens=False)[0] + + def test_one_token_bad_word(self, vllm_runner): + with vllm_runner(self.MODEL) as llm: + output_token_ids = self._generate(llm) + assert output_token_ids[0] == self.target_token_id + + output_token_ids = self._generate(llm, + bad_words=[self.TARGET_TOKEN]) + assert self.target_token_id not in output_token_ids + + def _generate(self, + model: LLM, + bad_words: Optional[List[str]] = None) -> List[int]: + return _generate( + model=model, + prompt=self.PROMPT, + num_prompt_tokens=self.num_prompt_tokens, + bad_words=bad_words, + ) + + def _encode(self, + prompt: str, + add_special_tokens: bool = True) -> List[int]: + return self.tokenizer(prompt, + add_special_tokens=add_special_tokens).input_ids + + +class TestTwoTokenBadWord: + # Another model (with a different tokenizer behaviour) + MODEL = "openai-community/gpt2" + + PROMPT = "How old are you? I am 10" + TARGET_TOKEN1 = "years" + TARGET_TOKEN2 = "old" + NEIGHBOUR_TOKEN2 = "older" + + def setup_method(self, method): + self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL, + add_prefix_space=True) + + self.num_prompt_tokens = len(self._encode(self.PROMPT)) + self.target_token_id1 = self._encode(self.TARGET_TOKEN1, + add_special_tokens=False)[0] + self.target_token_id2 = self._encode(self.TARGET_TOKEN2, + add_special_tokens=False)[0] + self.neighbour_token_id2 = self._encode(self.NEIGHBOUR_TOKEN2, + add_special_tokens=False)[0] + + def test_two_token_bad_word(self, vllm_runner): + with vllm_runner(self.MODEL) as llm: + output_token_ids = self._generate(llm) + assert output_token_ids[:2] == [ + self.target_token_id1, self.target_token_id2 + ] + + output_token_ids = self._generate(llm, + bad_words=[self.TARGET_TOKEN1]) + assert self.target_token_id1 not in output_token_ids + + output_token_ids = self._generate(llm, + bad_words=[self.TARGET_TOKEN2]) + assert output_token_ids[0] == self.target_token_id1 + assert self.target_token_id2 not in output_token_ids + + output_token_ids = self._generate( + llm, bad_words=[f'{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}']) + assert output_token_ids[0] == self.target_token_id1 + assert output_token_ids[:2] != [ + self.target_token_id1, self.target_token_id2 + ] + assert not self._contains( + output_token_ids, + [self.target_token_id1, self.target_token_id2]) + # Model dependent behaviour + assert output_token_ids[:2] == [ + self.target_token_id1, self.neighbour_token_id2 + ] + + output_token_ids = self._generate( + llm, + bad_words=[ + f'{self.TARGET_TOKEN1} {self.TARGET_TOKEN2}', + f'{self.TARGET_TOKEN1} {self.NEIGHBOUR_TOKEN2}' + ]) + assert output_token_ids[0] == self.target_token_id1 + assert output_token_ids[:2] != [ + self.target_token_id1, self.target_token_id2 + ] + assert not self._contains( + output_token_ids, + [self.target_token_id1, self.target_token_id2]) + assert output_token_ids[:2] != [ + self.target_token_id1, self.neighbour_token_id2 + ] + assert not self._contains( + output_token_ids, + [self.target_token_id1, self.neighbour_token_id2]) + assert ((self.target_token_id2 in output_token_ids) + or (self.neighbour_token_id2 in output_token_ids)) + + def _generate(self, + model: LLM, + bad_words: Optional[List[str]] = None) -> List[int]: + return _generate( + model=model, + prompt=self.PROMPT, + num_prompt_tokens=self.num_prompt_tokens, + bad_words=bad_words, + ) + + @staticmethod + def _contains(sequence: List[int], subsequence: List[int]) -> bool: + searched = False + + for start in range(len(sequence)): + end = start + len(subsequence) + current_subsequence = sequence[start:end] + + if len(current_subsequence) < len(subsequence): + continue + + searched = True + + assert len(current_subsequence) == len(subsequence) + + if current_subsequence == subsequence: + return True + + assert searched, "All subsequences did not match in length..." + + return False + + def _encode(self, + prompt: str, + add_special_tokens: bool = True) -> List[int]: + return self.tokenizer(prompt, + add_special_tokens=add_special_tokens).input_ids diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 1dd0f097c74ff..ede77f04b1db9 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -26,7 +26,8 @@ SequenceGroupOutputProcessor) from vllm.engine.output_processor.stop_checker import StopChecker from vllm.engine.output_processor.util import create_output_by_sequence_group -from vllm.entrypoints.openai.logits_processors import get_logits_processors +from vllm.entrypoints.openai.logits_processors import ( + get_logits_processors as get_openai_logits_processors) from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster @@ -34,6 +35,7 @@ EncoderDecoderInputs, InputRegistry, PromptType) from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger +from vllm.logits_process import get_bad_words_logits_processors from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding import ( get_local_guided_decoding_logits_processor) @@ -1963,6 +1965,7 @@ def _build_logits_processors( logits_processors field. Returns the modified sampling params.""" logits_processors = [] + if (guided_decoding := sampling_params.guided_decoding) is not None: logger.debug( @@ -1984,7 +1987,7 @@ def _build_logits_processors( if (sampling_params.logit_bias or sampling_params.allowed_token_ids): tokenizer = self.get_tokenizer(lora_request=lora_request) - processors = get_logits_processors( + processors = get_openai_logits_processors( logit_bias=sampling_params.logit_bias, allowed_token_ids=sampling_params.allowed_token_ids, tokenizer=tokenizer) @@ -1994,6 +1997,12 @@ def _build_logits_processors( sampling_params.logit_bias = None sampling_params.allowed_token_ids = None + if len(sampling_params.bad_words) > 0: + tokenizer = self.get_tokenizer(lora_request) + processors = get_bad_words_logits_processors( + bad_words=sampling_params.bad_words, tokenizer=tokenizer) + logits_processors.extend(processors) + if logits_processors: if sampling_params.logits_processors is None: sampling_params.logits_processors = logits_processors diff --git a/vllm/logits_process.py b/vllm/logits_process.py new file mode 100644 index 0000000000000..7716ccd27e253 --- /dev/null +++ b/vllm/logits_process.py @@ -0,0 +1,119 @@ +from typing import Callable, List, Tuple, Union + +import torch + +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer + +LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor], + Callable[[List[int], List[int], torch.Tensor], + torch.Tensor]] +"""LogitsProcessor is a function that takes a list +of previously generated tokens, the logits tensor +for the next token and, optionally, prompt tokens as a +first argument, and returns a modified tensor of logits +to sample from.""" + + +def get_bad_words_logits_processors( + bad_words: List[str], + tokenizer: AnyTokenizer) -> List[LogitsProcessor]: + bad_words_ids: List[List[int]] = list() + + for bad_word in bad_words: + # To prohibit words both at the beginning + # and in the middle of text + # (related to add_prefix_space tokenizer parameter) + for add_prefix_space in [False, True]: + prefix = " " if add_prefix_space else "" + prompt = prefix + bad_word.lstrip() + + if isinstance(tokenizer, MistralTokenizer): + # Mistral tokenizers should not add special tokens + prompt_token_ids = tokenizer.encode(prompt=prompt) + else: + prompt_token_ids = tokenizer.encode(text=prompt, + add_special_tokens=False) + + # If no space at the beginning + # or if prefix space produces a new word token + if (not add_prefix_space) or ( + add_prefix_space + and prompt_token_ids[0] != bad_words_ids[-1][0] + and len(prompt_token_ids) == len(bad_words_ids[-1])): + bad_words_ids.append(prompt_token_ids) + + return [NoBadWordsLogitsProcessor(bad_words_ids=bad_words_ids)] + + +class NoBadWordsLogitsProcessor: + _SMALLEST_LOGIT = float("-inf") + _NEUTRAL_LOGIT = 0.0 + + def __init__(self, bad_words_ids: List[List[int]]): + self.bad_words_ids = bad_words_ids + self.word_bias: torch.FloatTensor = None + + def __call__( + self, + past_tokens_ids: Union[List[int], Tuple[int]], + logits: torch.FloatTensor, + ) -> torch.Tensor: + if self.word_bias is None: + self._init_word_bias(logits=logits) + + last_token_bias = torch.zeros_like(logits) + + for bad_word_ids in self.bad_words_ids: + if len(bad_word_ids) == 1: # 1-token words already processed + continue + + if len(bad_word_ids) > len(past_tokens_ids) + 1: + continue + + prefix_length = len(bad_word_ids) - 1 + last_token_id = bad_word_ids[-1] + actual_prefix = past_tokens_ids[-prefix_length:] + expected_prefix = bad_word_ids[:prefix_length] + + assert len(actual_prefix) == len(expected_prefix) + + is_match = tuple(actual_prefix) == tuple(expected_prefix) + last_token_bias[last_token_id] += (self._SMALLEST_LOGIT if is_match + else self._NEUTRAL_LOGIT) + + logits = logits + self.word_bias + last_token_bias + + return logits + + def _init_word_bias(self, logits: torch.FloatTensor) -> None: + # Code based on NoBadWordsLogitsProcessor and SequenceBiasLogitsProcessor # noqa: E501 + # from https://github.com/huggingface/transformers/blob/main/src/transformers/generation/logits_process.py + + vocab_size = logits.shape[-1] + + self._check_token_ids_bounds(vocab_size=vocab_size) + + self.word_bias = torch.zeros((vocab_size, ), + dtype=torch.float, + device=logits.device) + + for bad_word_ids in self.bad_words_ids: + if len(bad_word_ids) == 1: + bad_word_id = bad_word_ids[-1] + self.word_bias[bad_word_id] = self._SMALLEST_LOGIT + + def _check_token_ids_bounds(self, vocab_size: int) -> None: + invalid_token_ids = [] + + for bad_word_ids in self.bad_words_ids: + for token_id in bad_word_ids: + if token_id < 0 or token_id >= vocab_size: + invalid_token_ids.append(token_id) + + if len(invalid_token_ids) > 0: + raise ValueError( + f"The model vocabulary size is {vocab_size}," + f" but the following tokens" + f" were specified as bad: {invalid_token_ids}." + f" All token id values should be integers satisfying:" + f" 0 <= token_id < {vocab_size}.") diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 368436aa14613..d7b67425fcbc0 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -1,6 +1,7 @@ from typing import Optional -from vllm.sampling_params import GuidedDecodingParams, LogitsProcessor +from vllm.logits_process import LogitsProcessor +from vllm.sampling_params import GuidedDecodingParams async def get_guided_decoding_logits_processor( diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py index cf2162ed7720d..a17e75a80300f 100644 --- a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py @@ -9,7 +9,8 @@ build_vllm_logits_processor, build_vllm_token_enforcer_tokenizer_data) from transformers import PreTrainedTokenizerBase -from vllm.sampling_params import GuidedDecodingParams, LogitsProcessor +from vllm.logits_process import LogitsProcessor +from vllm.sampling_params import GuidedDecodingParams def get_local_lm_format_enforcer_guided_decoding_logits_processor( diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 9993cec13d649..bac32c991a0e3 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -3,14 +3,14 @@ from dataclasses import dataclass from enum import Enum, IntEnum from functools import cached_property -from typing import Any, Callable, Dict, List, Optional, Set, Union +from typing import Any, Dict, List, Optional, Set, Union import msgspec -import torch from pydantic import BaseModel from typing_extensions import Annotated from vllm.logger import init_logger +from vllm.logits_process import LogitsProcessor logger = init_logger(__name__) @@ -24,16 +24,6 @@ class SamplingType(IntEnum): RANDOM_SEED = 2 -LogitsProcessor = Union[Callable[[List[int], torch.Tensor], torch.Tensor], - Callable[[List[int], List[int], torch.Tensor], - torch.Tensor]] -"""LogitsProcessor is a function that takes a list -of previously generated tokens, the logits tensor -for the next token and, optionally, prompt tokens as a -first argument, and returns a modified tensor of logits -to sample from.""" - - # maybe make msgspec? @dataclass class GuidedDecodingParams: @@ -139,6 +129,10 @@ class SamplingParams( stop_token_ids: List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. + bad_words: List of words that are not allowed to be generated. + More precisely, only the last token of a corresponding + token sequence is not allowed when the next generated token + can complete the sequence. include_stop_str_in_output: Whether to include the stop strings in output text. Defaults to False. ignore_eos: Whether to ignore the EOS token and continue generating @@ -186,6 +180,7 @@ class SamplingParams( seed: Optional[int] = None stop: Optional[Union[str, List[str]]] = None stop_token_ids: Optional[List[int]] = None + bad_words: Optional[List[str]] = None ignore_eos: bool = False max_tokens: Optional[int] = 16 min_tokens: int = 0 @@ -228,6 +223,7 @@ def from_optional( seed: Optional[int] = None, stop: Optional[Union[str, List[str]]] = None, stop_token_ids: Optional[List[int]] = None, + bad_words: Optional[List[str]] = None, include_stop_str_in_output: bool = False, ignore_eos: bool = False, max_tokens: Optional[int] = 16, @@ -267,6 +263,7 @@ def from_optional( seed=seed, stop=stop, stop_token_ids=stop_token_ids, + bad_words=bad_words, include_stop_str_in_output=include_stop_str_in_output, ignore_eos=ignore_eos, max_tokens=max_tokens, @@ -298,26 +295,36 @@ def __post_init__(self) -> None: f"got n={self.n} and best_of={self.best_of}.") self._real_n = self.n self.n = self.best_of + if 0 < self.temperature < _MAX_TEMP: logger.warning( "temperature %s is less than %s, which may cause numerical " "errors nan or inf in tensors. We have maxed it out to %s.", self.temperature, _MAX_TEMP, _MAX_TEMP) self.temperature = max(self.temperature, _MAX_TEMP) + if self.seed == -1: self.seed = None else: self.seed = self.seed + if self.stop is None: self.stop = [] elif isinstance(self.stop, str): self.stop = [self.stop] else: self.stop = list(self.stop) + if self.stop_token_ids is None: self.stop_token_ids = [] else: self.stop_token_ids = list(self.stop_token_ids) + + if self.bad_words is None: + self.bad_words = [] + else: + self.bad_words = list(self.bad_words) + self.logprobs = 1 if self.logprobs is True else self.logprobs self.prompt_logprobs = (1 if self.prompt_logprobs is True else self.prompt_logprobs) @@ -468,6 +475,7 @@ def __repr__(self) -> str: f"seed={self.seed}, " f"stop={self.stop}, " f"stop_token_ids={self.stop_token_ids}, " + f"bad_words={self.bad_words}, " f"include_stop_str_in_output={self.include_stop_str_in_output}, " f"ignore_eos={self.ignore_eos}, " f"max_tokens={self.max_tokens}, " From 6650e6a930dbdf1cd4def9b58e952376400ccfcf Mon Sep 17 00:00:00 2001 From: kakao-kevin-us Date: Sun, 27 Oct 2024 02:53:35 +0900 Subject: [PATCH 008/113] [Model] Add classification Task with Qwen2ForSequenceClassification (#9704) Signed-off-by: Kevin-Yang Co-authored-by: Kevin-Yang --- docs/source/models/supported_models.rst | 22 ++++ tests/conftest.py | 19 ++++ .../embedding/language/test_cls_models.py | 53 +++++++++ vllm/model_executor/layers/pooler.py | 9 +- vllm/model_executor/models/qwen2_cls.py | 107 ++++++++++++++++++ vllm/model_executor/models/registry.py | 2 + 6 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 tests/models/embedding/language/test_cls_models.py create mode 100644 vllm/model_executor/models/qwen2_cls.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 98d804052b575..ff893b613f150 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -361,6 +361,28 @@ Reward Modeling .. note:: As an interim measure, these models are supported via Embeddings API. See `this RFC `_ for upcoming changes. +Classification +--------------- + +.. list-table:: + :widths: 25 25 50 5 5 + :header-rows: 1 + + * - Architecture + - Models + - Example HF Models + - :ref:`LoRA ` + - :ref:`PP ` + * - :code:`Qwen2ForSequenceClassification` + - Qwen2-based + - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. + - + - ✅︎ + +.. note:: + As an interim measure, these models are supported via Embeddings API. It will be supported via Classification API in the future (no reference APIs exist now). + + Multimodal Language Models ^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/tests/conftest.py b/tests/conftest.py index 6adff5e2328c4..2fce2d772c6ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -343,6 +343,17 @@ def get_inputs( return all_inputs + def classify(self, prompts: List[str]) -> List[str]: + # output is final logits + all_inputs = self.get_inputs(prompts) + outputs = [] + for inputs in all_inputs: + output = self.model(**self.wrap_device(inputs)) + logits = output.logits.softmax(dim=-1)[0].tolist() + outputs.append(logits) + + return outputs + def generate( self, prompts: List[str], @@ -688,6 +699,14 @@ def get_inputs( return inputs + def classify(self, prompts: List[str]) -> List[str]: + req_outputs = self.model.encode(prompts) + outputs = [] + for req_output in req_outputs: + embedding = req_output.outputs.embedding + outputs.append(embedding) + return outputs + def generate( self, prompts: List[str], diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py new file mode 100644 index 0000000000000..d8ca6d361f0e3 --- /dev/null +++ b/tests/models/embedding/language/test_cls_models.py @@ -0,0 +1,53 @@ +"""Compare the outputs of HF and vLLM when using greedy sampling. + +This test only tests small models. Big models such as 7B should be tested from +test_big_models.py because it could use a larger instance to run tests. + +Run `pytest tests/models/test_cls_models.py`. +""" +import pytest +import torch +from transformers import AutoModelForSequenceClassification + +CLASSIFICATION_MODELS = ["jason9693/Qwen2.5-1.5B-apeach"] + + +@pytest.mark.parametrize("model", CLASSIFICATION_MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_classification_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, +) -> None: + with hf_runner(model, + dtype=dtype, + auto_cls=AutoModelForSequenceClassification) as hf_model: + hf_outputs = hf_model.classify(example_prompts) + + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.classify(example_prompts) + + print(hf_outputs, vllm_outputs) + + # check logits difference + for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): + hf_output = torch.tensor(hf_output) + vllm_output = torch.tensor(vllm_output) + + assert torch.allclose(hf_output, vllm_output, 1e-3) + + +@pytest.mark.parametrize("model", CLASSIFICATION_MODELS) +@pytest.mark.parametrize("dtype", ["float"]) +def test_classification_model_print( + vllm_runner, + model: str, + dtype: str, +) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 3455a4ccf282f..0a1df9cb699ae 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -28,11 +28,15 @@ class Pooler(nn.Module): normalize: Whether to normalize the pooled data. """ - def __init__(self, pooling_type: PoolingType, normalize: bool): + def __init__(self, + pooling_type: PoolingType, + normalize: bool, + softmax: bool = False): super().__init__() self.pooling_type = pooling_type self.normalize = normalize + self.softmax = softmax def forward( self, @@ -64,6 +68,9 @@ def forward( if self.normalize: pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1) + if self.softmax: + pooled_data = nn.functional.softmax(pooled_data, dim=-1) + pooled_outputs = [ EmbeddingSequenceGroupOutput(data.tolist()) for data in pooled_data ] diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py new file mode 100644 index 0000000000000..e10c6dbbb6472 --- /dev/null +++ b/vllm/model_executor/models/qwen2_cls.py @@ -0,0 +1,107 @@ +# coding=utf-8 +# Adapted from +# https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B/blob/main/modeling_qwen2_rm.py +# Copyright 2024 Kakao Corp. (Kanana-X Team) +# Copyright 2024 The Qwen team. +# Copyright 2023 The vLLM team. +"""Inference-only Qwen2-Classification model compatible with HF weights.""" +from typing import Iterable, List, Optional, Tuple + +import torch +from torch import nn +from transformers import Qwen2Config + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, LoRAConfig +from vllm.model_executor.layers.linear import RowParallelLinear +from vllm.model_executor.layers.pooler import Pooler, PoolingType +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.models.qwen2 import Qwen2Model +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.sequence import IntermediateTensors, PoolerOutput + +from .utils import AutoWeightsLoader + + +class Qwen2ForSequenceClassification(nn.Module): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] + + def __init__( + self, + config: Qwen2Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + ) -> None: + # TODO (@robertgshaw2): see if this can be moved out + if (cache_config.sliding_window is not None + and hasattr(config, "max_window_layers")): + raise ValueError("Sliding window for some but all layers is not " + "supported. This model uses sliding window " + "but `max_window_layers` = %s is less than " + "`num_hidden_layers` = %s. Please open an issue " + "to discuss this feature." % ( + config.max_window_layers, + config.num_hidden_layers, + )) + + super().__init__() + + self.config = config + self.lora_config = lora_config + + self.quant_config = quant_config + self.model = Qwen2Model(config, cache_config, quant_config) + + self.score = RowParallelLinear(config.hidden_size, + config.num_labels, + quant_config=quant_config) + self._pooler = Pooler(pooling_type=PoolingType.LAST, + normalize=False, + softmax=True) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> torch.Tensor: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors) + logits, _ = self.score(hidden_states) + return logits + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self, + ignore_unexpected_prefixes=["lm_head."]) + loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 717615988a907..f6713ab0898f0 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -96,6 +96,8 @@ "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), "MistralModel": ("llama", "LlamaEmbeddingModel"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), + "Qwen2ForSequenceClassification": ( + "qwen2_cls", "Qwen2ForSequenceClassification"), # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), From 67a6882da474a45dde0d35b3789e096e7bd0fd4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A7=91=E8=8B=B1?= Date: Sun, 27 Oct 2024 12:18:03 +0800 Subject: [PATCH 009/113] [Misc] SpecDecodeWorker supports profiling (#9719) Signed-off-by: Abatom --- vllm/spec_decode/spec_decode_worker.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 316db43502d3b..9f7ef2f8d851c 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -1038,6 +1038,14 @@ def get_cache_block_size_bytes(self): """ raise NotImplementedError + def start_profile(self): + if isinstance(self.scorer_worker, Worker): + self.scorer_worker.start_profile() + + def stop_profile(self): + if isinstance(self.scorer_worker, Worker): + self.scorer_worker.stop_profile() + def split_num_cache_blocks_evenly(scorer_cache_block_size_bytes: int, proposer_cache_block_size_bytes: int, From 8549c82660cfa59a13cccd622f8afcc29cbd4281 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 27 Oct 2024 00:19:28 -0700 Subject: [PATCH 010/113] [core] cudagraph output with tensor weak reference (#9724) Signed-off-by: youkaichao --- csrc/ops.h | 24 +++++++++++++++++++++ csrc/torch_bindings.cpp | 3 +++ vllm/utils.py | 9 ++++++++ vllm/worker/model_runner.py | 42 +++++++++++++------------------------ 4 files changed, 50 insertions(+), 28 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index f737f50c2ec96..c50eb39a3dacc 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -5,6 +5,30 @@ #include "core/scalar_type.hpp" +#include + +torch::Tensor weak_ref_tensor(torch::Tensor& tensor) { + // Ensure tensor is on CUDA + if (!tensor.is_cuda()) { + throw std::runtime_error("Tensor must be on CUDA device"); + } + + // Get the raw data pointer + void* data_ptr = tensor.data_ptr(); + + // Get tensor sizes and strides + std::vector sizes = tensor.sizes().vec(); + std::vector strides = tensor.strides().vec(); + + // Get tensor options (dtype, device) + auto options = tensor.options(); + + // Create a new tensor from the raw data pointer + auto new_tensor = torch::from_blob(data_ptr, sizes, strides, options); + + return new_tensor; +} + void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index e704ff629fd6e..b8185c24d5628 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -18,6 +18,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops + ops.def("weak_ref_tensor(Tensor input) -> Tensor"); + ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); + // Attention ops // Compute the attention between an input query and the cached // keys/values using PagedAttention. diff --git a/vllm/utils.py b/vllm/utils.py index fba9804289b94..1f75de89d0cc2 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1479,3 +1479,12 @@ def __iter__(self): def __len__(self): return len(self._factory) + + +def weak_ref_tensor(tensor: torch.Tensor) -> torch.Tensor: + """ + Create a weak reference to a tensor. + The new tensor will share the same data as the original tensor, + but will not keep the original tensor alive. + """ + return torch.ops._C.weak_ref_tensor(tensor) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 8b74f06e77be0..4a287e3741d0f 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -50,7 +50,7 @@ from vllm.transformers_utils.config import uses_mrope from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d, flatten_2d_lists, is_hip, is_pin_memory_available, - supports_dynamo) + supports_dynamo, weak_ref_tensor) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -1426,12 +1426,6 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: dtype=self.model_config.dtype, device=self.device) - # Prepare buffer for outputs. These will be reused for all batch sizes. - # It will be filled after the first graph capture. - hidden_or_intermediate_states: List[Optional[torch.Tensor]] = [ - None - ] * self.parallel_config.pipeline_parallel_size - graph_batch_size = self.max_batchsize_to_capture batch_size_capture_list = [ bs for bs in _BATCH_SIZES_TO_CAPTURE if bs <= graph_batch_size @@ -1474,12 +1468,6 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: input_tokens[:batch_size], "positions": input_positions[..., :batch_size], - "hidden_or_intermediate_states": - hidden_or_intermediate_states[ - virtual_engine] # type: ignore - [:batch_size] - if hidden_or_intermediate_states[virtual_engine] - is not None else None, "intermediate_inputs": intermediate_inputs[:batch_size] if intermediate_inputs is not None else None, @@ -1762,15 +1750,13 @@ def capture( self, input_ids: torch.Tensor, positions: torch.Tensor, - hidden_or_intermediate_states: Optional[Union[IntermediateTensors, - torch.Tensor]], intermediate_inputs: Optional[IntermediateTensors], kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, memory_pool: Optional[Tuple[int, int]], stream: torch.cuda.Stream, **kwargs, - ) -> Union[torch.Tensor, IntermediateTensors]: + ): assert self._graph is None # Run the model a few times without capturing the graph. # This is to make sure that the captured graph does not include the @@ -1799,20 +1785,21 @@ def capture( intermediate_tensors=intermediate_inputs, **kwargs, ) - if hidden_or_intermediate_states is not None: - if get_pp_group().is_last_rank: - hidden_or_intermediate_states.copy_( - output_hidden_or_intermediate_states) - else: - for key in hidden_or_intermediate_states.tensors: - hidden_or_intermediate_states[key].copy_( - output_hidden_or_intermediate_states[key]) - else: - hidden_or_intermediate_states = ( + + if isinstance(output_hidden_or_intermediate_states, torch.Tensor): + hidden_or_intermediate_states = weak_ref_tensor( output_hidden_or_intermediate_states) + elif isinstance(output_hidden_or_intermediate_states, + IntermediateTensors): + hidden_or_intermediate_states = IntermediateTensors( + tensors={ + key: weak_ref_tensor(value) + for key, value in + output_hidden_or_intermediate_states.tensors.items() + }) del output_hidden_or_intermediate_states - # make sure `output_hidden_states` is deleted + # make sure `output_hidden_or_intermediate_states` is deleted # in the graph's memory pool gc.collect() torch.cuda.synchronize() @@ -1837,7 +1824,6 @@ def capture( } else: self.output_buffers = hidden_or_intermediate_states - return hidden_or_intermediate_states def forward( self, From 3cb07a36a20f9af11346650559470d685e9dc711 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Sun, 27 Oct 2024 05:44:24 -0400 Subject: [PATCH 011/113] [Misc] Upgrade to pytorch 2.5 (#9588) Signed-off-by: Bill Nell Signed-off-by: youkaichao Co-authored-by: youkaichao --- CMakeLists.txt | 4 +- cmake/utils.cmake | 6 +-- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements-cuda.txt | 6 +-- requirements-openvino.txt | 2 +- .../decoder_only/language/test_big_models.py | 46 ++++++++++++++----- vllm/platforms/cuda.py | 5 ++ 8 files changed, 48 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc4ac10b7669a..1a6a311e97633 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # requirements.txt files and should be kept consistent. The ROCm torch # versions are derived from Dockerfile.rocm # -set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0") +set(TORCH_SUPPORTED_VERSION_CUDA "2.5.0") set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0") # @@ -507,7 +507,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd + GIT_TAG 5259c586c403a4e4d8bf69973c159b40cc346fb9 GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 24bb7299338ac..40430dae10c5b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -424,11 +424,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of # dependencies that are not necessary and may not be installed. if (GPU_LANGUAGE STREQUAL "CUDA") - if ("${CUDA_CUDA_LIB}" STREQUAL "") - set(CUDA_CUDA_LIB "${CUDA_CUDA_LIBRARY}") - endif() - target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB} - ${CUDA_LIBRARIES}) + target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver) else() target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}) endif() diff --git a/pyproject.toml b/pyproject.toml index e0c56ab79cad0..e78f5652f486b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ requires = [ "packaging", "setuptools>=61", "setuptools-scm>=8.0", - "torch == 2.4.0", + "torch == 2.5.0", "wheel", "jinja2", ] diff --git a/requirements-build.txt b/requirements-build.txt index 6144a56da8c47..ea2b688bb3108 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -4,6 +4,6 @@ ninja packaging setuptools>=61 setuptools-scm>=8 -torch==2.4.0 +torch==2.5.0 wheel jinja2 diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 3b3c2f876919e..92fa303d687a2 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -4,7 +4,7 @@ # Dependencies for NVIDIA GPUs ray >= 2.9 nvidia-ml-py # for pynvml package -torch == 2.4.0 +torch == 2.5.0 # These must be updated alongside torch -torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version -xformers == 0.0.27.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.4.0 +torchvision == 0.20 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version +xformers == 0.0.28.post2; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.0 diff --git a/requirements-openvino.txt b/requirements-openvino.txt index ac54cf0c3288f..7ad0d1e7f704b 100644 --- a/requirements-openvino.txt +++ b/requirements-openvino.txt @@ -1,7 +1,7 @@ # Common dependencies -r requirements-common.txt -torch == 2.4.0 # should be aligned with "common" vLLM torch version +torch == 2.5.0 # should be aligned with "common" vLLM torch version openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version diff --git a/tests/models/decoder_only/language/test_big_models.py b/tests/models/decoder_only/language/test_big_models.py index 75625b35209ce..fcfc159e4f5a0 100644 --- a/tests/models/decoder_only/language/test_big_models.py +++ b/tests/models/decoder_only/language/test_big_models.py @@ -8,7 +8,7 @@ from vllm.platforms import current_platform -from ...utils import check_outputs_equal +from ...utils import check_logprobs_close, check_outputs_equal MODELS = [ "meta-llama/Llama-2-7b-hf", @@ -43,18 +43,40 @@ def test_models( dtype: str, max_tokens: int, ) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) - with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - - check_outputs_equal( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) + if model == "openbmb/MiniCPM3-4B": + # the output becomes slightly different when upgrading to + # pytorch 2.5 . Changing to logprobs checks instead of exact + # output checks. + NUM_LOG_PROBS = 8 + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, NUM_LOG_PROBS) + + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + else: + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) + + with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, + max_tokens) + + check_outputs_equal( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) @pytest.mark.parametrize("model", MODELS) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 30bbf5107475d..9c5212ace1346 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -7,6 +7,7 @@ from typing import Callable, List, Tuple, TypeVar import pynvml +import torch from typing_extensions import ParamSpec from vllm.logger import init_logger @@ -26,6 +27,10 @@ " and cause errors. See https://pypi.org/project/pynvml " "for more information.") +# pytorch 2.5 uses cudnn sdpa by default, which will cause crash on some models +# see https://github.com/huggingface/diffusers/issues/9704 for details +torch.backends.cuda.enable_cudnn_sdp(False) + # NVML utils # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`, # all the related functions work on real physical device ids. From e130c40e4eba63ee8f04d493d83bca8c59b5ada5 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Sun, 27 Oct 2024 17:30:03 +0000 Subject: [PATCH 012/113] Fix cache management in "Close inactive issues and PRs" actions workflow (#9734) --- .github/workflows/stale.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 2418c61bdcf63..81e7c9b050760 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -10,6 +10,7 @@ jobs: permissions: issues: write pull-requests: write + actions: write runs-on: ubuntu-latest steps: - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0 From 34a9941620d00879599a51609225452b705bae89 Mon Sep 17 00:00:00 2001 From: madt2709 <55849102+madt2709@users.noreply.github.com> Date: Sun, 27 Oct 2024 10:46:41 -0700 Subject: [PATCH 013/113] [Bugfix] Fix load config when using bools (#9533) --- tests/data/test_config.yaml | 2 ++ tests/test_utils.py | 6 +++++- vllm/engine/arg_utils.py | 14 +------------- vllm/utils.py | 35 +++++++++++++++++++++++++++-------- 4 files changed, 35 insertions(+), 22 deletions(-) diff --git a/tests/data/test_config.yaml b/tests/data/test_config.yaml index 42f4f6f7bb992..5090e8f357bb8 100644 --- a/tests/data/test_config.yaml +++ b/tests/data/test_config.yaml @@ -1,3 +1,5 @@ port: 12312 served_model_name: mymodel tensor_parallel_size: 2 +trust_remote_code: true +multi_step_stream_outputs: false diff --git a/tests/test_utils.py b/tests/test_utils.py index 0fed8e678fc76..a731b11eae81c 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -6,7 +6,7 @@ import pytest -from vllm.utils import (FlexibleArgumentParser, deprecate_kwargs, +from vllm.utils import (FlexibleArgumentParser, StoreBoolean, deprecate_kwargs, get_open_port, merge_async_iterators, supports_kw) from .utils import error_on_warning @@ -141,6 +141,8 @@ def parser_with_config(): parser.add_argument('--config', type=str) parser.add_argument('--port', type=int) parser.add_argument('--tensor-parallel-size', type=int) + parser.add_argument('--trust-remote-code', action='store_true') + parser.add_argument('--multi-step-stream-outputs', action=StoreBoolean) return parser @@ -214,6 +216,8 @@ def test_config_args(parser_with_config): args = parser_with_config.parse_args( ['serve', 'mymodel', '--config', './data/test_config.yaml']) assert args.tensor_parallel_size == 2 + assert args.trust_remote_code + assert not args.multi_step_stream_outputs def test_config_file(parser_with_config): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c49f475b9ee61..38687809a31f6 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -19,7 +19,7 @@ from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) from vllm.transformers_utils.utils import check_gguf_file -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, StoreBoolean if TYPE_CHECKING: from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup @@ -1144,18 +1144,6 @@ def add_cli_args(parser: FlexibleArgumentParser, return parser -class StoreBoolean(argparse.Action): - - def __call__(self, parser, namespace, values, option_string=None): - if values.lower() == "true": - setattr(namespace, self.dest, True) - elif values.lower() == "false": - setattr(namespace, self.dest, False) - else: - raise ValueError(f"Invalid boolean value: {values}. " - "Expected 'true' or 'false'.") - - # These functions are used by sphinx to build the documentation def _engine_args_parser(): return EngineArgs.add_cli_args(FlexibleArgumentParser()) diff --git a/vllm/utils.py b/vllm/utils.py index 1f75de89d0cc2..d4f2c936ca9cc 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1155,6 +1155,18 @@ def wrapper(*args: P.args, **kwargs: P.kwargs) -> None: return wrapper +class StoreBoolean(argparse.Action): + + def __call__(self, parser, namespace, values, option_string=None): + if values.lower() == "true": + setattr(namespace, self.dest, True) + elif values.lower() == "false": + setattr(namespace, self.dest, False) + else: + raise ValueError(f"Invalid boolean value: {values}. " + "Expected 'true' or 'false'.") + + class FlexibleArgumentParser(argparse.ArgumentParser): """ArgumentParser that allows both underscore and dash in names.""" @@ -1163,7 +1175,7 @@ def parse_args(self, args=None, namespace=None): args = sys.argv[1:] if '--config' in args: - args = FlexibleArgumentParser._pull_args_from_config(args) + args = self._pull_args_from_config(args) # Convert underscores to dashes and vice versa in argument names processed_args = [] @@ -1181,8 +1193,7 @@ def parse_args(self, args=None, namespace=None): return super().parse_args(processed_args, namespace) - @staticmethod - def _pull_args_from_config(args: List[str]) -> List[str]: + def _pull_args_from_config(self, args: List[str]) -> List[str]: """Method to pull arguments specified in the config file into the command-line args variable. @@ -1226,7 +1237,7 @@ def _pull_args_from_config(args: List[str]) -> List[str]: file_path = args[index + 1] - config_args = FlexibleArgumentParser._load_config_file(file_path) + config_args = self._load_config_file(file_path) # 0th index is for {serve,chat,complete} # followed by model_tag (only for serve) @@ -1247,8 +1258,7 @@ def _pull_args_from_config(args: List[str]) -> List[str]: return args - @staticmethod - def _load_config_file(file_path: str) -> List[str]: + def _load_config_file(self, file_path: str) -> List[str]: """Loads a yaml file and returns the key value pairs as a flattened list with argparse like pattern ```yaml @@ -1282,9 +1292,18 @@ def _load_config_file(file_path: str) -> List[str]: Make sure path is correct", file_path) raise ex + store_boolean_arguments = [ + action.dest for action in self._actions + if isinstance(action, StoreBoolean) + ] + for key, value in config.items(): - processed_args.append('--' + key) - processed_args.append(str(value)) + if isinstance(value, bool) and key not in store_boolean_arguments: + if value: + processed_args.append('--' + key) + else: + processed_args.append('--' + key) + processed_args.append(str(value)) return processed_args From 4e2d95e372ad5fbef7b27c66d527c37477c0c8bb Mon Sep 17 00:00:00 2001 From: wangshuai09 <391746016@qq.com> Date: Mon, 28 Oct 2024 12:07:00 +0800 Subject: [PATCH 014/113] [Hardware][ROCM] using current_platform.is_rocm (#9642) Signed-off-by: wangshuai09 <391746016@qq.com> --- .../test_basic_correctness.py | 4 +- tests/compile/utils.py | 4 +- tests/kernels/quant_utils.py | 17 +++-- tests/kernels/test_attention.py | 23 +++--- tests/kernels/test_attention_selector.py | 3 +- tests/kernels/test_blocksparse_attention.py | 7 +- tests/kernels/test_encoder_decoder_attn.py | 76 ++++++++++--------- tests/kernels/test_moe.py | 7 +- tests/lora/test_gemma.py | 5 +- tests/lora/test_quant_model.py | 4 +- .../vision_language/test_paligemma.py | 9 ++- .../vision_language/test_phi3v.py | 3 +- .../e2e/test_integration_dist_tp2.py | 4 +- tests/utils.py | 4 +- vllm/_custom_ops.py | 8 +- .../ops/blocksparse_attention/interface.py | 6 +- vllm/attention/selector.py | 4 +- vllm/config.py | 49 ++++++------ vllm/executor/ray_utils.py | 4 +- vllm/model_executor/custom_op.py | 4 +- .../compressed_tensors_moe.py | 5 +- .../schemes/compressed_tensors_w8a8_fp8.py | 6 +- .../layers/quantization/fbgemm_fp8.py | 3 +- .../model_executor/layers/quantization/fp8.py | 10 +-- .../layers/quantization/utils/w8a8_utils.py | 6 +- vllm/model_executor/models/exaone.py | 4 +- vllm/model_executor/models/granite.py | 4 +- vllm/model_executor/models/llama.py | 4 +- vllm/model_executor/models/registry.py | 4 +- vllm/model_executor/models/solar.py | 4 +- vllm/utils.py | 6 +- vllm/worker/model_runner.py | 9 ++- 32 files changed, 162 insertions(+), 148 deletions(-) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 3c2ca1bddd906..79647589d5204 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -11,7 +11,7 @@ import pytest from vllm import LLM -from vllm.utils import is_hip +from vllm.platforms import current_platform from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from ..models.utils import check_outputs_equal @@ -51,7 +51,7 @@ def test_models( enforce_eager: bool, ) -> None: - if backend == "FLASHINFER" and is_hip(): + if backend == "FLASHINFER" and current_platform.is_rocm(): pytest.skip("Flashinfer does not support ROCm/HIP.") os.environ["VLLM_ATTENTION_BACKEND"] = backend diff --git a/tests/compile/utils.py b/tests/compile/utils.py index c69343b51ae02..64fc08e80de3b 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -5,7 +5,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams from vllm.compilation.levels import CompilationLevel -from vllm.utils import is_hip +from vllm.platforms import current_platform TEST_MODELS = [ ("facebook/opt-125m", {}), @@ -55,7 +55,7 @@ "quantization": "marlin" })) -if not is_hip() and is_quant_method_supported("awq"): +if not current_platform.is_rocm() and is_quant_method_supported("awq"): TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", { "quantization": "AWQ" })) diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 8f6a54ff5979c..f2358940fc7b8 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -2,12 +2,13 @@ import torch -from vllm.utils import is_hip +from vllm.platforms import current_platform # Using the default value (240.0) from pytorch will cause accuracy # issue on dynamic quantization models. Here use 224.0 for rocm. ROCM_FP8_MAX = 224.0 -FP8_DTYPE = torch.float8_e4m3fnuz if is_hip() else torch.float8_e4m3fn +FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm() \ + else torch.float8_e4m3fn def as_float32_tensor(x: Union[float, torch.tensor]) -> torch.tensor: @@ -24,8 +25,10 @@ def ref_dynamic_per_token_quant(x: torch.tensor, qtype_traits = torch.iinfo(quant_dtype) if quant_dtype == torch.int8 \ else torch.finfo(quant_dtype) - qtype_traits_max = ROCM_FP8_MAX if is_hip() else qtype_traits.max - qtype_traits_min = -ROCM_FP8_MAX if is_hip() else qtype_traits.min + qtype_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + else qtype_traits.max + qtype_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + else qtype_traits.min qtype_max = as_float32_tensor(qtype_traits_max) s_1 = as_float32_tensor(1.0) s_512 = as_float32_tensor(512.0) @@ -66,8 +69,10 @@ def ref_dynamic_per_tensor_fp8_quant(x: torch.tensor) \ -> Tuple[torch.tensor, torch.tensor]: fp8_traits = torch.finfo(FP8_DTYPE) - fp8_traits_max = ROCM_FP8_MAX if is_hip() else fp8_traits.max - fp8_traits_min = -ROCM_FP8_MAX if is_hip() else fp8_traits.min + fp8_traits_max = ROCM_FP8_MAX if current_platform.is_rocm() \ + else fp8_traits.max + fp8_traits_min = -ROCM_FP8_MAX if current_platform.is_rocm() \ + else fp8_traits.min fp8_max = as_float32_tensor(fp8_traits_max) one = as_float32_tensor(1.0) diff --git a/tests/kernels/test_attention.py b/tests/kernels/test_attention.py index 52f1ecd176963..1604aa4d2d6e5 100644 --- a/tests/kernels/test_attention.py +++ b/tests/kernels/test_attention.py @@ -6,11 +6,12 @@ from tests.kernels.utils import opcheck from vllm import _custom_ops as ops -from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything +from vllm.platforms import current_platform +from vllm.utils import get_max_shared_memory_bytes, seed_everything from .allclose_default import get_default_atol, get_default_rtol -if not is_hip(): +if not current_platform.is_rocm(): from xformers import ops as xops from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask @@ -23,8 +24,9 @@ NUM_BLOCKS = 4321 # Arbitrary values for testing PARTITION_SIZE = 512 # flshattF and tritonflashattF supported: {torch.float16, torch.bfloat16} -DTYPES = [torch.half, torch.bfloat16, torch.float - ] if not is_hip() else [torch.half, torch.bfloat16] +DTYPES = [ + torch.half, torch.bfloat16, torch.float +] if not current_platform.is_rocm() else [torch.half, torch.bfloat16] NUM_GEN_SEQS = [7] # Arbitrary values for testing NUM_PREFILL_SEQS = [3] # Arbitrary values for testing NUM_HEADS = [(40, 40), (64, 8)] # Arbitrary values for testing @@ -114,7 +116,8 @@ def ref_single_query_cached_kv_attention( @pytest.mark.parametrize( - "version", ["v1", "v2"] if not is_hip() else ["v1", "v2", "rocm"]) + "version", + ["v1", "v2"] if not current_platform.is_rocm() else ["v1", "v2", "rocm"]) @pytest.mark.parametrize("num_seqs", NUM_GEN_SEQS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @@ -317,8 +320,8 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. @@ -368,7 +371,7 @@ def ref_multi_query_kv_attention( @pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("device", CUDA_DEVICES) -@pytest.mark.skipif(is_hip(), +@pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") @torch.inference_mode() def test_multi_query_kv_attention( @@ -425,6 +428,6 @@ def test_multi_query_kv_attention( scale, dtype, ) - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 torch.testing.assert_close(output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index df3e770e260e0..3fe9ca0b0450f 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -25,7 +25,8 @@ def test_env(name: str, device: str, monkeypatch): False) assert backend.name == "TORCH_SDPA" elif device == "hip": - with patch("vllm.attention.selector.is_hip", return_value=True): + with patch("vllm.attention.selector.current_platform.is_rocm", + return_value=True): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "ROCM_FLASH" diff --git a/tests/kernels/test_blocksparse_attention.py b/tests/kernels/test_blocksparse_attention.py index f3bd8f0524264..b65efb3abc230 100644 --- a/tests/kernels/test_blocksparse_attention.py +++ b/tests/kernels/test_blocksparse_attention.py @@ -7,7 +7,8 @@ from vllm import _custom_ops as ops from vllm.attention.ops.blocksparse_attention.interface import ( LocalStridedBlockSparseAttn) -from vllm.utils import get_max_shared_memory_bytes, is_hip, seed_everything +from vllm.platforms import current_platform +from vllm.utils import get_max_shared_memory_bytes, seed_everything from .allclose_default import get_default_atol, get_default_rtol @@ -316,8 +317,8 @@ def test_paged_attention( # NOTE(woosuk): Due to the kernel-level differences in the two # implementations, there is a small numerical difference in the two # outputs. Thus, we use a relaxed tolerance for the test. - atol = get_default_atol(output) if is_hip() else 1e-3 - rtol = get_default_rtol(output) if is_hip() else 1e-5 + atol = get_default_atol(output) if current_platform.is_rocm() else 1e-3 + rtol = get_default_rtol(output) if current_platform.is_rocm() else 1e-5 # NOTE(zhaoyang): FP8 KV Cache will introduce quantization error, # so we use a relaxed tolerance for the test. diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index 6b979d0558c46..bc99c5559d388 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -18,7 +18,7 @@ from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, global_force_attn_backend_context_manager) -from vllm.utils import is_hip +from vllm.platforms import current_platform # List of support backends for encoder/decoder models LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS] @@ -82,7 +82,7 @@ class TestResources(NamedTuple): will leverage attn_backend for the purpose of constructing backend-compatible attention metadata instances - + Attributes: * scale: 1/sqrt(d) scale factor for attn @@ -105,10 +105,10 @@ def _make_test_resources(test_pt: TestPoint, ) -> TestResources: Build key components for performing encoder/decoder attention test. Note that - (1) The Attention instance constructed here, automatically selects + (1) The Attention instance constructed here, automatically selects an attention backend class based on platform info & a set of canned heuristics, so - (2) The attention backend instance constructed here is thus *not + (2) The attention backend instance constructed here is thus *not the same backend instance* used by attn, but rather it is intended to be a *different instance* of the *same backend class*; therefore, @@ -156,7 +156,7 @@ def _encoder_attn_setup( ''' Set up test vectors & data structures for encoder attention test. - A triplet of synthetic query/key/value tensors are constructed. + A triplet of synthetic query/key/value tensors are constructed. Given this is an encoder attention test, the key & value sequences will have the same length as the corresponding queries. @@ -169,14 +169,14 @@ def _encoder_attn_setup( Arguments: * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field - + Returns: - + * PhaseTestParameters data structure comprising (1) packed query/key/value tensors, (2) the ideal output of attention computed using a naive implementation, and (3) KVCache field set to None @@ -265,7 +265,7 @@ def _decoder_attn_setup( Arguments: * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field @@ -275,14 +275,14 @@ def _decoder_attn_setup( * qkv: Unpacked (batch_size x padded_seq_len x num_heads x head_size) query/key/value tensors * Prefill-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) + including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data + computed using a naive implementation, and (3) memory-mapping data structures appropriate for prefill phase. - * Decode-phase decoder self-attention PhaseTestParameters data structure, - including (1) packed (number_of_tokens x num_heads x head_size) - query/key/value tensors along with (2) ideal attention output - computed using a naive implementation, and (3) memory-mapping data + * Decode-phase decoder self-attention PhaseTestParameters data structure, + including (1) packed (number_of_tokens x num_heads x head_size) + query/key/value tensors along with (2) ideal attention output + computed using a naive implementation, and (3) memory-mapping data structures appropriate for decode phase. * max_block_idx: max physical address in decoder self-attention block-table (intended to be used as the base address for the encoder/ @@ -436,12 +436,12 @@ def _enc_dec_cross_attn_setup_reuses_query( This function also constructs the cross-attention KV cache memory mapping (slot mapping and block table), ensuring that the block table starts at - block_base_addr. + block_base_addr. Arguments: * decoder_qkv: pre-existing unpacked (batch_size x padded_seq_len x - num_heads x head_size) decoder self-attention inputs; + num_heads x head_size) decoder self-attention inputs; this function relies on the query and q_seq_lens fields * encoder_test_params: PhaseTestParameters data structure which was @@ -452,7 +452,7 @@ def _enc_dec_cross_attn_setup_reuses_query( self-attention; all fields including KV cache required * test_pt: TestPoint data structure; this function relies on the - following fields: batch_size, num_heads, head_size, + following fields: batch_size, num_heads, head_size, block_size, max_q_seq_len * test_rsrcs: TestResources data structure; this function relies on the scale field @@ -460,16 +460,16 @@ def _enc_dec_cross_attn_setup_reuses_query( Returns: - * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data - structure, including (1) packed + * Prefill-phase encoder/decoder cross-attention PhaseTestParameters data + structure, including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a + along with (2) ideal attention output computed using a naive implementation, and (3) memory-mapping data structures appropriate for prefill phase. - * Decode-phase encoder/decoder cross-attention PhaseTestParameters data + * Decode-phase encoder/decoder cross-attention PhaseTestParameters data structure, including (1) packed (number_of_tokens x num_heads x head_size) query/key/value tensors - along with (2) ideal attention output computed using a + along with (2) ideal attention output computed using a naive implementation, and (3) memory-mapping data structures appropriate for decode phase. ''' @@ -596,7 +596,7 @@ def _run_encoder_attention_test( ''' Run encoder attention. - attn.forward() is passed attn_type=AttentionType.ENCODER in order + attn.forward() is passed attn_type=AttentionType.ENCODER in order to configure the kernel invocation for encoder attention Requires attn_metadata.num_decode_tokens == 0 @@ -607,7 +607,7 @@ def _run_encoder_attention_test( * attn: Attention wrapper instance * encoder_test_params: encoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention @@ -646,7 +646,7 @@ def _run_decoder_self_attention_test( and attn (Attention wrapper instance) fields * decoder_test_params: decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query/key/value fields * attn_metadata: attention metadata for decoder-self attention (contains KV cache memory-mapping) @@ -694,11 +694,11 @@ def _run_encoder_decoder_cross_attention_test( and attn (Attention wrapper instance) fields * decoder_test_params: decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) query field * cross_test_params: encoder/decoder PhaseTestParameters data structure; this function relies on the packed - (number_of_tokens x num_heads x head_size) + (number_of_tokens x num_heads x head_size) key/value fields * attn_metadata: attention metadata for encoder/decoder-self attention @@ -726,7 +726,8 @@ def _run_encoder_decoder_cross_attention_test( attn_type=attn_type) -@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) +@pytest.mark.skipif(current_platform.is_rocm(), + reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @@ -755,7 +756,8 @@ def test_encoder_only( No KV cache is required for encoder-only attention. Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if is_hip(). + AMD GPUs, therefore this test simply is skipped if + current_platform.is_rocm(). This test globally forces an override of the usual backend auto-selection process, forcing the specific backend-under-test @@ -811,7 +813,8 @@ def test_encoder_only( assert_actual_matches_ideal(enc_test_params, enc_pckd_act_out) -@pytest.mark.skipif(is_hip(), reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) +@pytest.mark.skipif(current_platform.is_rocm(), + reason=STR_NOT_IMPL_ENC_DEC_ROCM_HIP) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", HEAD_SIZES) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) @@ -837,14 +840,14 @@ def test_e2e_enc_dec_attn( attributes for prefill-phase, and (2) an analogous attention metadata structure but for decode-phase * Test attention steps in the following order - + * Encoder attention * Prefill self-attention * Prefill cross-attention * Decode self-attention * Decode cross-attention - * Besides being reflective of realistic use-cases, this order would - exacerbate any accidental overlap in the self-/cross-attention + * Besides being reflective of realistic use-cases, this order would + exacerbate any accidental overlap in the self-/cross-attention block tables, which one hopes to avoid @@ -864,10 +867,11 @@ def test_e2e_enc_dec_attn( to be utilized. Note on ROCm/HIP: currently encoder/decoder models are not supported on - AMD GPUs, therefore this test simply is skipped if is_hip(). + AMD GPUs, therefore this test simply is skipped if + current_platform.is_rocm(). Note on metadata: there is a single attention metadata structure shared by - all prefill-phase attention operations (encoder, decoder, enc/dec cross), + all prefill-phase attention operations (encoder, decoder, enc/dec cross), and a single one shared by all decode-phase attention operations (decoder & enc/dec cross.) This is intended to reflect the behavior of EncoderDecoderModelRunner, which constructs a single attention metadata diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index c0053071258ea..4bfc089c82179 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -18,8 +18,9 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( marlin_quantize) from vllm.model_executor.models.mixtral import MixtralMoE +from vllm.platforms import current_platform from vllm.scalar_type import scalar_types -from vllm.utils import is_hip, seed_everything +from vllm.utils import seed_everything @pytest.mark.parametrize("m", [1024 * 128, 512, 222, 33, 1]) @@ -103,7 +104,7 @@ def test_mixtral_moe(dtype: torch.dtype): @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) -@pytest.mark.skipif(is_hip(), reason="Skip for rocm") +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_fused_marlin_moe( m: int, n: int, @@ -256,7 +257,7 @@ def test_fused_marlin_moe( @pytest.mark.parametrize("act_order", [True, False]) @pytest.mark.parametrize("num_bits", [4, 8]) @pytest.mark.parametrize("is_k_full", [True, False]) -@pytest.mark.skipif(is_hip(), reason="Skip for rocm") +@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm") def test_single_marlin_moe_multiply( m: int, n: int, diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index f7c1d4f041c12..15ec66b0f5502 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -4,7 +4,7 @@ import vllm from vllm.lora.request import LoRARequest -from vllm.utils import is_hip +from vllm.platforms import current_platform MODEL_PATH = "google/gemma-7b" @@ -31,7 +31,8 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts -@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm") +@pytest.mark.xfail(current_platform.is_rocm(), + reason="There can be output mismatch on ROCm") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index d004c65929418..5432fa4ad0d3a 100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -8,7 +8,7 @@ import vllm from vllm.distributed import cleanup_dist_env_and_memory from vllm.lora.request import LoRARequest -from vllm.utils import is_hip +from vllm.platforms import current_platform @dataclass @@ -19,7 +19,7 @@ class ModelWithQuantization: MODELS: List[ModelWithQuantization] #AWQ quantization is currently not supported in ROCm. -if is_hip(): +if current_platform.is_rocm(): MODELS = [ ModelWithQuantization( model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", diff --git a/tests/models/decoder_only/vision_language/test_paligemma.py b/tests/models/decoder_only/vision_language/test_paligemma.py index a3ca0845e5ff8..69189ba2f25cb 100644 --- a/tests/models/decoder_only/vision_language/test_paligemma.py +++ b/tests/models/decoder_only/vision_language/test_paligemma.py @@ -6,8 +6,9 @@ BatchEncoding) from vllm.multimodal.utils import rescale_image_size +from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, is_hip +from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ...utils import check_logprobs_close @@ -24,7 +25,7 @@ # ROCm Triton FA can run into compilation issues with these models due to, # excessive use of shared memory. Use other backends in the meantime. # FIXME (mattwong, gshtrasb, hongxiayan) -if is_hip(): +if current_platform.is_rocm(): os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" @@ -70,7 +71,7 @@ def run_test( All the image fixtures for the test are from IMAGE_ASSETS. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalDataDict objects + For vllm runner, we provide MultiModalDataDict objects and corresponding MultiModalConfig as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. @@ -151,7 +152,7 @@ def process(hf_inputs: BatchEncoding): pytest.param( "float", marks=pytest.mark.skipif( - is_hip(), + current_platform.is_rocm(), reason= "ROCm FA does not yet fully support 32-bit precision on PaliGemma") ), "half" diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index dfe10629f1c66..1840b4bb8574c 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -12,7 +12,6 @@ from vllm.multimodal.utils import rescale_image_size from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs -from vllm.utils import is_hip from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, _ImageAssets) @@ -56,7 +55,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, # ROCm Triton FA can run into shared memory issues with these models, # use other backends in the meantime # FIXME (mattwong, gshtrasb, hongxiayan) -if is_hip(): +if current_platform.is_rocm(): os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0" diff --git a/tests/spec_decode/e2e/test_integration_dist_tp2.py b/tests/spec_decode/e2e/test_integration_dist_tp2.py index b829d1a5be784..25562ca85adf4 100644 --- a/tests/spec_decode/e2e/test_integration_dist_tp2.py +++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py @@ -5,7 +5,7 @@ import pytest import torch -from vllm.utils import is_hip +from vllm.platforms import current_platform from .conftest import run_equality_correctness_test_tp @@ -51,7 +51,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs, batch_size: int, output_len: int, seed: int): """Verify greedy equality when tensor parallelism is used. """ - if is_hip(): + if current_platform.is_rocm(): pytest.skip("hip is not well-supported yet") run_equality_correctness_test_tp("JackFram/llama-68m", common_llm_kwargs, diff --git a/tests/utils.py b/tests/utils.py index e983104e3cb0c..0c61891cfefec 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -26,7 +26,7 @@ from vllm.platforms import current_platform from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import (FlexibleArgumentParser, GB_bytes, - cuda_device_count_stateless, get_open_port, is_hip) + cuda_device_count_stateless, get_open_port) if current_platform.is_rocm(): from amdsmi import (amdsmi_get_gpu_vram_usage, @@ -487,7 +487,7 @@ def wait_for_gpu_memory_to_clear(devices: List[int], output: Dict[int, str] = {} output_raw: Dict[int, float] = {} for device in devices: - if is_hip(): + if current_platform.is_rocm(): dev_handle = amdsmi_get_processor_handles()[device] mem_info = amdsmi_get_gpu_vram_usage(dev_handle) gb_used = mem_info["vram_used"] / 2**10 diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index f57414bd5197e..46a2fb8bc80a2 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -659,11 +659,11 @@ def scaled_fp8_quant( Args: input: The input tensor to be quantized to FP8 scale: Optional scaling factor for the FP8 quantization - scale_ub: Optional upper bound for scaling factor in dynamic + scale_ub: Optional upper bound for scaling factor in dynamic per token case num_token_padding: If specified, pad the first dimension of the output to at least this value. - use_per_token_if_dynamic: Whether to do per_tensor or per_token + use_per_token_if_dynamic: Whether to do per_tensor or per_token in the dynamic quantization case. Returns: @@ -674,8 +674,8 @@ def scaled_fp8_quant( assert (input.ndim == 2) shape: Union[Tuple[int, int], torch.Size] = input.shape # For rocm, the output fp8 dtype is torch.float_e3m3fnuz - out_dtype: torch.dtype = torch.float8_e4m3fnuz if vllm.utils.is_hip() \ - else torch.float8_e4m3fn + out_dtype: torch.dtype = torch.float8_e4m3fnuz \ + if current_platform.is_rocm() else torch.float8_e4m3fn if num_token_padding: shape = (max(num_token_padding, input.shape[0]), shape[1]) output = torch.empty(shape, device=input.device, dtype=out_dtype) diff --git a/vllm/attention/ops/blocksparse_attention/interface.py b/vllm/attention/ops/blocksparse_attention/interface.py index e4dc576d27932..a98eb431ac7fc 100644 --- a/vllm/attention/ops/blocksparse_attention/interface.py +++ b/vllm/attention/ops/blocksparse_attention/interface.py @@ -3,7 +3,6 @@ import torch from vllm.platforms import current_platform -from vllm.utils import is_hip from .utils import (dense_to_crow_col, get_head_sliding_step, get_sparse_attn_mask) @@ -32,8 +31,9 @@ def __init__( ): super().__init__() if use_spda is None: - use_spda = is_hip() or current_platform.is_cpu() or not \ - IS_COMPUTE_8_OR_ABOVE + use_spda = current_platform.is_rocm() or \ + current_platform.is_cpu() or not \ + IS_COMPUTE_8_OR_ABOVE device = device or (torch.cuda.current_device() if current_platform.is_cuda_alike() else "cpu") device = torch.device(device) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 10d4509b38279..376b3136f0fb8 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -10,7 +10,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR, is_hip +from vllm.utils import STR_BACKEND_ENV_VAR logger = init_logger(__name__) @@ -208,7 +208,7 @@ def which_attn_to_use( logger.info("Cannot use %s backend on TPU.", selected_backend) return _Backend.PALLAS - if is_hip(): + if current_platform.is_rocm(): # AMD GPUs. selected_backend = (_Backend.ROCM_FLASH if selected_backend == _Backend.FLASH_ATTN else selected_backend) diff --git a/vllm/config.py b/vllm/config.py index a1fba98233b80..99a82c8f1b40b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -17,7 +17,7 @@ get_hf_image_processor_config, get_hf_text_config) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - is_hip, print_warning_once) + print_warning_once) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -43,7 +43,7 @@ class ModelConfig: Args: model: Name or path of the huggingface model to use. - It is also used as the content for `model_name` tag in metrics + It is also used as the content for `model_name` tag in metrics output when `served_model_name` is not specified. task: The task to use the model for. Each vLLM instance only supports one task, even if the same model can be used for multiple tasks. @@ -99,15 +99,15 @@ class ModelConfig: skip_tokenizer_init: If true, skip initialization of tokenizer and detokenizer. served_model_name: The model name used in metrics tag `model_name`, - matches the model name exposed via the APIs. If multiple model - names provided, the first name will be used. If not specified, + matches the model name exposed via the APIs. If multiple model + names provided, the first name will be used. If not specified, the model name will be the same as `model`. - limit_mm_per_prompt: Maximum number of data instances per modality + limit_mm_per_prompt: Maximum number of data instances per modality per prompt. Only applicable for multimodal models. - override_neuron_config: Initialize non default neuron config or - override default neuron config that are specific to Neuron devices, - this argument will be used to configure the neuron config that - can not be gathered from the vllm arguments. + override_neuron_config: Initialize non default neuron config or + override default neuron config that are specific to Neuron devices, + this argument will be used to configure the neuron config that + can not be gathered from the vllm arguments. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. mm_processor_kwargs: Arguments to be forwarded to the model's processor @@ -350,7 +350,7 @@ def _verify_quantization(self) -> None: raise ValueError( f"Unknown quantization method: {self.quantization}. Must " f"be one of {supported_quantization}.") - if is_hip( + if current_platform.is_rocm( ) and self.quantization not in rocm_supported_quantization: raise ValueError( f"{self.quantization} quantization is currently not " @@ -365,7 +365,7 @@ def _verify_quantization(self) -> None: "%s quantization is not fully " "optimized yet. The speed can be slower than " "non-quantized models.", self.quantization) - if (self.quantization == "awq" and is_hip() + if (self.quantization == "awq" and current_platform.is_rocm() and not envs.VLLM_USE_TRITON_AWQ): logger.warning( "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" @@ -385,7 +385,7 @@ def _verify_cuda_graph(self) -> None: def _verify_bnb_config(self) -> None: """ - The current version of bitsandbytes (0.44.0) with 8-bit models does not + The current version of bitsandbytes (0.44.0) with 8-bit models does not yet support CUDA graph. """ is_bitsandbytes = self.quantization == "bitsandbytes" @@ -810,7 +810,7 @@ class LoadConfig: fast weight loading. "bitsandbytes" will load nf4 type weights. ignore_patterns: The list of patterns to ignore when loading the model. - Default to "original/**/*" to avoid repeated loading of llama's + Default to "original/**/*" to avoid repeated loading of llama's checkpoints. """ @@ -843,7 +843,8 @@ def _verify_load_format(self) -> None: self.load_format = LoadFormat(load_format) rocm_not_supported_load_format: List[str] = [] - if is_hip() and load_format in rocm_not_supported_load_format: + if current_platform.is_rocm( + ) and load_format in rocm_not_supported_load_format: rocm_supported_load_format = [ f for f in LoadFormat.__members__ if (f not in rocm_not_supported_load_format) @@ -967,7 +968,7 @@ def _verify_args(self) -> None: if self.use_ray: from vllm.executor import ray_utils ray_utils.assert_ray_available() - if is_hip(): + if current_platform.is_rocm(): self.disable_custom_all_reduce = True logger.info( "Disabled the custom all-reduce kernel because it is not " @@ -996,7 +997,7 @@ class SchedulerConfig: prompt latency) before scheduling next prompt. enable_chunked_prefill: If True, prefill requests can be chunked based on the remaining max_num_batched_tokens. - preemption_mode: Whether to perform preemption by swapping or + preemption_mode: Whether to perform preemption by swapping or recomputation. If not specified, we determine the mode as follows: We use recomputation by default since it incurs lower overhead than swapping. However, when the sequence group has multiple sequences @@ -1215,7 +1216,7 @@ def maybe_create_spec_config( typical_acceptance_sampler_posterior_threshold (Optional[float]): A threshold value that sets a lower bound on the posterior probability of a token in the target model for it to be - accepted. This threshold is used only when we use the + accepted. This threshold is used only when we use the TypicalAcceptanceSampler for token acceptance. typical_acceptance_sampler_posterior_alpha (Optional[float]): A scaling factor for the entropy-based threshold in the @@ -1225,7 +1226,7 @@ def maybe_create_spec_config( If set to False, token log probabilities are returned according to the log probability settings in SamplingParams. If not specified, it defaults to True. - + Returns: Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if the necessary conditions are met, else None. @@ -1470,13 +1471,13 @@ def __init__( typical_acceptance_sampler_posterior_threshold (Optional[float]): A threshold value that sets a lower bound on the posterior probability of a token in the target model for it to be - accepted. This threshold is used only when we use the + accepted. This threshold is used only when we use the TypicalAcceptanceSampler for token acceptance. typical_acceptance_sampler_posterior_alpha (Optional[float]): A scaling factor for the entropy-based threshold in the TypicalAcceptanceSampler. disable_logprobs: If set to True, token log probabilities will not - be returned even if requested by sampling parameters. This + be returned even if requested by sampling parameters. This reduces latency by skipping logprob calculation in proposal sampling, target sampling, and after accepted tokens are determined. If set to False, log probabilities will be @@ -1843,10 +1844,10 @@ def get_min_sliding_window( def get_served_model_name(model: str, served_model_name: Optional[Union[str, List[str]]]): """ - If the input is a non-empty list, the first model_name in - `served_model_name` is taken. - If the input is a non-empty string, it is used directly. - For cases where the input is either an empty string or an + If the input is a non-empty list, the first model_name in + `served_model_name` is taken. + If the input is a non-empty string, it is used directly. + For cases where the input is either an empty string or an empty list, the fallback is to use `self.model`. """ if not served_model_name: diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 0af7b3386d895..aa546ebada473 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.utils import get_ip, is_hip +from vllm.utils import get_ip from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -231,7 +231,7 @@ def initialize_ray_cluster( assert_ray_available() # Connect to a ray cluster. - if is_hip() or current_platform.is_xpu(): + if current_platform.is_rocm() or current_platform.is_xpu(): ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 71eed6eb68d78..83910339f3c9f 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -7,7 +7,7 @@ from vllm.compilation.levels import CompilationLevel from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import is_hip, print_warning_once +from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -72,7 +72,7 @@ def dispatch_forward(self): if not enabled: return self.forward_native - if is_hip(): + if current_platform.is_rocm(): return self.forward_hip elif current_platform.is_cpu(): return self.forward_cpu diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index c21aaa40ff2cc..be3d3985a74ad 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -14,7 +14,8 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize) from vllm.model_executor.utils import set_weight_attrs -from vllm.utils import is_hip, print_warning_once +from vllm.platforms import current_platform +from vllm.utils import print_warning_once class GPTQMarlinState(Enum): @@ -150,7 +151,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w2_input_scale.max(), requires_grad=False) # If rocm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if current_platform.is_rocm(): # Normalize the weights and scales w13_weight, w13_weight_scale, w13_input_scale = \ normalize_e4m3fn_to_e4m3fnuz( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 7270b302ef965..73cc8ce0d2a4b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -12,7 +12,7 @@ from vllm.model_executor.parameter import (ChannelQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter) -from vllm.utils import is_hip +from vllm.platforms import current_platform __all__ = ["CompressedTensorsW8A8Fp8"] @@ -40,7 +40,7 @@ def process_weights_after_loading(self, layer) -> None: logical_widths=layer.logical_widths, ) - if is_hip(): + if current_platform.is_rocm(): weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( weight=weight, weight_scale=max_w_scale, @@ -56,7 +56,7 @@ def process_weights_after_loading(self, layer) -> None: elif self.strategy == QuantizationStrategy.CHANNEL: weight = layer.weight - if is_hip(): + if current_platform.is_rocm(): weight, weight_scale, input_scale = \ normalize_e4m3fn_to_e4m3fnuz( weight=weight, diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index f26907176ad1a..825d01d1b3551 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -19,7 +19,6 @@ from vllm.model_executor.parameter import (ChannelQuantScaleParameter, ModelWeightParameter) from vllm.platforms import current_platform -from vllm.utils import is_hip logger = init_logger(__name__) @@ -127,7 +126,7 @@ def process_weights_after_loading(self, layer: Module) -> None: weight = layer.weight - if is_hip(): + if current_platform.is_rocm(): weight, weight_scale, input_scale = \ normalize_e4m3fn_to_e4m3fnuz( weight=weight, diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index b5feb55db0e74..d34579b7099bb 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -26,7 +26,7 @@ PerTensorScaleParameter) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform -from vllm.utils import is_hip, print_warning_once +from vllm.utils import print_warning_once ACTIVATION_SCHEMES = ["static", "dynamic"] @@ -123,7 +123,7 @@ def __init__(self, quant_config: Fp8Config): self.use_marlin = (not current_platform.has_device_capability(89) or envs.VLLM_TEST_FORCE_FP8_MARLIN) # Disable marlin for rocm - if is_hip(): + if current_platform.is_rocm(): self.use_marlin = False def create_weights( @@ -226,7 +226,7 @@ def process_weights_after_loading(self, layer: Module) -> None: weight_scale = layer.weight_scale # If rocm, use float8_e4m3fnuz. - if is_hip(): + if current_platform.is_rocm(): weight, weight_scale, input_scale = \ normalize_e4m3fn_to_e4m3fnuz( weight=weight, @@ -372,7 +372,7 @@ def process_weights_after_loading(self, layer: Module) -> None: if not self.quant_config.is_checkpoint_fp8_serialized: # If rocm, use float8_e4m3fnuz as dtype fp8_dtype = torch.float8_e4m3fnuz \ - if is_hip() else torch.float8_e4m3fn + if current_platform.is_rocm() else torch.float8_e4m3fn w13_weight = torch.empty_like(layer.w13_weight.data, dtype=fp8_dtype) w2_weight = torch.empty_like(layer.w2_weight.data, dtype=fp8_dtype) @@ -420,7 +420,7 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.w2_input_scale = torch.nn.Parameter( layer.w2_input_scale.max(), requires_grad=False) # If rocm, normalize the weights and scales to e4m3fnuz - if is_hip(): + if current_platform.is_rocm(): # Normalize the weights and scales w13_weight, w13_weight_scale, w13_input_scale = \ normalize_e4m3fn_to_e4m3fnuz( diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 411af922149fd..1879d2855d93d 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -4,16 +4,16 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform -from vllm.utils import is_hip # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. Allocating a dummy tensor to pass as input_scale -TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() if is_hip() else None +TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() \ + if current_platform.is_rocm() else None def cutlass_fp8_supported() -> bool: # cutlass is not supported on Rocm - if is_hip(): + if current_platform.is_rocm(): return False capability_tuple = current_platform.get_device_capability() diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 4126ceb7117d4..22f194c776b69 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -49,9 +49,9 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.exaone import ExaoneConfig -from vllm.utils import is_hip from .interfaces import SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, @@ -595,7 +595,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: if not isinstance(self.transformer.h[layer_idx], nn.Identity): layer_self_attn = self.transformer.h[layer_idx].attn - if is_hip(): + if current_platform.is_rocm(): # The scaling factor convention we are assuming is # quantized_value * scaling_factor ~= true_value # which is consistent with the practice of setting diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 5a397ed8ff6a0..c968817747754 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -49,8 +49,8 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import is_hip from .interfaces import SupportsLoRA, SupportsPP from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers @@ -534,7 +534,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: if not isinstance(self.model.layers[layer_idx], nn.Identity): layer_self_attn = self.model.layers[layer_idx].self_attn - if is_hip(): + if current_platform.is_rocm(): # The scaling factor convention we are assuming is # quantized_value * scaling_factor ~= true_value # which is consistent with the practice of setting diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c346e3e808e3f..b0ca1fe006239 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -50,8 +50,8 @@ default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors, PoolerOutput -from vllm.utils import is_hip from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, @@ -423,7 +423,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: if not isinstance(self.layers[layer_idx], nn.Identity): layer_self_attn = self.layers[layer_idx].self_attn - if is_hip(): + if current_platform.is_rocm(): # The scaling factor convention we are assuming is # quantized_value * scaling_factor ~= true_value # which is consistent with the practice of setting diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f6713ab0898f0..595a9256f958e 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -12,7 +12,7 @@ import torch.nn as nn from vllm.logger import init_logger -from vllm.utils import is_hip +from vllm.platforms import current_platform from .interfaces import (has_inner_state, is_attention_free, supports_multimodal, supports_pp) @@ -247,7 +247,7 @@ def _try_load_model_cls( model_arch: str, model: _BaseRegisteredModel, ) -> Optional[Type[nn.Module]]: - if is_hip(): + if current_platform.is_rocm(): if model_arch in _ROCM_UNSUPPORTED_MODELS: raise ValueError(f"Model architecture '{model_arch}' is not " "supported by ROCm for now.") diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 5a3dd3c02b85b..e3e7ccb5cf179 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -49,8 +49,8 @@ from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors -from vllm.utils import is_hip from .interfaces import SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, @@ -558,7 +558,7 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: if not isinstance(self.model.layers[layer_idx], nn.Identity): layer_self_attn = self.model.layers[layer_idx].self_attn - if is_hip(): + if current_platform.is_rocm(): # The scaling factor convention we are assuming is # quantized_value * scaling_factor ~= true_value # which is consistent with the practice of setting diff --git a/vllm/utils.py b/vllm/utils.py index d4f2c936ca9cc..c3f9a6bdd8b80 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -314,10 +314,6 @@ def reset(self): self._index = 0 -def is_hip() -> bool: - return torch.version.hip is not None - - @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -1098,7 +1094,7 @@ def _cuda_device_count_stateless( if not torch.cuda._is_compiled(): return 0 - if is_hip(): + if current_platform.is_rocm(): # ROCm uses amdsmi instead of nvml for stateless device count # This requires a sufficiently modern version of Torch 2.4.0 raw_count = torch.cuda._device_count_amdsmi() if (hasattr( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 4a287e3741d0f..233a9e664d845 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -41,6 +41,7 @@ from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, MultiModalInputs, MultiModalRegistry) +from vllm.platforms import current_platform from vllm.prompt_adapter.layers import PromptAdapterMapping from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.prompt_adapter.worker_manager import ( @@ -49,7 +50,7 @@ from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.transformers_utils.config import uses_mrope from vllm.utils import (DeviceMemoryProfiler, PyObjectCache, async_tensor_h2d, - flatten_2d_lists, is_hip, is_pin_memory_available, + flatten_2d_lists, is_pin_memory_available, supports_dynamo, weak_ref_tensor) from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, @@ -737,13 +738,13 @@ def _get_cuda_graph_pad_size(self, family of functions. Args: - num_seqs (int): Number of sequences scheduled to run. + num_seqs (int): Number of sequences scheduled to run. max_decode_seq_len (int): Greatest of all the decode sequence lengths. Used only in checking the viablility of using CUDA graphs. max_encoder_seq_len (int, optional): Greatest of all the encode sequence lengths. Defaults to 0. Used only in checking the - viability of using CUDA graphs. + viability of using CUDA graphs. Returns: int: Returns the determined number of padding sequences. If CUDA graphs is not viable, returns -1. @@ -1103,7 +1104,7 @@ def load_model(self) -> None: self.prompt_adapter_manager.create_prompt_adapter_manager( self.model)) - if self.kv_cache_dtype == "fp8" and is_hip(): + if self.kv_cache_dtype == "fp8" and current_platform.is_rocm(): # Currently only ROCm accepts kv-cache scaling factors # via quantization_param_path and this will be deprecated # in the future. From 32176fee733b76b295346870d717d44cb7102944 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 27 Oct 2024 21:58:04 -0700 Subject: [PATCH 015/113] [torch.compile] support moe models (#9632) Signed-off-by: youkaichao --- benchmarks/kernels/benchmark_moe.py | 33 +++--- tests/compile/test_basic_correctness.py | 4 +- tests/kernels/test_awq_marlin.py | 21 ++-- tests/kernels/test_moe.py | 7 +- .../layers/fused_moe/__init__.py | 28 ++++- .../layers/fused_moe/fused_marlin_moe.py | 51 +++++++-- .../layers/fused_moe/fused_moe.py | 100 ++++++++++++++++-- vllm/model_executor/layers/fused_moe/layer.py | 29 +++-- .../layers/quantization/awq_marlin.py | 7 +- .../compressed_tensors_moe.py | 7 +- .../layers/quantization/gptq_marlin.py | 6 +- vllm/model_executor/models/granitemoe.py | 2 + 12 files changed, 217 insertions(+), 78 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index c2ad98b7e2656..4f88e8e6eb1a6 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -88,22 +88,23 @@ def prepare(i: int): input_gating.copy_(gating_output[i]) def run(): - fused_moe( - x, - w1, - w2, - input_gating, - topk, - renormalize=True, - inplace=True, - override_config=config, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a16=use_int8_w8a16, - w1_scale=w1_scale, - w2_scale=w2_scale, - a1_scale=a1_scale, - a2_scale=a2_scale, - ) + from vllm.model_executor.layers.fused_moe import override_config + with override_config(config): + fused_moe( + x, + w1, + w2, + input_gating, + topk, + renormalize=True, + inplace=True, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a16=use_int8_w8a16, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + ) # JIT compilation & warmup run() diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 77c56d91d0a8b..6aa27b24b4a6e 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -13,11 +13,11 @@ @pytest.mark.parametrize( "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph", [ - ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASH_ATTN", "generate", True), + ("meta-llama/Llama-3.2-1B", [], 2, 2, "FLASHINFER", "generate", True), ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", ["--quantization", "compressed-tensors" ], 1, 1, "FLASH_ATTN", "generate", True), - ("google/gemma-2-2b-it", [], 1, 2, "FLASHINFER", "generate", True), + ("ibm/PowerMoE-3b", [], 1, 2, "FLASH_ATTN", "generate", True), # TODO: add multi-modality test for llava ("llava-hf/llava-1.5-7b-hf", [], 2, 1, "FLASHINFER", "generate", False) ]) diff --git a/tests/kernels/test_awq_marlin.py b/tests/kernels/test_awq_marlin.py index 0f0a2b24563fd..59917dd2c58ad 100644 --- a/tests/kernels/test_awq_marlin.py +++ b/tests/kernels/test_awq_marlin.py @@ -5,11 +5,10 @@ import pytest import torch +import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import (compute_max_diff, stack_and_dev, torch_moe, torch_moe_single) from vllm import _custom_ops as ops -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( awq_marlin_quantize) @@ -81,7 +80,7 @@ def test_fused_marlin_moe_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) topk_weights, topk_ids = fused_topk(a, score, topk, False) - marlin_output = fused_marlin_moe( + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, qweight2, @@ -150,14 +149,14 @@ def test_single_marlin_moe_multiply_awq( score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe(a, - qweight, - scales, - score, - topk, - renormalize=False, - w_zeros=zp, - num_bits=num_bits) + marlin_output = torch.ops.vllm.single_marlin_moe(a, + qweight, + scales, + score, + topk, + renormalize=False, + w_zeros=zp, + num_bits=num_bits) torch_output = torch_moe_single(a, w_ref.transpose(1, 2), score, topk) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 4bfc089c82179..70906ab2187bc 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -7,12 +7,11 @@ from transformers import MixtralConfig from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock +import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import (compute_max_diff, opcheck, stack_and_dev, torch_moe, torch_moe_single) from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe import fused_moe -from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe, single_marlin_moe) from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_topk, moe_align_block_size) from vllm.model_executor.layers.quantization.utils.marlin_utils_test import ( @@ -193,7 +192,7 @@ def test_fused_marlin_moe( topk, renormalize=False, ) - marlin_output = fused_marlin_moe( + marlin_output = torch.ops.vllm.fused_marlin_moe( a, qweight1, qweight2, @@ -309,7 +308,7 @@ def test_single_marlin_moe_multiply( sort_indices = stack_and_dev(sort_indices_l) score = torch.randn((m, e), device="cuda", dtype=dtype) - marlin_output = single_marlin_moe( + marlin_output = torch.ops.vllm.single_marlin_moe( a, qweight, scales, diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index e9b5703ca28be..c4223d12600ac 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -1,23 +1,43 @@ +from contextlib import contextmanager +from typing import Any, Dict, Optional + from vllm.model_executor.layers.fused_moe.layer import ( FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) from vllm.triton_utils import HAS_TRITON +_config: Optional[Dict[str, Any]] = None + + +@contextmanager +def override_config(config): + global _config + old_config = _config + _config = config + yield + _config = old_config + + +def get_config() -> Optional[Dict[str, Any]]: + return _config + + __all__ = [ "FusedMoE", "FusedMoEMethodBase", "FusedMoeWeightScaleSupported", + "override_config", + "get_config", ] if HAS_TRITON: - from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe, single_marlin_moe) + # import to register the custom ops + import vllm.model_executor.layers.fused_moe.fused_marlin_moe # noqa + import vllm.model_executor.layers.fused_moe.fused_moe # noqa from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, fused_moe, fused_topk, get_config_file_name, grouped_topk) __all__ += [ - "fused_marlin_moe", - "single_marlin_moe", "fused_moe", "fused_topk", "fused_experts", diff --git a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py index 5ae40a2af5a2b..93019d0d0abb6 100644 --- a/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_marlin_moe.py @@ -1,6 +1,6 @@ """Fused MoE utilities for GPTQ.""" import functools -from typing import Any, Dict, Optional +from typing import Optional import torch @@ -18,6 +18,7 @@ def get_scalar_type(num_bits: int, has_zp: bool): return scalar_types.uint4b8 if num_bits == 4 else scalar_types.uint8b128 +@torch.library.custom_op("vllm::single_marlin_moe", mutates_args=[]) def single_marlin_moe( hidden_states: torch.Tensor, w: torch.Tensor, @@ -28,7 +29,6 @@ def single_marlin_moe( g_idx: Optional[torch.Tensor] = None, sort_indices: Optional[torch.Tensor] = None, w_zeros: Optional[torch.Tensor] = None, - override_config: Optional[Dict[str, Any]] = None, num_bits: int = 8, is_k_full: bool = True, ) -> torch.Tensor: @@ -49,8 +49,6 @@ def single_marlin_moe( - topk (int): The number of top-k experts to select. - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - w_zeros (Optional[torch.Tensor]): Optional zero points to be used for w. - - override_config (Optional[Dict[str, Any]]): Optional override - for the kernel configuration. - num_bits (bool): The number of bits in expert weights quantization. Returns: @@ -79,7 +77,6 @@ def single_marlin_moe( w.shape, topk_ids.shape[1], None, - override_config=override_config, is_marlin=True) config = get_config_func(M) @@ -122,6 +119,24 @@ def single_marlin_moe( return torch.sum(intermediate_cache.view(*intermediate_cache.shape), dim=1) +@single_marlin_moe.register_fake +def _( + hidden_states: torch.Tensor, + w: torch.Tensor, + scales: torch.Tensor, + gating_output: torch.Tensor, + topk: int, + renormalize: bool, + g_idx: Optional[torch.Tensor] = None, + sort_indices: Optional[torch.Tensor] = None, + w_zeros: Optional[torch.Tensor] = None, + num_bits: int = 8, + is_k_full: bool = True, +) -> torch.Tensor: + return torch.empty_like(hidden_states) + + +@torch.library.custom_op("vllm::fused_marlin_moe", mutates_args=[]) def fused_marlin_moe( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -137,7 +152,6 @@ def fused_marlin_moe( sort_indices2: Optional[torch.Tensor] = None, w1_zeros: Optional[torch.Tensor] = None, w2_zeros: Optional[torch.Tensor] = None, - override_config: Optional[Dict[str, Any]] = None, num_bits: int = 8, is_k_full: bool = True, ) -> torch.Tensor: @@ -161,8 +175,6 @@ def fused_marlin_moe( permutation. - topk_weights (torch.Tensor): Top-k weights. - topk_ids (torch.Tensor): Indices of topk-k elements. - - override_config (Optional[Dict[str, Any]]): Optional override - for the kernel configuration. - w1_zeros (Optional[torch.Tensor]): Optional zero points to be used for w1. - w2_zeros (Optional[torch.Tensor]): Optional zero points to be used for w2. - num_bits (bool): The number of bits in expert weights quantization. @@ -209,7 +221,6 @@ def fused_marlin_moe( w2.shape, topk_ids.shape[1], None, - override_config=override_config, is_marlin=True, ) config = get_config_func(M) @@ -311,3 +322,25 @@ def fused_marlin_moe( return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape), dim=1) + + +@fused_marlin_moe.register_fake +def _( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + gating_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + g_idx1: Optional[torch.Tensor] = None, + g_idx2: Optional[torch.Tensor] = None, + sort_indices1: Optional[torch.Tensor] = None, + sort_indices2: Optional[torch.Tensor] = None, + w1_zeros: Optional[torch.Tensor] = None, + w2_zeros: Optional[torch.Tensor] = None, + num_bits: int = 8, + is_k_full: bool = True, +) -> torch.Tensor: + return torch.empty_like(hidden_states) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 90a4209b5bce5..1cf5c2253ca0b 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -358,9 +358,10 @@ def try_get_optimal_moe_config( top_k: int, dtype: Optional[str], M: int, - override_config: Optional[Dict[str, Any]] = None, is_marlin: bool = False, ): + from vllm.model_executor.layers.fused_moe import get_config + override_config = get_config() if override_config: config = override_config else: @@ -465,19 +466,109 @@ def get_config_dtype_str(dtype: torch.dtype, return None +@torch.library.custom_op("vllm::inplace_fused_experts", + mutates_args=["hidden_states"]) +def inplace_fused_experts(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None) -> None: + fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, True, + use_fp8_w8a8, use_int8_w8a16, w1_scale, w2_scale, + a1_scale, a2_scale) + + +@inplace_fused_experts.register_fake +def _(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None) -> None: + pass + + +@torch.library.custom_op("vllm::outplace_fused_experts", mutates_args=[]) +def outplace_fused_experts( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: + return fused_experts_impl(hidden_states, w1, w2, topk_weights, topk_ids, + False, use_fp8_w8a8, use_int8_w8a16, w1_scale, + w2_scale, a1_scale, a2_scale) + + +@outplace_fused_experts.register_fake +def _(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None) -> torch.Tensor: + return torch.empty_like(hidden_states) + + def fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, topk_weights: torch.Tensor, topk_ids: torch.Tensor, inplace: bool = False, - override_config: Optional[Dict[str, Any]] = None, use_fp8_w8a8: bool = False, use_int8_w8a16: bool = False, w1_scale: Optional[torch.Tensor] = None, w2_scale: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None): + if inplace: + torch.ops.vllm.inplace_fused_experts(hidden_states, w1, w2, + topk_weights, topk_ids, + use_fp8_w8a8, use_int8_w8a16, + w1_scale, w2_scale, a1_scale, + a2_scale) + return hidden_states + else: + return torch.ops.vllm.outplace_fused_experts( + hidden_states, w1, w2, topk_weights, topk_ids, use_fp8_w8a8, + use_int8_w8a16, w1_scale, w2_scale, a1_scale, a2_scale) + + +def fused_experts_impl(hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a16: bool = False, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None): # Check constraints. assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" @@ -504,7 +595,6 @@ def fused_experts(hidden_states: torch.Tensor, w2.shape, topk_ids.shape[1], config_dtype, - override_config=override_config, ) config = get_config_func(M) @@ -602,7 +692,6 @@ def fused_moe( topk: int, renormalize: bool, inplace: bool = False, - override_config: Optional[Dict[str, Any]] = None, use_grouped_topk: bool = False, num_expert_group: Optional[int] = None, topk_group: Optional[int] = None, @@ -628,8 +717,6 @@ def fused_moe( - renormalize (bool): If True, renormalize the top-k weights to sum to 1. - inplace (bool): If True, perform the operation in-place. Defaults to False. - - override_config (Optional[Dict[str, Any]]): Optional override - for the kernel configuration. - num_expert_group: Optional[int]: additional parameter for grouped_topk - topk_group: Optional[int]: additional parameter for grouped_topk - use_grouped_topk: If True, use grouped_topk instead of fused_topk @@ -667,7 +754,6 @@ def fused_moe( topk_weights, topk_ids, inplace=inplace, - override_config=override_config, use_fp8_w8a8=use_fp8_w8a8, use_int8_w8a16=use_int8_w8a16, w1_scale=w1_scale, diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 8dd36620e3fa0..5570771ac917b 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -12,7 +12,16 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs - +from vllm.platforms import current_platform + +if current_platform.is_cuda_alike(): + from .fused_moe import fused_experts +else: + fused_experts = None # type: ignore +if current_platform.is_tpu(): + from .moe_pallas import fused_moe as fused_moe_pallas +else: + fused_moe_pallas = None # type: ignore logger = init_logger(__name__) @@ -96,9 +105,6 @@ def forward_cuda( num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts) - topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -132,17 +138,18 @@ def forward_tpu( num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe assert not use_grouped_topk assert num_expert_group is None assert topk_group is None assert custom_routing_function is None - return fused_moe(hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk=top_k, - gating_output=router_logits, - renormalize=renormalize) + return fused_moe_pallas(hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk=top_k, + gating_output=router_logits, + renormalize=renormalize) + + forward_native = forward_cuda class FusedMoE(torch.nn.Module): diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index b3d93b285769c..95ec12daeeeb5 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -3,6 +3,7 @@ import torch from torch.nn import Parameter +import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( @@ -435,10 +436,6 @@ def apply( topk_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, ) -> torch.Tensor: - - from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe) - topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -449,7 +446,7 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function) - return fused_marlin_moe( + return torch.ops.vllm.fused_marlin_moe( x, layer.w13_qweight, layer.w2_qweight, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index be3d3985a74ad..dad04017d3212 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -6,6 +6,7 @@ from compressed_tensors import CompressionFormat from compressed_tensors.quantization import QuantizationStrategy +import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported) @@ -481,10 +482,6 @@ def apply( topk_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, ) -> torch.Tensor: - - from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe) - topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -495,7 +492,7 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function) - return fused_marlin_moe( + return torch.ops.vllm.fused_marlin_moe( x, layer.w13_weight_packed, layer.w2_weight_packed, diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index e77191796bd7e..b97dd108d6785 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -2,6 +2,7 @@ import torch +import vllm.model_executor.layers.fused_moe # noqa from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.layer import ( @@ -536,9 +537,6 @@ def apply( topk_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - fused_marlin_moe) - # The input must currently be float16 orig_dtype = x.dtype x = x.half() @@ -553,7 +551,7 @@ def apply( num_expert_group=num_expert_group, custom_routing_function=None) - return fused_marlin_moe( + return torch.ops.vllm.fused_marlin_moe( x, layer.w13_qweight, layer.w2_qweight, diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index fd0d4c89a28fe..5307bb21adb96 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -28,6 +28,7 @@ from transformers.models.granitemoe import GraniteMoeConfig from vllm.attention import Attention, AttentionMetadata +from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, LoRAConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE @@ -244,6 +245,7 @@ def forward( return hidden_states +@support_torch_compile class GraniteMoeModel(nn.Module): def __init__( From feb92fbe4ab6803527df48658a87ebd00b99969f Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Mon, 28 Oct 2024 02:59:37 -0400 Subject: [PATCH 016/113] Fix beam search eos (#9627) --- vllm/engine/protocol.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 5c504e0f0217d..b00dd136d4a47 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -140,7 +140,12 @@ async def beam_search( best_beams = sorted_completed[:beam_width] for beam in best_beams: - beam.text = tokenizer.decode(beam.tokens[tokenized_length:]) + if (beam.tokens[-1] == tokenizer.eos_token_id and not ignore_eos): + # Skip the eos token in the text. + tokens = beam.tokens[tokenized_length:-1] + else: + tokens = beam.tokens[tokenized_length:] + beam.text = tokenizer.decode(tokens) beam_search_output = RequestOutput( request_id=request_id, From 2adb4409e0359039135b5aa6501994da12aa5a26 Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Mon, 28 Oct 2024 15:13:03 +0800 Subject: [PATCH 017/113] [Bugfix] Fix ray instance detect issue (#9439) --- vllm/executor/ray_utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index aa546ebada473..993d279890820 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -232,9 +232,16 @@ def initialize_ray_cluster( # Connect to a ray cluster. if current_platform.is_rocm() or current_platform.is_xpu(): - ray.init(address=ray_address, - ignore_reinit_error=True, - num_gpus=parallel_config.world_size) + # Try to connect existing ray instance and create a new one if not found + try: + ray.init("auto") + except ConnectionError: + logger.warning( + "No existing RAY instance detected. " + "A new instance will be launched with current node resources.") + ray.init(address=ray_address, + ignore_reinit_error=True, + num_gpus=parallel_config.world_size) else: ray.init(address=ray_address, ignore_reinit_error=True) From 8b0e4f2ad7b5a3ddd6d61acbe8ceb50b4ea3c309 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Mon, 28 Oct 2024 12:38:09 -0400 Subject: [PATCH 018/113] [CI/Build] Adopt Mergify for auto-labeling PRs (#9259) Signed-off-by: Russell Bryant --- .github/mergify.yml | 57 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 .github/mergify.yml diff --git a/.github/mergify.yml b/.github/mergify.yml new file mode 100644 index 0000000000000..2a3dee7c662d1 --- /dev/null +++ b/.github/mergify.yml @@ -0,0 +1,57 @@ +pull_request_rules: +- name: label-documentation + description: Automatically apply documentation label + conditions: + - or: + - files~=^[^/]+\.md$ + - files~=^docs/ + actions: + label: + add: + - documentation + +- name: label-ci-build + description: Automatically apply ci/build label + conditions: + - files~=^\.github/ + - files~=\.buildkite/ + - files~=^cmake/ + - files=CMakeLists.txt + - files~=^Dockerfile + - files~=^requirements.*\.txt + - files=setup.py + actions: + label: + add: + - ci/build + +- name: label-frontend + description: Automatically apply frontend label + conditions: + - files~=^vllm/entrypoints/ + actions: + label: + add: + - frontend + +- name: ping author on conflicts and add 'needs-rebase' label + conditions: + - conflict + - -closed + actions: + label: + add: + - needs-rebase + comment: + message: | + This pull request has merge conflicts that must be resolved before it can be + merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork + +- name: remove 'needs-rebase' label when conflict is resolved + conditions: + - -conflict + - -closed + actions: + label: + remove: + - needs-rebase From 5f8d8075f957d5376b2f1cc451e35a2a757e95a5 Mon Sep 17 00:00:00 2001 From: litianjian <45817262+litianjian@users.noreply.github.com> Date: Tue, 29 Oct 2024 02:04:10 +0800 Subject: [PATCH 019/113] [Model][VLM] Add multi-video support for LLaVA-Onevision (#8905) Co-authored-by: litianjian Co-authored-by: DarkLight1337 --- .../vision_language/test_llava_onevision.py | 173 +++++------------- vllm/model_executor/models/clip.py | 4 +- vllm/model_executor/models/llava_onevision.py | 94 +++++++--- vllm/model_executor/models/siglip.py | 4 +- vllm/multimodal/video.py | 10 +- 5 files changed, 123 insertions(+), 162 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index 367f25f446279..1616fd299b9aa 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Type, overload +from typing import List, Optional, Tuple, Type import pytest from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, @@ -9,9 +9,8 @@ from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, VllmRunner, - _VideoAssets) -from ....utils import large_gpu_test +from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, + PromptVideoInput, VllmRunner) from ...utils import check_logprobs_close # Video test @@ -20,7 +19,7 @@ "<|im_start|>user\n