From 6c6f7fe8a850ca08f9a8774de020163a2a7c2164 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Fri, 27 Dec 2024 16:45:25 +0800 Subject: [PATCH 001/462] [Platform] Move model arch check to platform (#11503) Signed-off-by: Mengqing Cao --- vllm/model_executor/models/registry.py | 37 +----------------------- vllm/platforms/interface.py | 12 ++++++++ vllm/platforms/rocm.py | 39 +++++++++++++++++++++++++- 3 files changed, 51 insertions(+), 37 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index feb33bb373c3e..89992de7e238d 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -187,31 +187,6 @@ **_SPECULATIVE_DECODING_MODELS, } -# Models not supported by ROCm. -_ROCM_UNSUPPORTED_MODELS: List[str] = [] - -# Models partially supported by ROCm. -# Architecture -> Reason. -_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in " - "Triton flash attention. For half-precision SWA support, " - "please use CK flash attention by setting " - "`VLLM_USE_TRITON_FLASH_ATTN=0`") -_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { - "Qwen2ForCausalLM": - _ROCM_SWA_REASON, - "MistralForCausalLM": - _ROCM_SWA_REASON, - "MixtralForCausalLM": - _ROCM_SWA_REASON, - "PaliGemmaForConditionalGeneration": - ("ROCm flash attention does not yet " - "fully support 32-bit precision on PaliGemma"), - "Phi3VForCausalLM": - ("ROCm Triton flash attention may run into compilation errors due to " - "excessive use of shared memory. If this happens, disable Triton FA " - "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") -} - @dataclass(frozen=True) class _ModelInfo: @@ -297,17 +272,7 @@ def _try_load_model_cls( model_arch: str, model: _BaseRegisteredModel, ) -> Optional[Type[nn.Module]]: - if current_platform.is_rocm(): - if model_arch in _ROCM_UNSUPPORTED_MODELS: - raise ValueError(f"Model architecture '{model_arch}' is not " - "supported by ROCm for now.") - - if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: - msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] - logger.warning( - "Model architecture '%s' is partially " - "supported by ROCm: %s", model_arch, msg) - + current_platform.verify_model_arch(model_arch) try: return model.load_model_cls() except Exception: diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 4150b0cdf836a..ddccaa2ce0148 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -199,6 +199,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: """ pass + @classmethod + def verify_model_arch(cls, model_arch: str) -> None: + """ + Verify whether the current platform supports the specified model + architecture. + + - This will raise an Error or Warning based on the model support on + the current platform. + - By default all models are considered supported. + """ + pass + @classmethod def verify_quantization(cls, quant: str) -> None: """ diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 7778b565372cb..aa779f265135f 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -1,6 +1,6 @@ import os from functools import lru_cache -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING, Dict, List, Optional import torch @@ -33,6 +33,31 @@ " `spawn` instead.") os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +# Models not supported by ROCm. +_ROCM_UNSUPPORTED_MODELS: List[str] = [] + +# Models partially supported by ROCm. +# Architecture -> Reason. +_ROCM_SWA_REASON = ("Sliding window attention (SWA) is not yet supported in " + "Triton flash attention. For half-precision SWA support, " + "please use CK flash attention by setting " + "`VLLM_USE_TRITON_FLASH_ATTN=0`") +_ROCM_PARTIALLY_SUPPORTED_MODELS: Dict[str, str] = { + "Qwen2ForCausalLM": + _ROCM_SWA_REASON, + "MistralForCausalLM": + _ROCM_SWA_REASON, + "MixtralForCausalLM": + _ROCM_SWA_REASON, + "PaliGemmaForConditionalGeneration": + ("ROCm flash attention does not yet " + "fully support 32-bit precision on PaliGemma"), + "Phi3VForCausalLM": + ("ROCm Triton flash attention may run into compilation errors due to " + "excessive use of shared memory. If this happens, disable Triton FA " + "by setting `VLLM_USE_TRITON_FLASH_ATTN=0`") +} + class RocmPlatform(Platform): _enum = PlatformEnum.ROCM @@ -102,6 +127,18 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: else: parallel_config.worker_cls = "vllm.worker.worker.Worker" + @classmethod + def verify_model_arch(cls, model_arch: str) -> None: + if model_arch in _ROCM_UNSUPPORTED_MODELS: + raise ValueError(f"Model architecture '{model_arch}' is not " + "supported by ROCm for now.") + + if model_arch in _ROCM_PARTIALLY_SUPPORTED_MODELS: + msg = _ROCM_PARTIALLY_SUPPORTED_MODELS[model_arch] + logger.warning( + "Model architecture '%s' is partially " + "supported by ROCm: %s", model_arch, msg) + @classmethod def verify_quantization(cls, quant: str) -> None: super().verify_quantization(quant) From d003f3ea391b4c879f6f848dd485dd3c04fa6ca9 Mon Sep 17 00:00:00 2001 From: AlexHe99 Date: Fri, 27 Dec 2024 18:00:04 +0800 Subject: [PATCH 002/462] Update deploying_with_k8s.md with AMD ROCm GPU example (#11465) Signed-off-by: Alex He Co-authored-by: Cyrus Leung --- docs/source/serving/deploying_with_k8s.md | 79 ++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md index d27db826cd006..77f848088ea43 100644 --- a/docs/source/serving/deploying_with_k8s.md +++ b/docs/source/serving/deploying_with_k8s.md @@ -47,7 +47,11 @@ data: token: "REPLACE_WITH_TOKEN" ``` -Create a deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model: +Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. + +Here are two examples for using NVIDIA GPU and AMD GPU. + +- NVIDIA GPU ```yaml apiVersion: apps/v1 @@ -119,6 +123,79 @@ spec: periodSeconds: 5 ``` +- AMD GPU + +You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mistral-7b + namespace: default + labels: + app: mistral-7b +spec: + replicas: 1 + selector: + matchLabels: + app: mistral-7b + template: + metadata: + labels: + app: mistral-7b + spec: + volumes: + # PVC + - name: cache-volume + persistentVolumeClaim: + claimName: mistral-7b + # vLLM needs to access the host's shared memory for tensor parallel inference. + - name: shm + emptyDir: + medium: Memory + sizeLimit: "8Gi" + hostNetwork: true + hostIPC: true + containers: + - name: mistral-7b + image: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + securityContext: + seccompProfile: + type: Unconfined + runAsGroup: 44 + capabilities: + add: + - SYS_PTRACE + command: ["/bin/sh", "-c"] + args: [ + "vllm serve mistralai/Mistral-7B-v0.3 --port 8000 --trust-remote-code --enable-chunked-prefill --max_num_batched_tokens 1024" + ] + env: + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + ports: + - containerPort: 8000 + resources: + limits: + cpu: "10" + memory: 20G + amd.com/gpu: "1" + requests: + cpu: "6" + memory: 6G + amd.com/gpu: "1" + volumeMounts: + - name: cache-volume + mountPath: /root/.cache/huggingface + - name: shm + mountPath: /dev/shm +``` +You can get the full example with steps and sample yaml files from . + 2. **Create a Kubernetes Service for vLLM** Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: From 2c9b8ea2b006e763b8268b8ab02181c9822cfe76 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 27 Dec 2024 18:39:15 +0800 Subject: [PATCH 003/462] [Bugfix] Fix TeleChat2ForCausalLM weights mapper (#11546) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/telechat2.py | 26 ++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 28c37bb96612c..02ca7fe08e556 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -31,19 +31,6 @@ class TeleChat2Model(LlamaModel): - hf_to_vllm_mapper = WeightsMapper( - orig_to_new_prefix={ - "transformer.": "model.", - }, - orig_to_new_substr={ - ".h.": ".layers.", - ".self_attention.": ".self_attn.", - ".word_embeddings.": ".embed_tokens.", - ".dense.": ".o_proj.", - ".ln_f.": ".norm.", - }, - ) - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # 1. Initialize the LlamaModel with bias vllm_config.model_config.hf_config.bias = True @@ -118,6 +105,19 @@ def load_weights(self, weights: Iterable[Tuple[str, class TeleChat2ForCausalLM(LlamaForCausalLM): + hf_to_vllm_mapper = WeightsMapper( + orig_to_new_prefix={ + "transformer.": "model.", + }, + orig_to_new_substr={ + ".h.": ".layers.", + ".self_attention.": ".self_attn.", + ".word_embeddings.": ".embed_tokens.", + ".dense.": ".o_proj.", + ".ln_f.": ".norm.", + }, + ) + def _init_model(self, vllm_config: VllmConfig, prefix: str = ""): return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) From 7af553ea30031446b4c1c74ad83187f9fd3de4e7 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 27 Dec 2024 19:21:23 +0800 Subject: [PATCH 004/462] [Misc] Abstract the logic for reading and writing media content (#11527) Signed-off-by: DarkLight1337 --- tests/entrypoints/openai/test_serving_chat.py | 1 + tests/entrypoints/test_chat_utils.py | 6 +- tests/multimodal/test_utils.py | 59 ++- vllm/assets/audio.py | 6 +- vllm/entrypoints/chat_utils.py | 129 +++-- vllm/multimodal/audio.py | 36 +- vllm/multimodal/base.py | 38 +- vllm/multimodal/image.py | 41 +- vllm/multimodal/utils.py | 477 ++++++++---------- vllm/multimodal/video.py | 87 +++- 10 files changed, 493 insertions(+), 387 deletions(-) diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 51b255bb2a6db..61677b65af342 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -33,6 +33,7 @@ class MockModelConfig: hf_config = MockHFConfig() logits_processor_pattern = None diff_sampling_param: Optional[dict] = None + allowed_local_media_path: str = "" def get_diff_sampling_param(self): return self.diff_sampling_param or {} diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 996e60bfee592..d63b963522e73 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -2,7 +2,6 @@ from typing import Optional import pytest -from PIL import Image from vllm.assets.image import ImageAsset from vllm.config import ModelConfig @@ -91,10 +90,7 @@ def _assert_mm_data_is_image_input( image_data = mm_data.get("image") assert image_data is not None - if image_count == 1: - assert isinstance(image_data, Image.Image) - else: - assert isinstance(image_data, list) and len(image_data) == image_count + assert isinstance(image_data, list) and len(image_data) == image_count def test_parse_chat_messages_single_image( diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index fd82fb0c55fd7..6029f2e514772 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -9,7 +9,7 @@ from PIL import Image, ImageChops from transformers import AutoConfig, AutoTokenizer -from vllm.multimodal.utils import (async_fetch_image, fetch_image, +from vllm.multimodal.utils import (MediaConnector, repeat_and_pad_placeholder_tokens) # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) @@ -23,7 +23,12 @@ @pytest.fixture(scope="module") def url_images() -> Dict[str, Image.Image]: - return {image_url: fetch_image(image_url) for image_url in TEST_IMAGE_URLS} + connector = MediaConnector() + + return { + image_url: connector.fetch_image(image_url) + for image_url in TEST_IMAGE_URLS + } def get_supported_suffixes() -> Tuple[str, ...]: @@ -43,8 +48,10 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool: @pytest.mark.asyncio @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_fetch_image_http(image_url: str): - image_sync = fetch_image(image_url) - image_async = await async_fetch_image(image_url) + connector = MediaConnector() + + image_sync = connector.fetch_image(image_url) + image_async = await connector.fetch_image_async(image_url) assert _image_equals(image_sync, image_async) @@ -53,6 +60,7 @@ async def test_fetch_image_http(image_url: str): @pytest.mark.parametrize("suffix", get_supported_suffixes()) async def test_fetch_image_base64(url_images: Dict[str, Image.Image], image_url: str, suffix: str): + connector = MediaConnector() url_image = url_images[image_url] try: @@ -75,48 +83,49 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image], base64_image = base64.b64encode(f.read()).decode("utf-8") data_url = f"data:{mime_type};base64,{base64_image}" - data_image_sync = fetch_image(data_url) + data_image_sync = connector.fetch_image(data_url) if _image_equals(url_image, Image.open(f)): assert _image_equals(url_image, data_image_sync) else: pass # Lossy format; only check that image can be opened - data_image_async = await async_fetch_image(data_url) + data_image_async = await connector.fetch_image_async(data_url) assert _image_equals(data_image_sync, data_image_async) @pytest.mark.asyncio @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS) async def test_fetch_image_local_files(image_url: str): + connector = MediaConnector() + with TemporaryDirectory() as temp_dir: - origin_image = fetch_image(image_url) + local_connector = MediaConnector(allowed_local_media_path=temp_dir) + + origin_image = connector.fetch_image(image_url) origin_image.save(os.path.join(temp_dir, os.path.basename(image_url)), quality=100, icc_profile=origin_image.info.get('icc_profile')) - image_async = await async_fetch_image( - f"file://{temp_dir}/{os.path.basename(image_url)}", - allowed_local_media_path=temp_dir) - - image_sync = fetch_image( - f"file://{temp_dir}/{os.path.basename(image_url)}", - allowed_local_media_path=temp_dir) + image_async = await local_connector.fetch_image_async( + f"file://{temp_dir}/{os.path.basename(image_url)}") + image_sync = local_connector.fetch_image( + f"file://{temp_dir}/{os.path.basename(image_url)}") # Check that the images are equal assert not ImageChops.difference(image_sync, image_async).getbbox() - with pytest.raises(ValueError): - await async_fetch_image( - f"file://{temp_dir}/../{os.path.basename(image_url)}", - allowed_local_media_path=temp_dir) - with pytest.raises(ValueError): - await async_fetch_image( + with pytest.raises(ValueError, match="must be a subpath"): + await local_connector.fetch_image_async( + f"file://{temp_dir}/../{os.path.basename(image_url)}") + with pytest.raises(RuntimeError, match="Cannot load local files"): + await connector.fetch_image_async( f"file://{temp_dir}/../{os.path.basename(image_url)}") - with pytest.raises(ValueError): - fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}", - allowed_local_media_path=temp_dir) - with pytest.raises(ValueError): - fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}") + with pytest.raises(ValueError, match="must be a subpath"): + local_connector.fetch_image( + f"file://{temp_dir}/../{os.path.basename(image_url)}") + with pytest.raises(RuntimeError, match="Cannot load local files"): + connector.fetch_image( + f"file://{temp_dir}/../{os.path.basename(image_url)}") @pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-mistral-7b-hf"]) diff --git a/vllm/assets/audio.py b/vllm/assets/audio.py index 9033644e3264a..a46c67ad7e00e 100644 --- a/vllm/assets/audio.py +++ b/vllm/assets/audio.py @@ -21,12 +21,10 @@ class AudioAsset: name: Literal["winning_call", "mary_had_lamb"] @property - def audio_and_sample_rate(self) -> tuple[npt.NDArray, int]: + def audio_and_sample_rate(self) -> tuple[npt.NDArray, float]: audio_path = get_vllm_public_assets(filename=f"{self.name}.ogg", s3_prefix=ASSET_DIR) - y, sr = librosa.load(audio_path, sr=None) - assert isinstance(sr, int) - return y, sr + return librosa.load(audio_path, sr=None) @property def url(self) -> str: diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3df08c740d65b..a492d5496e025 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -6,7 +6,7 @@ from functools import lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, - Literal, Mapping, Optional, Tuple, TypeVar, Union, cast) + Literal, Optional, Tuple, TypeVar, Union, cast) import jinja2.nodes import transformers.utils.chat_template_utils as hf_chat_utils @@ -23,6 +23,8 @@ ChatCompletionMessageParam as OpenAIChatCompletionMessageParam) from openai.types.chat import (ChatCompletionMessageToolCallParam, ChatCompletionToolMessageParam) +from openai.types.chat.chat_completion_content_part_input_audio_param import ( + InputAudio) # yapf: enable # pydantic needs the TypedDict from typing_extensions from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast @@ -31,11 +33,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.multimodal import MultiModalDataDict -from vllm.multimodal.utils import (async_get_and_parse_audio, - async_get_and_parse_image, - async_get_and_parse_video, - get_and_parse_audio, get_and_parse_image, - get_and_parse_video) +from vllm.multimodal.utils import MediaConnector from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import print_warning_once @@ -368,14 +366,17 @@ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): self._tokenizer = tokenizer self._allowed_items = (model_config.multimodal_config.limit_per_prompt if model_config.multimodal_config else {}) - self._consumed_items = {k: 0 for k in self._allowed_items} - self._items: List[_T] = [] + self._items_by_modality = defaultdict[str, list[_T]](list) @property def model_config(self) -> ModelConfig: return self._model_config + @property + def allowed_local_media_path(self): + return self._model_config.allowed_local_media_path + @staticmethod @lru_cache(maxsize=None) def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: @@ -435,38 +436,19 @@ def _placeholder_str(self, modality: ModalityStr, else: raise TypeError(f"Unknown modality: {modality}") - @staticmethod - def _combine(items: List[MultiModalDataDict]) -> MultiModalDataDict: - mm_lists: Mapping[str, List[object]] = defaultdict(list) - - # Merge all the multi-modal items - for single_mm_data in items: - for mm_key, mm_item in single_mm_data.items(): - if isinstance(mm_item, list): - mm_lists[mm_key].extend(mm_item) - else: - mm_lists[mm_key].append(mm_item) - - # Unpack any single item lists for models that don't expect multiple. - return { - mm_key: mm_list[0] if len(mm_list) == 1 else mm_list - for mm_key, mm_list in mm_lists.items() - } - def add(self, modality: ModalityStr, item: _T) -> Optional[str]: """ Add a multi-modal item to the current prompt and returns the placeholder string to use, if any. """ allowed_count = self._allowed_items.get(modality, 1) - current_count = self._consumed_items.get(modality, 0) + 1 + current_count = len(self._items_by_modality[modality]) + 1 if current_count > allowed_count: raise ValueError( f"At most {allowed_count} {modality}(s) may be provided in " "one request.") - self._consumed_items[modality] = current_count - self._items.append(item) + self._items_by_modality[modality].append(item) return self._placeholder_str(modality, current_count) @@ -475,22 +457,26 @@ def create_parser(self) -> "BaseMultiModalContentParser": raise NotImplementedError -class MultiModalItemTracker(BaseMultiModalItemTracker[MultiModalDataDict]): +class MultiModalItemTracker(BaseMultiModalItemTracker[object]): def all_mm_data(self) -> Optional[MultiModalDataDict]: - return self._combine(self._items) if self._items else None + if self._items_by_modality: + return dict(self._items_by_modality) + + return None def create_parser(self) -> "BaseMultiModalContentParser": return MultiModalContentParser(self) -class AsyncMultiModalItemTracker( - BaseMultiModalItemTracker[Awaitable[MultiModalDataDict]]): +class AsyncMultiModalItemTracker(BaseMultiModalItemTracker[Awaitable[object]]): async def all_mm_data(self) -> Optional[MultiModalDataDict]: - if self._items: - items = await asyncio.gather(*self._items) - return self._combine(items) + if self._items_by_modality: + return { + modality: await asyncio.gather(*items) + for modality, items in self._items_by_modality.items() + } return None @@ -522,7 +508,7 @@ def parse_audio(self, audio_url: str) -> None: raise NotImplementedError @abstractmethod - def parse_input_audio(self, input_audio: Dict[str, str]) -> None: + def parse_input_audio(self, input_audio: InputAudio) -> None: raise NotImplementedError @abstractmethod @@ -537,31 +523,31 @@ def __init__(self, tracker: MultiModalItemTracker) -> None: self._tracker = tracker + self._connector = MediaConnector( + allowed_local_media_path=tracker.allowed_local_media_path, + ) + def parse_image(self, image_url: str) -> None: - image = get_and_parse_image(image_url, - allowed_local_media_path=self._tracker. - _model_config.allowed_local_media_path) + image = self._connector.fetch_image(image_url) placeholder = self._tracker.add("image", image) self._add_placeholder(placeholder) def parse_audio(self, audio_url: str) -> None: - audio = get_and_parse_audio(audio_url) + audio = self._connector.fetch_audio(audio_url) placeholder = self._tracker.add("audio", audio) self._add_placeholder(placeholder) - def parse_input_audio(self, input_audio: Dict[str, str]) -> None: - input_audio_data = input_audio.get("data","") - input_audio_format = input_audio.get("format","") - audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}" - audio = get_and_parse_audio(audio_url) + def parse_input_audio(self, input_audio: InputAudio) -> None: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + audio_url = f"data:audio/{audio_format};base64,{audio_data}" - placeholder = self._tracker.add("audio", audio) - self._add_placeholder(placeholder) + return self.parse_audio(audio_url) def parse_video(self, video_url: str) -> None: - video = get_and_parse_video(video_url) + video = self._connector.fetch_video(video_url) placeholder = self._tracker.add("video", video) self._add_placeholder(placeholder) @@ -573,33 +559,31 @@ def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: super().__init__() self._tracker = tracker + self._connector = MediaConnector( + allowed_local_media_path=tracker.allowed_local_media_path, + ) def parse_image(self, image_url: str) -> None: - image_coro = async_get_and_parse_image( - image_url, - allowed_local_media_path=self._tracker._model_config. - allowed_local_media_path) + image_coro = self._connector.fetch_image_async(image_url) placeholder = self._tracker.add("image", image_coro) self._add_placeholder(placeholder) def parse_audio(self, audio_url: str) -> None: - audio_coro = async_get_and_parse_audio(audio_url) + audio_coro = self._connector.fetch_audio_async(audio_url) placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder(placeholder) - def parse_input_audio(self, input_audio: Dict[str, str]) -> None: - input_audio_data = input_audio.get("data","") - input_audio_format = input_audio.get("format","") - audio_url = f"data:audio/{input_audio_format};base64,{input_audio_data}" - audio_coro = async_get_and_parse_audio(audio_url) + def parse_input_audio(self, input_audio: InputAudio) -> None: + audio_data = input_audio.get("data", "") + audio_format = input_audio.get("format", "") + audio_url = f"data:audio/{audio_format};base64,{audio_data}" - placeholder = self._tracker.add("audio", audio_coro) - self._add_placeholder(placeholder) + return self.parse_audio(audio_url) def parse_video(self, video_url: str) -> None: - video = async_get_and_parse_video(video_url) + video = self._connector.fetch_video_async(video_url) placeholder = self._tracker.add("video", video) self._add_placeholder(placeholder) @@ -695,10 +679,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) +_ContentPart: TypeAlias = Union[str, Dict[str, str], InputAudio] + # Define a mapping from part types to their corresponding parsing functions. -MM_PARSER_MAP: Dict[str, - Callable[[ChatCompletionContentPartParam], - Union[str, Dict[str,str]]]] = { +MM_PARSER_MAP: Dict[ + str, + Callable[[ChatCompletionContentPartParam], _ContentPart], +] = { "text": lambda part: _TextParser(part).get("text", ""), "image_url": @@ -715,8 +702,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], def _parse_chat_message_content_mm_part( - part: ChatCompletionContentPartParam) -> Tuple[str, - Union[str, Dict[str, str]]]: + part: ChatCompletionContentPartParam) -> tuple[str, _ContentPart]: """ Parses a given multi-modal content part based on its type. @@ -783,7 +769,7 @@ def _parse_chat_message_content_parts( *, wrap_dicts: bool, ) -> List[ConversationMessage]: - content: List[Union[str, Dict[str, str]]] = [] + content = list[_ContentPart]() mm_parser = mm_tracker.create_parser() @@ -814,7 +800,7 @@ def _parse_chat_message_content_part( mm_parser: BaseMultiModalContentParser, *, wrap_dicts: bool, -) -> Optional[Union[str, Dict[str, str]]]: +) -> Optional[_ContentPart]: """Parses a single part of a conversation. If wrap_dicts is True, structured dictionary pieces for texts and images will be wrapped in dictionaries, i.e., {"type": "text", "text", ...} and @@ -823,8 +809,7 @@ def _parse_chat_message_content_part( with multimodal placeholders. """ if isinstance(part, str): # Handle plain text parts - text = _TextParser(part) - return text + return part # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) @@ -855,7 +840,7 @@ def _parse_chat_message_content_part( return {'type': 'audio'} if wrap_dicts else None if part_type == "input_audio": - dict_content = cast(Dict[str, str], content) + dict_content = cast(InputAudio, content) mm_parser.parse_input_audio(dict_content) return {'type': 'audio'} if wrap_dicts else None diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index ed3bb82bf0aaa..3e09ef1fcbb56 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,10 +1,14 @@ +import base64 +from io import BytesIO +from pathlib import Path + import numpy as np import numpy.typing as npt from vllm.inputs.registry import InputContext from vllm.utils import PlaceholderModule -from .base import MultiModalPlugin +from .base import MediaIO, MultiModalPlugin from .inputs import AudioItem, MultiModalData, MultiModalKwargs try: @@ -12,6 +16,11 @@ except ImportError: librosa = PlaceholderModule("librosa") # type: ignore[assignment] +try: + import soundfile +except ImportError: + soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] + class AudioPlugin(MultiModalPlugin): """Plugin for audio data.""" @@ -39,3 +48,28 @@ def resample_audio( target_sr: float, ) -> npt.NDArray[np.floating]: return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) + + +class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): + + def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]: + return librosa.load(BytesIO(data), sr=None) + + def load_base64( + self, + media_type: str, + data: str, + ) -> tuple[npt.NDArray, float]: + return self.load_bytes(base64.b64decode(data)) + + def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]: + return librosa.load(filepath, sr=None) + + def encode_base64(self, media: tuple[npt.NDArray, float]) -> str: + audio, sr = media + + with BytesIO() as buffer: + soundfile.write(buffer, audio, sr, format="WAV") + data = buffer.getvalue() + + return base64.b64encode(data).decode('utf-8') diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 1e5a46946c6c0..10488e24b30cc 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from collections import defaultdict -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, +from pathlib import Path +from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple, Optional, Sequence, Tuple, Type, TypeVar, Union) from torch import nn @@ -118,7 +119,7 @@ def map_input( self, model_config: "ModelConfig", data: MultiModalData[Any], - mm_processor_kwargs: Optional[Dict[str, Any]], + mm_processor_kwargs: Optional[dict[str, Any]], ) -> MultiModalKwargs: """ Transform the data into a dictionary of model inputs using the @@ -254,10 +255,10 @@ class MultiModalPlaceholderMap: """ class IndexMap(NamedTuple): - src: List[int] - dest: List[int] + src: list[int] + dest: list[int] - src_ranges: List[range] + src_ranges: list[range] """ The indices of the multi-modal embeddings that will replace the corresponding placeholder embeddings pointed to by ``dest_ranges``. @@ -268,7 +269,7 @@ class IndexMap(NamedTuple): The total number of flattened multi-modal embeddings. """ - dest_ranges: List[range] + dest_ranges: list[range] """ The indices of the placeholder embeddings that will be replaced by the multimodal embeddings. @@ -288,7 +289,7 @@ def __init__(self): @classmethod def from_seq_group( cls, seq_group: "SequenceGroupMetadata", positions: range - ) -> Tuple[Optional[MultiModalDataDict], Dict[str, + ) -> Tuple[Optional[MultiModalDataDict], dict[str, "MultiModalPlaceholderMap"]]: """ Returns the multi-modal items that intersect with the portion of a @@ -376,9 +377,9 @@ def from_seq_group( def append_items_from_seq_group( self, positions: range, - multi_modal_items: List[_T], + multi_modal_items: list[_T], multi_modal_placeholders: Sequence[PlaceholderRange], - ) -> List[_T]: + ) -> list[_T]: """ Adds the multi-modal items that intersect ```positions`` to this placeholder map and returns the intersecting items. @@ -454,3 +455,22 @@ def index_map(self) -> "IndexMap": return MultiModalPlaceholderMap.IndexMap(src=src_indices, dest=dest_indices) + + +class MediaIO(ABC, Generic[_T]): + + @abstractmethod + def load_bytes(self, data: bytes) -> _T: + raise NotImplementedError + + @abstractmethod + def load_base64(self, media_type: str, data: str) -> _T: + """ + List of media types: + https://www.iana.org/assignments/media-types/media-types.xhtml + """ + raise NotImplementedError + + @abstractmethod + def load_file(self, filepath: Path) -> _T: + raise NotImplementedError diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index c705e1a3d1554..14c79dfadec0c 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,4 +1,7 @@ +import base64 from functools import lru_cache +from io import BytesIO +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional import torch @@ -9,7 +12,7 @@ from vllm.transformers_utils.processor import get_image_processor from vllm.utils import is_list_of -from .base import MultiModalPlugin +from .base import MediaIO, MultiModalPlugin from .inputs import ImageItem, MultiModalData, MultiModalKwargs if TYPE_CHECKING: @@ -96,3 +99,39 @@ def rescale_image_size(image: Image.Image, if transpose >= 0: image = image.transpose(Image.Transpose(transpose)) return image + + +class ImageMediaIO(MediaIO[Image.Image]): + + def __init__(self, *, image_mode: str = "RGB") -> None: + super().__init__() + + self.image_mode = image_mode + + def load_bytes(self, data: bytes) -> Image.Image: + image = Image.open(BytesIO(data)) + image.load() + return image.convert(self.image_mode) + + def load_base64(self, media_type: str, data: str) -> Image.Image: + return self.load_bytes(base64.b64decode(data)) + + def load_file(self, filepath: Path) -> Image.Image: + image = Image.open(filepath) + image.load() + return image.convert(self.image_mode) + + def encode_base64( + self, + media: Image.Image, + *, + image_format: str = "JPEG", + ) -> str: + image = media + + with BytesIO() as buffer: + image = image.convert(self.image_mode) + image.save(buffer, image_format) + data = buffer.getvalue() + + return base64.b64encode(data).decode('utf-8') diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index a49da2bdee972..87b12a6fb33c1 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -1,8 +1,7 @@ -import base64 -import os from functools import lru_cache -from io import BytesIO -from typing import List, Optional, Tuple, TypeVar, Union +from pathlib import Path +from typing import Optional, TypeVar, Union +from urllib.parse import ParseResult, urlparse import numpy as np import numpy.typing as npt @@ -10,283 +9,246 @@ from PIL import Image import vllm.envs as envs -from vllm.connections import global_http_connection +from vllm.connections import HTTPConnection, global_http_connection from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer -from vllm.utils import PlaceholderModule -from .inputs import MultiModalDataDict, PlaceholderRange - -try: - import decord -except ImportError: - decord = PlaceholderModule("decord") # type: ignore[assignment] - -try: - import librosa -except ImportError: - librosa = PlaceholderModule("librosa") # type: ignore[assignment] - -try: - import soundfile -except ImportError: - soundfile = PlaceholderModule("soundfile") # type: ignore[assignment] +from .audio import AudioMediaIO +from .base import MediaIO +from .image import ImageMediaIO +from .inputs import PlaceholderRange +from .video import VideoMediaIO logger = init_logger(__name__) cached_get_tokenizer = lru_cache(get_tokenizer) +_M = TypeVar("_M") -def _load_image_from_bytes(b: bytes) -> Image.Image: - image = Image.open(BytesIO(b)) - image.load() - return image - - -def _is_subpath(image_path: str, allowed_local_media_path: str) -> bool: - # Get the common path - common_path = os.path.commonpath([ - os.path.abspath(image_path), - os.path.abspath(allowed_local_media_path) - ]) - # Check if the common path is the same as allowed_local_media_path - return common_path == os.path.abspath(allowed_local_media_path) +class MediaConnector: -def _load_image_from_file(image_url: str, - allowed_local_media_path: str) -> Image.Image: - if not allowed_local_media_path: - raise ValueError("Invalid 'image_url': Cannot load local files without" - "'--allowed-local-media-path'.") - if allowed_local_media_path: - if not os.path.exists(allowed_local_media_path): - raise ValueError( - "Invalid '--allowed-local-media-path': " - f"The path {allowed_local_media_path} does not exist.") - if not os.path.isdir(allowed_local_media_path): + def __init__( + self, + connection: HTTPConnection = global_http_connection, + *, + allowed_local_media_path: str = "", + ) -> None: + super().__init__() + + self.connection = connection + + if allowed_local_media_path: + allowed_local_media_path_ = Path(allowed_local_media_path) + + if not allowed_local_media_path_.exists(): + raise ValueError( + "Invalid `--allowed-local-media-path`: The path " + f"{allowed_local_media_path_} does not exist.") + if not allowed_local_media_path_.is_dir(): + raise ValueError( + "Invalid `--allowed-local-media-path`: The path " + f"{allowed_local_media_path_} must be a directory.") + else: + allowed_local_media_path_ = None + + self.allowed_local_media_path = allowed_local_media_path_ + + def _load_data_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + ) -> _M: + data_spec, data = url_spec.path.split(",", 1) + media_type, data_type = data_spec.split(";", 1) + + if data_type != "base64": + msg = "Only base64 data URLs are supported for now." + raise NotImplementedError(msg) + + return media_io.load_base64(media_type, data) + + def _load_file_url( + self, + url_spec: ParseResult, + media_io: MediaIO[_M], + ) -> _M: + allowed_local_media_path = self.allowed_local_media_path + if allowed_local_media_path is None: + raise RuntimeError("Cannot load local files without " + "`--allowed-local-media-path`.") + + filepath = Path(url_spec.path) + if allowed_local_media_path not in filepath.resolve().parents: raise ValueError( - "Invalid '--allowed-local-media-path': " - f"The path {allowed_local_media_path} must be a directory.") - - # Only split once and assume the second part is the image path - _, image_path = image_url.split("file://", 1) - if not _is_subpath(image_path, allowed_local_media_path): - raise ValueError( - f"Invalid 'image_url': The file path {image_path} must" - " be a subpath of '--allowed-local-media-path'" - f" '{allowed_local_media_path}'.") - - image = Image.open(image_path) - image.load() - return image + f"The file path {filepath} must be a subpath " + f"of `--allowed-local-media-path` {allowed_local_media_path}.") + return media_io.load_file(filepath) -def _load_image_from_data_url(image_url: str) -> Image.Image: - # Only split once and assume the second part is the base64 encoded image - _, image_base64 = image_url.split(",", 1) - return load_image_from_base64(image_base64) - - -def fetch_image(image_url: str, - *, - image_mode: str = "RGB", - allowed_local_media_path: str = "") -> Image.Image: - """ - Load a PIL image from a HTTP or base64 data URL. - - By default, the image is converted into RGB format. - """ - if image_url.startswith('http'): - image_raw = global_http_connection.get_bytes( - image_url, - timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, - ) - image = _load_image_from_bytes(image_raw) - - elif image_url.startswith('data:image'): - image = _load_image_from_data_url(image_url) - elif image_url.startswith('file://'): - image = _load_image_from_file(image_url, allowed_local_media_path) - else: - raise ValueError("Invalid 'image_url': A valid 'image_url' must start " - "with either 'data:image', 'file://' or 'http'.") - - return image.convert(image_mode) - - -async def async_fetch_image(image_url: str, - *, - image_mode: str = "RGB", - allowed_local_media_path: str = "") -> Image.Image: - """ - Asynchronously load a PIL image from a HTTP or base64 data URL. - - By default, the image is converted into RGB format. - """ - if image_url.startswith('http'): - image_raw = await global_http_connection.async_get_bytes( - image_url, - timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, - ) - image = _load_image_from_bytes(image_raw) - - elif image_url.startswith('data:image'): - image = _load_image_from_data_url(image_url) - elif image_url.startswith('file://'): - image = _load_image_from_file(image_url, allowed_local_media_path) - else: - raise ValueError("Invalid 'image_url': A valid 'image_url' must start " - "with either 'data:image', 'file://' or 'http'.") + def load_from_url( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = None, + ) -> _M: + url_spec = urlparse(url) - return image.convert(image_mode) + if url_spec.scheme.startswith("http"): + connection = self.connection + data = connection.get_bytes(url, timeout=fetch_timeout) + return media_io.load_bytes(data) -def _load_video_from_bytes(b: bytes, num_frames: int = 32) -> npt.NDArray: - video_path = BytesIO(b) - vr = decord.VideoReader(video_path, num_threads=1) - total_frame_num = len(vr) + if url_spec.scheme == "data": + return self._load_data_url(url_spec, media_io) - if total_frame_num > num_frames: - uniform_sampled_frames = np.linspace(0, - total_frame_num - 1, - num_frames, - dtype=int) - frame_idx = uniform_sampled_frames.tolist() - else: - frame_idx = [i for i in range(0, total_frame_num)] - frames = vr.get_batch(frame_idx).asnumpy() + if url_spec.scheme == "file": + return self._load_file_url(url_spec, media_io) - return frames + msg = "The URL must be either a HTTP, data or file URL." + raise ValueError(msg) + async def load_from_url_async( + self, + url: str, + media_io: MediaIO[_M], + *, + fetch_timeout: Optional[int] = None, + ) -> _M: + url_spec = urlparse(url) -def _load_video_from_data_url(video_url: str) -> npt.NDArray: - # Only split once and assume the second part is the base64 encoded video - _, video_base64 = video_url.split(",", 1) + if url_spec.scheme.startswith("http"): + connection = self.connection + data = await connection.async_get_bytes(url, timeout=fetch_timeout) - if video_url.startswith("data:video/jpeg;"): - return np.stack([ - np.array(load_image_from_base64(frame_base64)) - for frame_base64 in video_base64.split(",") - ]) + return media_io.load_bytes(data) - return load_video_from_base64(video_base64) + if url_spec.scheme == "data": + return self._load_data_url(url_spec, media_io) + if url_spec.scheme == "file": + return self._load_file_url(url_spec, media_io) -def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray: - """ - Load video from a HTTP or base64 data URL. - """ - if video_url.startswith('http') or video_url.startswith('https'): - video_raw = global_http_connection.get_bytes( - video_url, - timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, - ) - video = _load_video_from_bytes(video_raw, num_frames) - elif video_url.startswith('data:video'): - video = _load_video_from_data_url(video_url) - else: - raise ValueError("Invalid 'video_url': A valid 'video_url' must start " - "with either 'data:video' or 'http'.") - return video + msg = "The URL must be either a HTTP, data or file URL." + raise ValueError(msg) + def fetch_audio( + self, + audio_url: str, + ) -> tuple[np.ndarray, Union[int, float]]: + """ + Load audio from a URL. + """ + audio_io = AudioMediaIO() -async def async_fetch_video(video_url: str, - *, - num_frames: int = 32) -> npt.NDArray: - """ - Asynchronously load video from a HTTP or base64 data URL. - - By default, the image is converted into RGB format. - """ - if video_url.startswith('http') or video_url.startswith('https'): - video_raw = await global_http_connection.async_get_bytes( - video_url, - timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, - ) - video = _load_video_from_bytes(video_raw, num_frames) - elif video_url.startswith('data:video'): - video = _load_video_from_data_url(video_url) - else: - raise ValueError("Invalid 'video_url': A valid 'video_url' must start " - "with either 'data:video' or 'http'.") - return video - - -def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: - """ - Load audio from a URL. - """ - if audio_url.startswith("http"): - audio_bytes = global_http_connection.get_bytes( + return self.load_from_url( audio_url, - timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + audio_io, + fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, ) - elif audio_url.startswith("data:audio"): - _, audio_base64 = audio_url.split(",", 1) - audio_bytes = base64.b64decode(audio_base64) - else: - raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start " - "with either 'data:audio' or 'http'.") - - return librosa.load(BytesIO(audio_bytes), sr=None) + async def fetch_audio_async( + self, + audio_url: str, + ) -> tuple[np.ndarray, Union[int, float]]: + """ + Asynchronously fetch audio from a URL. + """ + audio_io = AudioMediaIO() -async def async_fetch_audio( - audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: - """ - Asynchronously fetch audio from a URL. - """ - if audio_url.startswith("http"): - audio_bytes = await global_http_connection.async_get_bytes( + return await self.load_from_url_async( audio_url, - timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + audio_io, + fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, ) - elif audio_url.startswith("data:audio"): - _, audio_base64 = audio_url.split(",", 1) - audio_bytes = base64.b64decode(audio_base64) - else: - raise ValueError("Invalid 'audio_url': A valid 'audio_url' must start " - "with either 'data:audio' or 'http'.") - - return librosa.load(BytesIO(audio_bytes), sr=None) + def fetch_image( + self, + image_url: str, + *, + image_mode: str = "RGB", + ) -> Image.Image: + """ + Load a PIL image from a HTTP or base64 data URL. -def get_and_parse_audio(audio_url: str) -> MultiModalDataDict: - audio, sr = fetch_audio(audio_url) - return {"audio": (audio, sr)} + By default, the image is converted into RGB format. + """ + image_io = ImageMediaIO(image_mode=image_mode) + return self.load_from_url( + image_url, + image_io, + fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) -def get_and_parse_image( + async def fetch_image_async( + self, image_url: str, *, - allowed_local_media_path: str = "") -> MultiModalDataDict: - image = fetch_image(image_url, - allowed_local_media_path=allowed_local_media_path) - return {"image": image} - + image_mode: str = "RGB", + ) -> Image.Image: + """ + Asynchronously load a PIL image from a HTTP or base64 data URL. -def get_and_parse_video(video_url: str) -> MultiModalDataDict: - video = fetch_video(video_url) - return {"video": video} + By default, the image is converted into RGB format. + """ + image_io = ImageMediaIO(image_mode=image_mode) + return await self.load_from_url_async( + image_url, + image_io, + fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) -async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict: - audio, sr = await async_fetch_audio(audio_url) - return {"audio": (audio, sr)} - + def fetch_video( + self, + video_url: str, + *, + image_mode: str = "RGB", + num_frames: int = 32, + ) -> npt.NDArray: + """ + Load video from a HTTP or base64 data URL. + """ + image_io = ImageMediaIO(image_mode=image_mode) + video_io = VideoMediaIO(image_io, num_frames=num_frames) + + return self.load_from_url( + video_url, + video_io, + fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) -async def async_get_and_parse_image( - image_url: str, + async def fetch_video_async( + self, + video_url: str, *, - allowed_local_media_path: str = "") -> MultiModalDataDict: - image = await async_fetch_image( - image_url, allowed_local_media_path=allowed_local_media_path) - return {"image": image} + image_mode: str = "RGB", + num_frames: int = 32, + ) -> npt.NDArray: + """ + Asynchronously load video from a HTTP or base64 data URL. + + By default, the image is converted into RGB format. + """ + image_io = ImageMediaIO(image_mode=image_mode) + video_io = VideoMediaIO(image_io, num_frames=num_frames) + + return await self.load_from_url_async( + video_url, + video_io, + fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) -async def async_get_and_parse_video(video_url: str) -> MultiModalDataDict: - video = await async_fetch_video(video_url) - return {"video": video} +global_media_connector = MediaConnector() +"""The global :class:`MediaConnector` instance used by vLLM.""" + +fetch_audio = global_media_connector.fetch_audio +fetch_image = global_media_connector.fetch_image +fetch_video = global_media_connector.fetch_video def encode_audio_base64( @@ -294,10 +256,8 @@ def encode_audio_base64( sampling_rate: int, ) -> str: """Encode audio as base64.""" - buffered = BytesIO() - soundfile.write(buffered, audio, sampling_rate, format="WAV") - - return base64.b64encode(buffered.getvalue()).decode('utf-8') + audio_io = AudioMediaIO() + return audio_io.encode_base64((audio, sampling_rate)) def encode_image_base64( @@ -311,29 +271,14 @@ def encode_image_base64( By default, the image is converted into RGB format before being encoded. """ - buffered = BytesIO() - image = image.convert(image_mode) - image.save(buffered, format) - return base64.b64encode(buffered.getvalue()).decode('utf-8') - - -def load_image_from_base64(image: Union[bytes, str]) -> Image.Image: - """Load image from base64 format.""" - return _load_image_from_bytes(base64.b64decode(image)) + image_io = ImageMediaIO(image_mode=image_mode) + return image_io.encode_base64(image, image_format=format) def encode_video_base64(frames: npt.NDArray) -> str: - base64_frames = [] - frames_list = [frames[i] for i in range(frames.shape[0])] - for frame in frames_list: - img_base64 = encode_image_base64(Image.fromarray(frame)) - base64_frames.append(img_base64) - return ",".join(base64_frames) - - -def load_video_from_base64(video: Union[bytes, str]) -> npt.NDArray: - """Load video from base64 format.""" - return _load_video_from_bytes(base64.b64decode(video)) + image_io = ImageMediaIO() + video_io = VideoMediaIO(image_io) + return video_io.encode_base64(frames) def resolve_visual_encoder_outputs( @@ -389,7 +334,7 @@ def repeat_and_pad_token( repeat_count: int = 1, pad_token_left: Optional[_T] = None, pad_token_right: Optional[_T] = None, -) -> List[_T]: +) -> list[_T]: replacement = [token] * repeat_count if pad_token_left is not None: replacement = [pad_token_left] + replacement @@ -402,13 +347,13 @@ def repeat_and_pad_token( def repeat_and_pad_placeholder_tokens( tokenizer: AnyTokenizer, prompt: Optional[str], - prompt_token_ids: List[int], + prompt_token_ids: list[int], *, placeholder_token_id: int, - repeat_count: Union[int, List[int]], + repeat_count: Union[int, list[int]], pad_token_left: Optional[int] = None, pad_token_right: Optional[int] = None, -) -> Tuple[Optional[str], List[int], List[PlaceholderRange]]: +) -> tuple[Optional[str], list[int], list[PlaceholderRange]]: if isinstance(repeat_count, int): repeat_count = [repeat_count] @@ -450,8 +395,8 @@ def repeat_and_pad_placeholder_tokens( new_prompt += prompt_parts[i] + replacement_str new_prompt += prompt_parts[-1] - new_token_ids: List[int] = [] - placeholder_ranges: List[PlaceholderRange] = [] + new_token_ids = list[int]() + placeholder_ranges = list[PlaceholderRange]() placeholder_token_idx = 0 for i, token in enumerate(prompt_token_ids): if token == placeholder_token_id: @@ -481,7 +426,7 @@ def repeat_and_pad_placeholder_tokens( def consecutive_placeholder_ranges( num_items: int, item_size: int, - initial_offset: int = 0) -> List[PlaceholderRange]: + initial_offset: int = 0) -> list[PlaceholderRange]: """Returns a list of consecutive PlaceholderRanges of a fixed size""" return [ diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index c4be100562703..b7d43c830cc46 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,23 +1,32 @@ -from functools import lru_cache +import base64 +from functools import lru_cache, partial +from io import BytesIO +from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, Optional import cv2 import numpy as np import numpy.typing as npt +from PIL import Image from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.processor import get_video_processor from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import is_list_of +from vllm.utils import PlaceholderModule, is_list_of -from .base import MultiModalData -from .image import ImagePlugin +from .base import MediaIO, MultiModalData +from .image import ImageMediaIO, ImagePlugin from .inputs import MultiModalKwargs, VideoItem if TYPE_CHECKING: from vllm.config import ModelConfig +try: + import decord +except ImportError: + decord = PlaceholderModule("decord") # type: ignore[assignment] + logger = init_logger(__name__) cached_get_video_processor = lru_cache(get_video_processor) @@ -107,3 +116,73 @@ def sample_frames_from_video(frames: npt.NDArray, frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int) sampled_frames = frames[frame_indices, ...] return sampled_frames + + +class VideoMediaIO(MediaIO[npt.NDArray]): + + def __init__( + self, + image_io: ImageMediaIO, + *, + num_frames: int = 32, + ) -> None: + super().__init__() + + self.image_io = image_io + self.num_frames = num_frames + + def load_bytes(self, data: bytes) -> npt.NDArray: + vr = decord.VideoReader(BytesIO(data), num_threads=1) + total_frame_num = len(vr) + + num_frames = self.num_frames + if total_frame_num > num_frames: + uniform_sampled_frames = np.linspace(0, + total_frame_num - 1, + num_frames, + dtype=int) + frame_idx = uniform_sampled_frames.tolist() + else: + frame_idx = list(range(0, total_frame_num)) + + return vr.get_batch(frame_idx).asnumpy() + + def load_base64(self, media_type: str, data: str) -> npt.NDArray: + if media_type.lower() == "video/jpeg": + load_frame = partial( + self.image_io.load_base64, + "image/jpeg", + ) + + return np.stack([ + np.array(load_frame(frame_data)) + for frame_data in data.split(",") + ]) + + return self.load_bytes(base64.b64decode(data)) + + def load_file(self, filepath: Path) -> npt.NDArray: + with filepath.open("rb") as f: + data = f.read() + + return self.load_bytes(data) + + def encode_base64( + self, + media: npt.NDArray, + *, + video_format: str = "JPEG", + ) -> str: + video = media + + if video_format == "JPEG": + encode_frame = partial( + self.image_io.encode_base64, + image_format=video_format, + ) + + return ",".join( + encode_frame(Image.fromarray(frame)) for frame in video) + + msg = "Only JPEG format is supported for now." + raise NotImplementedError(msg) From 5ce4627a7ec4cf4e19ff4be7f030883ef486393f Mon Sep 17 00:00:00 2001 From: Chen1022 <112855051+ccjincong@users.noreply.github.com> Date: Fri, 27 Dec 2024 21:05:10 +0800 Subject: [PATCH 005/462] [Doc] Add xgrammar in doc (#11549) Signed-off-by: ccjincong --- docs/source/usage/structured_outputs.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md index 3f5d9ffc26278..7292012e36a26 100644 --- a/docs/source/usage/structured_outputs.md +++ b/docs/source/usage/structured_outputs.md @@ -2,7 +2,7 @@ # Structured Outputs -vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines) or [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer) as backends for the guided decoding. +vLLM supports the generation of structured outputs using [outlines](https://github.com/dottxt-ai/outlines), [lm-format-enforcer](https://github.com/noamgat/lm-format-enforcer), or [xgrammar](https://github.com/mlc-ai/xgrammar) as backends for the guided decoding. This document shows you some examples of the different options that are available to generate structured outputs. ## Online Inference (OpenAI API) From 101418096ffe3c83b6d541e1303b10e9d5e03861 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 28 Dec 2024 01:22:48 +0800 Subject: [PATCH 006/462] [VLM] Support caching in merged multi-modal processor (#11396) Signed-off-by: DarkLight1337 --- docs/source/conf.py | 3 +- .../design/multimodal/multimodal_index.md | 24 +- docs/source/models/supported_models.md | 3 +- .../openai/test_vision_embedding.py | 4 +- .../mm_processor_kwargs/test_qwen2_vl.py | 2 +- .../vision_language/test_models.py | 4 +- tests/multimodal/test_processing.py | 209 ++++++- vllm/inputs/registry.py | 22 +- vllm/model_executor/models/llava.py | 178 +++--- vllm/model_executor/models/phi3v.py | 107 +++- vllm/model_executor/models/qwen.py | 4 +- vllm/model_executor/models/qwen2_audio.py | 65 ++- vllm/model_executor/models/qwen2_vl.py | 115 ++-- vllm/model_executor/models/ultravox.py | 76 ++- vllm/multimodal/base.py | 44 +- vllm/multimodal/inputs.py | 438 ++++++++++++++- vllm/multimodal/processing.py | 516 ++++++++++++------ vllm/multimodal/registry.py | 50 +- vllm/transformers_utils/processor.py | 12 +- vllm/utils.py | 27 +- 20 files changed, 1455 insertions(+), 448 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 1fe0474631140..71394c5302a39 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -191,6 +191,7 @@ def linkcode_resolve(domain, info): # Mock out external dependencies here, otherwise the autodoc pages may be blank. autodoc_mock_imports = [ + "blake3", "compressed_tensors", "cpuinfo", "cv2", @@ -207,7 +208,7 @@ def linkcode_resolve(domain, info): "tensorizer", "pynvml", "outlines", - "xgrammar," + "xgrammar", "librosa", "soundfile", "gguf", diff --git a/docs/source/design/multimodal/multimodal_index.md b/docs/source/design/multimodal/multimodal_index.md index 88af07afc7018..e4f2171e84ff7 100644 --- a/docs/source/design/multimodal/multimodal_index.md +++ b/docs/source/design/multimodal/multimodal_index.md @@ -45,39 +45,39 @@ adding_multimodal_plugin ### Base Classes ```{eval-rst} -.. autodata:: vllm.multimodal.NestedTensors +.. automodule:: vllm.multimodal.base + :members: + :show-inheritance: ``` -```{eval-rst} -.. autodata:: vllm.multimodal.BatchedTensorInputs -``` +### Input Classes ```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalDataBuiltins +.. automodule:: vllm.multimodal.inputs :members: :show-inheritance: ``` -```{eval-rst} -.. autodata:: vllm.multimodal.MultiModalDataDict -``` +### Audio Classes ```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalKwargs +.. automodule:: vllm.multimodal.audio :members: :show-inheritance: ``` +### Image Classes + ```{eval-rst} -.. autoclass:: vllm.multimodal.MultiModalPlugin +.. automodule:: vllm.multimodal.image :members: :show-inheritance: ``` -### Image Classes +### Video Classes ```{eval-rst} -.. automodule:: vllm.multimodal.image +.. automodule:: vllm.multimodal.video :members: :show-inheritance: ``` diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 95add0d71bbab..7acafda50793c 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -755,8 +755,7 @@ vLLM currently only supports adding LoRA to the language backbone of multimodal ``` ```{note} -To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have to install their GitHub repo ({code}`pip install git+https://github.com/TIGER-AI-Lab/Mantis.git`) -and pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. +To use {code}`TIGER-Lab/Mantis-8B-siglip-llama3`, you have pass {code}`--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM. ``` ```{note} diff --git a/tests/entrypoints/openai/test_vision_embedding.py b/tests/entrypoints/openai/test_vision_embedding.py index 3731b2dcdeae1..c851539c610ec 100644 --- a/tests/entrypoints/openai/test_vision_embedding.py +++ b/tests/entrypoints/openai/test_vision_embedding.py @@ -91,5 +91,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str, assert len(embeddings.data) == 1 assert len(embeddings.data[0].embedding) == 3072 assert embeddings.usage.completion_tokens == 0 - assert embeddings.usage.prompt_tokens == 765 - assert embeddings.usage.total_tokens == 765 + assert embeddings.usage.prompt_tokens == 764 + assert embeddings.usage.total_tokens == 764 diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py index cd8954ffc48c2..5897c04c89e19 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -30,7 +30,7 @@ def get_max_qwen2_vl_image_tokens(): @pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ - ({}, 1225), + ({}, 16384), ({ MIN_PIXELS: 64**2, MAX_PIXELS: 512**2 diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 3101d1d2ea831..1a9c1b4ef1be0 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -201,6 +201,7 @@ vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[large_gpu_mark(min_gb=48)], ), "glm4": VLMTestInfo( models=["THUDM/glm-4v-9b"], @@ -212,7 +213,7 @@ dtype="bfloat16", get_stop_token_ids=lambda tok: [151329, 151336, 151338], patch_hf_runner=model_utils.glm_patch_hf_runner, - marks=[large_gpu_mark(min_gb=48)], + marks=[large_gpu_mark(min_gb=32)], ), "h2ovl": VLMTestInfo( models = [ @@ -261,6 +262,7 @@ dtype="bfloat16", use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, + marks=[large_gpu_mark(min_gb=32)], ), "llava_next": VLMTestInfo( models=["llava-hf/llava-v1.6-mistral-7b-hf"], diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index d22d778f81fa8..1b2847ed0f534 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,12 +1,20 @@ +from functools import partial from typing import cast +import numpy as np import pytest - -from vllm.multimodal.processing import (PromptReplacement, _PlaceholderInfo, - find_text_matches, find_token_matches, - iter_placeholders, iter_token_matches, +from PIL import Image + +from vllm.config import ModelConfig +from vllm.inputs import InputProcessingContext +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, + _PlaceholderInfo, find_text_matches, + find_token_matches, iter_placeholders, + iter_token_matches, replace_text_matches, replace_token_matches) +from vllm.multimodal.utils import cached_get_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import full_groupby @@ -457,6 +465,7 @@ def test_find_replace_tokens( ), ] ) +# yapf: enable def test_iter_placeholders( repl_by_key, prompt, @@ -475,11 +484,199 @@ def test_iter_placeholders( prompt_repls, prompt, # Effectively match all occurrences in the prompt - {key: 3 for key in repl_by_key}, - )) + {key: 3 + for key in repl_by_key}, + )) # Only displayed on error print("result:", result) # Manually constructed results assert result == expected + + +def _rand_img(rng: np.random.RandomState, min_wh: int, max_wh: int): + w, h = rng.randint(min_wh, max_wh, size=(2, )) + arr = rng.randint(0, 255, size=(w, h, 3), dtype=np.uint8) + return Image.fromarray(arr) + + +def _rand_video( + rng: np.random.RandomState, + min_frames: int, + max_frames: int, + min_wh: int, + max_wh: int, +): + # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 + num_frames = rng.randint(min_frames, max_frames) + num_frames = (num_frames // 2) * 2 + + w, h = rng.randint(min_wh, max_wh, size=(2, )) + return rng.randint(0, 255, size=(num_frames, w, h, 3), dtype=np.uint8) + + +def _rand_audio( + rng: np.random.RandomState, + min_len: int, + max_len: int, + sr: int, +): + audio_len = rng.randint(min_len, max_len) + return rng.rand(audio_len), sr + + +def _test_processing_cache_correctness( + model_id: str, + modalities: set[str], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3": + hf_overrides = {"architectures": ["MantisForConditionalGeneration"]} + else: + hf_overrides = {} + + model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=True, + seed=0, + dtype="float16", + revision=None, + hf_overrides=hf_overrides, + ) + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + + processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + # Ensure that it can fit all of the data + cache = ProcessingCache(capacity=1 << 30) + + baseline_processor = processor_factory(ctx, cache=None) + cached_processor = processor_factory(ctx, cache=cache) + + rng = np.random.RandomState(0) + + input_to_hit = { + "image": Image.new("RGB", size=(128, 128)), + "video": np.zeros((4, 128, 128, 3), dtype=np.uint8), + "audio": (np.zeros((512, )), 16000), + } + input_factory = { + "image": + partial(_rand_img, rng, min_wh=128, max_wh=256), + "video": + partial(_rand_video, + rng, + min_frames=2, + max_frames=8, + min_wh=128, + max_wh=256), + "audio": + partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000), + } + input_max_count = { + "image": 3, + "video": 3, + "audio": 3, + } + + for batch_idx in range(num_batches): + mm_data = { + k: + [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) + for _ in range(rng.randint(input_max_count[k]))] + for k in modalities + } + + mm_counts = {k: len(vs) for k, vs in mm_data.items()} + prompt = baseline_processor._get_dummy_mm_inputs(mm_counts).prompt_text + + # Drop unnecessary keys and test single -> multi conversion + if rng.rand() < simplify_rate: + for k in list(mm_data.keys()): + if not mm_data[k]: + del mm_data[k] + elif len(mm_data[k]) == 1: + mm_data[k] = mm_data[k][0] + + baseline_result = baseline_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + cached_result = cached_processor.apply( + prompt, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + assert baseline_result == cached_result, ( + f"Failed ({batch_idx=}, {mm_data=})") + + +# yapf: disable +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("llava-hf/llava-1.5-7b-hf", {"image"}), + ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}), + ("mistral-community/pixtral-12b", {"image"}), + ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}), + ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}), + ("fixie-ai/ultravox-v0_3", {"audio"}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_cache_correctness( + model_id: str, + modalities: set[str], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + _test_processing_cache_correctness( + model_id, + modalities, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) + + +# yapf: disable +@pytest.mark.parametrize(("model_id", "modalities"), [ + ("microsoft/Phi-3-vision-128k-instruct", {"image"}), +]) +@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) +@pytest.mark.parametrize("num_batches", [32]) +@pytest.mark.parametrize("simplify_rate", [1.0]) +# yapf: enable +def test_processing_cache_correctness_phi3v( + model_id: str, + modalities: set[str], + hit_rate: float, + num_batches: int, + simplify_rate: float, +): + # HACK - this is an attempted workaround for the following bug + # https://github.com/huggingface/transformers/issues/34307 + from transformers import AutoImageProcessor # noqa: F401 + from transformers import AutoProcessor # noqa: F401 + + AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True) + + _test_processing_cache_correctness( + model_id, + modalities, + hit_rate=hit_rate, + num_batches=num_batches, + simplify_rate=simplify_rate, + ) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index f3ec9d115c9ba..46346b08e99c2 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -99,6 +99,9 @@ def get_hf_processor( merged_kwargs = {**base_kwargs, **kwargs} + if isinstance(typ, type): + merged_kwargs["processor_cls"] = typ + hf_processor = cached_get_processor( self.model_config.model, trust_remote_code=self.model_config.trust_remote_code, @@ -132,10 +135,13 @@ def get_hf_processor( def call_hf_processor( self, hf_processor: ProcessorMixin, - prompt: str, - processor_data: Mapping[str, object], - inference_kwargs: Mapping[str, object], + data: Mapping[str, object], + kwargs: Mapping[str, object] = {}, ) -> BatchFeature: + """ + Call :code:`hf_processor` on the prompt :code:`data` + (text, image, audio...) with configurable options :code:`kwargs`. + """ assert callable(hf_processor) base_kwargs = self.model_config.mm_processor_kwargs @@ -144,21 +150,15 @@ def call_hf_processor( merged_kwargs = resolve_mm_processor_kwargs( base_kwargs, - inference_kwargs, + kwargs, hf_processor, requires_kw_only=False, allow_var_kwargs=True, ) try: - return hf_processor( - text=prompt, - **processor_data, - **merged_kwargs, - return_tensors="pt", - ) + return hf_processor(**data, **merged_kwargs, return_tensors="pt") except Exception as exc: - data = dict(text=prompt, **processor_data) msg = (f"Failed to apply {type(hf_processor).__name__} " f"on data={data} with kwargs={merged_kwargs}") diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 0662d90e79b92..0ecba5a1cae0f 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,5 +1,4 @@ from functools import cached_property -from types import MethodType from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, Union) @@ -7,7 +6,7 @@ import torch.nn as nn from transformers import (BatchFeature, CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, PretrainedConfig, - ProcessorMixin, SiglipVisionConfig) + SiglipVisionConfig) from transformers.models.llava import LlavaProcessor from transformers.models.pixtral import PixtralProcessor @@ -21,10 +20,12 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems, + MultiModalFieldConfig, MultiModalInputsV2, + MultiModalKwargs, NestedTensors) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + ProcessorInputs, PromptReplacement, + full_groupby_modality) from vllm.sequence import IntermediateTensors from .clip import (CLIPVisionModel, dummy_image_for_clip, @@ -116,36 +117,54 @@ def get_max_llava_image_tokens(ctx: InputContext): class LlavaMultiModalProcessor(BaseMultiModalProcessor): - def _patch_pixtral_processor(self, hf_processor: PixtralProcessor): - if getattr(hf_processor, "__is_patched__", False): - return # Already patched - - image_processor = hf_processor.image_processor # type: ignore - orig_preprocess = image_processor.preprocess + def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]: + return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor)) - def preprocess(__self, *args, **kwargs): - hf_inputs = orig_preprocess(*args, **kwargs) - hf_inputs["is_pixtral"] = torch.tensor(True) - return hf_inputs + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) - image_processor.preprocess = MethodType(preprocess, image_processor) + # NOTE: pixel_values=None for MLlavaProcessor + pixel_values = processed_outputs.get("pixel_values") + if pixel_values is not None: + images = mm_data["images"] + assert isinstance(images, list) - hf_processor.__is_patched__ = True # type: ignore + if isinstance(self._get_hf_processor(), PixtralProcessor): + # Original output: (1, num_images, C, H, W) + # New output: (num_images, C, H, W) + assert (isinstance(pixel_values, list) + and len(pixel_values) == 1 + and isinstance(pixel_values[0], list) + and len(pixel_values[0]) == len(images)) - def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]: - hf_processor = self.ctx.get_hf_processor( - (LlavaProcessor, PixtralProcessor)) + processed_outputs["pixel_values"] = pixel_values[0] - if isinstance(hf_processor, PixtralProcessor): - self._patch_pixtral_processor(hf_processor) + return processed_outputs - return hf_processor + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_config = self.ctx.get_hf_config(LlavaConfig) image_token_id = hf_config.image_token_index @@ -200,7 +219,7 @@ def _get_dummy_mm_inputs( ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config(LlavaConfig) vision_config = hf_config.vision_config - num_images = mm_counts["image"] + num_images = mm_counts.get("image", 0) if isinstance(vision_config, CLIPVisionConfig): data = dummy_image_for_clip(vision_config, num_images) @@ -218,7 +237,6 @@ def _get_dummy_mm_inputs( return ProcessorInputs( prompt_text=image_token * num_images, mm_data=data, - mm_processor_kwargs={}, ) @@ -379,7 +397,6 @@ def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) - is_pixtral = kwargs.pop("is_pixtral", torch.tensor([False])) image_embeds = kwargs.pop("image_embeds", None) if pixel_values is None and image_embeds is None: @@ -390,33 +407,6 @@ def _parse_and_validate_image_input( raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") - assert isinstance(is_pixtral, torch.Tensor) - if is_pixtral.any(): - images = pixel_values - - def flatten_to_3d_tensors(item): - if isinstance(item, torch.Tensor): - if item.dim() >= 3: - return [t for t in item.view(-1, *item.shape[-3:])] - else: - raise ValueError( - f"Unexpected tensor dimension: {item.dim()}") - elif isinstance(item, list): - return [ - t for subitem in item - for t in flatten_to_3d_tensors(subitem) - ] - else: - raise ValueError(f"Unexpected type: {type(item)}") - - # Restructure the batched images into a list of lists of images - images = flatten_to_3d_tensors(pixel_values) - - return LlavaImagePixelInputs( - type="pixel_values", - data=images, - ) - return LlavaImagePixelInputs( type="pixel_values", data=self._validate_pixel_values( @@ -586,19 +576,71 @@ def load_weights(self, weights: Iterable[Tuple[str, class MantisMultiModalProcessor(LlavaMultiModalProcessor): - def _get_hf_processor(self) -> ProcessorMixin: - try: - from mantis.models.mllava import MLlavaProcessor - except ModuleNotFoundError as exc: - raise ModuleNotFoundError( - "You need to `pip install " - "git+https://github.com/TIGER-AI-Lab/Mantis.git` " - "to use this model") from exc - - processor = MLlavaProcessor.from_pretrained( - self.ctx.model_config.tokenizer) - assert isinstance(processor, ProcessorMixin) - return processor + def _get_hf_processor(self): + return self.ctx.get_hf_processor(LlavaProcessor) + + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + hf_config = self.ctx.get_hf_config(LlavaConfig) + image_token_id = hf_config.image_token_index + max_image_tokens = get_max_llava_image_tokens(self.ctx) + + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + mm_items = self._get_mm_items(mm_data) + mm_item_counts = mm_items.get_item_counts() + mm_kwargs = result["mm_kwargs"] + + # We reimplement the functionality of MLlavaProcessor from + # https://github.com/TIGER-AI-Lab/Mantis.git + def get_replacement_mantis(item_idx: int): + return "".join([ + f"(image {item_idx+1}: ", # 7 tokens + "" * max_image_tokens, + ")", # 3 tokens + ]) + + mantis_repls = self._bind_prompt_replacements([ + PromptReplacement( + modality="image", + target=[image_token_id] * max_image_tokens, + replacement=get_replacement_mantis, + ) + ]) + + prompt_ids, prompt_text, _ = self._apply_prompt_replacements( + result["prompt_token_ids"], + mantis_repls, + mm_item_counts, + ) + + unbound_orig_repls = self._get_prompt_replacements( + mm_items, + hf_processor_mm_kwargs, + mm_kwargs, + ) + orig_repls = self._bind_prompt_replacements(unbound_orig_repls) + + all_placeholders = self._find_placeholders(orig_repls, prompt_ids, + mm_item_counts) + assert len(all_placeholders) == mm_item_counts.get("image", 0) + + mm_placeholders = { + modality: [item.to_range() for item in items] + for modality, items in full_groupby_modality(all_placeholders) + } + + return MultiModalInputsV2( + type="multimodal", + prompt=prompt_text, + prompt_token_ids=prompt_ids, + mm_kwargs=mm_kwargs, + mm_placeholders=mm_placeholders, + ) # To use this model, please use diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4e2e7f5761544..fefa9fd62d1d0 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -12,9 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union import torch import torch.nn as nn @@ -32,10 +32,14 @@ from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems, + MultiModalFieldConfig, MultiModalInputsV2, + MultiModalKwargs, NestedTensors, + PlaceholderRange) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + ProcessorInputs, PromptReplacement, + _BoundPromptReplacement, + _PlaceholderInfo) from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -306,11 +310,11 @@ def get_max_phi3v_image_tokens( *, num_crops: Optional[int] = None, ) -> int: - mm_processor_kwargs = {} + hf_processor_mm_kwargs = {} if num_crops: - mm_processor_kwargs["num_crops"] = num_crops + hf_processor_mm_kwargs["num_crops"] = num_crops - processor = ctx.get_hf_processor(**mm_processor_kwargs) + processor = ctx.get_hf_processor(**hf_processor_mm_kwargs) return processor.calc_num_image_tokens_from_image_size( width=MAX_IMAGE_FEATURE_SIZE_WIDTH, @@ -331,39 +335,50 @@ def _get_hf_processor( def _call_hf_processor( self, - hf_processor: ProcessorMixin, prompt: str, - processor_data: Mapping[str, object], - mm_processor_kwargs: Mapping[str, object], + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], ) -> BatchFeature: processed_outputs = super()._call_hf_processor( - hf_processor, prompt=prompt, - processor_data=processor_data, - mm_processor_kwargs=mm_processor_kwargs, + mm_data=mm_data, + mm_kwargs=mm_kwargs, ) + input_ids = processed_outputs["input_ids"] + assert isinstance(input_ids, torch.Tensor) + # Phi3v processor has inserted -1, -2 etc as placeholder in prompt_ids, # which will cause OverflowError when decoding the prompt_ids. # Therefore, we need to do an early replacement here - token_ids = processed_outputs['input_ids'] - token_ids[token_ids < 0] = _IMAGE_TOKEN_ID - processed_outputs['input_ids'] = token_ids + input_ids.masked_fill_(input_ids < 0, _IMAGE_TOKEN_ID) return processed_outputs + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() image_tokens: list[str] = hf_processor.img_tokens # type: ignore image_processor = hf_processor.image_processor # type: ignore - mm_config = self.ctx.get_mm_config() - max_images = mm_config.limit_per_prompt.get("image", 1) + tokenizer = self._get_tokenizer() + bos_token_id = tokenizer.bos_token_id + assert isinstance(bos_token_id, int) def get_replacement_phi3v(item_idx: int): image_size = mm_items.get_image_size(item_idx) @@ -372,21 +387,44 @@ def get_replacement_phi3v(item_idx: int): height=image_size.height, ) - return [_IMAGE_TOKEN_ID] * num_tokens + return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id] return [ PromptReplacement( modality="image", target=image_token, replacement=get_replacement_phi3v, - ) for image_token in image_tokens[:max_images] + ) for image_token in image_tokens[:len(mm_items.images)] ] + def _apply_prompt_replacements( + self, + token_ids: list[int], + prompt_repls: Sequence[_BoundPromptReplacement], + mm_item_counts: Mapping[str, int], + ) -> tuple[list[int], str, list[_PlaceholderInfo]]: + token_ids, text, placeholders = super()._apply_prompt_replacements( + token_ids=token_ids, + prompt_repls=prompt_repls, + mm_item_counts=mm_item_counts, + ) + + # Keep the behavior in line with HF processor + if text.startswith(" <|image|>"): + text = text.replace(" <|image|>", "<|image|>", 1) + token_ids = [token_ids[0], *token_ids[2:]] + placeholders = [ + _PlaceholderInfo(p.modality, p.start_idx - 1, p.replacement) + for p in placeholders + ] + + return token_ids, text, placeholders + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - num_images = mm_counts["image"] + num_images = mm_counts.get("image", 0) data = dummy_image_for_clip( CLIP_VIT_LARGE_PATCH14_336_CONFIG, @@ -401,9 +439,28 @@ def _get_dummy_mm_inputs( return ProcessorInputs( prompt_text="".join(image_tokens[:num_images]), mm_data=data, - mm_processor_kwargs={}, ) + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only <|image|> tokens should be considered as placeholders, + # so we ignore the trailing bos_token_id + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 63d1374ab4092..baf955f6b515d 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -225,7 +225,7 @@ def __init__( d_model: int, n_head: int, mlp_ratio: float = 4.0, - norm_layer: Callable = nn.LayerNorm, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -266,7 +266,7 @@ def __init__( layers: int, heads: int, mlp_ratio: float = 4.0, - norm_layer: Callable = nn.LayerNorm, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 6259166a7fc57..25a351bd9c656 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -26,7 +26,7 @@ import numpy as np import torch import torch.nn as nn -from transformers import BatchFeature, ProcessorMixin +from transformers import BatchFeature from transformers.models.qwen2_audio import (Qwen2AudioConfig, Qwen2AudioEncoder, Qwen2AudioProcessor) @@ -38,10 +38,10 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig, + MultiModalKwargs, NestedTensors) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -73,7 +73,7 @@ def forward(self, audio_features): # From Qwen2AudioEncoder._get_feat_extract_output_lengths -def _get_feat_extract_output_lengths(input_lengths: torch.LongTensor): +def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): feat_lengths = (input_lengths - 1) // 2 + 1 output_lengths = (feat_lengths - 2) // 2 + 1 return feat_lengths, output_lengths @@ -88,13 +88,18 @@ def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int: class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): - def _get_hf_processor(self) -> Qwen2AudioProcessor: + def _get_hf_processor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> Qwen2AudioProcessor: return self.ctx.get_hf_processor(Qwen2AudioProcessor) def _get_feature_extractor(self) -> WhisperFeatureExtractor: return self._get_hf_processor().feature_extractor # type: ignore - def _get_processor_data( + def _get_hf_mm_data( self, mm_items: MultiModalDataItems, ) -> tuple[dict[str, Any], dict[str, Any]]: @@ -102,50 +107,61 @@ def _get_processor_data( feature_extractor = self._get_feature_extractor() mm_items.resample_audios(feature_extractor.sampling_rate) - return super()._get_processor_data(mm_items) + return super()._get_hf_mm_data(mm_items) def _call_hf_processor( self, - hf_processor: ProcessorMixin, prompt: str, - processor_data: Mapping[str, object], - mm_processor_kwargs: Mapping[str, object], + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], ) -> BatchFeature: - processor_data = dict(processor_data) - audios = processor_data.pop("audios", []) + mm_data = dict(mm_data) + audios = mm_data.pop("audios", []) if audios: - processor_data["audios"] = audios + mm_data["audios"] = audios feature_extractor = self._get_feature_extractor() - mm_processor_kwargs = dict( - **mm_processor_kwargs, + mm_kwargs = dict( + **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, ) else: # NOTE: WhisperFeatureExtractor cannot handle empty list of audios pass - return super()._call_hf_processor( - hf_processor, + processed_outputs = super()._call_hf_processor( prompt=prompt, - processor_data=processor_data, - mm_processor_kwargs=mm_processor_kwargs, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + input_features=MultiModalFieldConfig.batched("audio"), + feature_attention_mask=MultiModalFieldConfig.batched("audio"), ) def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) placeholder = hf_config.audio_token_index - feature_attention_mask = hf_inputs.get("feature_attention_mask") + feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") if feature_attention_mask is None: audio_output_lengths = [] else: + assert isinstance(feature_attention_mask, torch.Tensor) _, audio_output_lengths = _get_feat_extract_output_lengths( feature_attention_mask.sum(-1)) @@ -168,14 +184,13 @@ def _get_dummy_mm_inputs( sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate - audio_count = mm_counts["audio"] + audio_count = mm_counts.get("audio", 0) audio = np.zeros(audio_len) data = {"audio": [audio] * audio_count} return ProcessorInputs( prompt_text="<|AUDIO|>" * audio_count, mm_data=data, - mm_processor_kwargs={}, ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index fb97eb1916002..574845ef5a525 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -22,9 +22,10 @@ # limitations under the License. """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from functools import cached_property, partial -from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, Type, TypedDict, Union) +from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, + Set, Tuple, Type, TypedDict, Union) +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -54,10 +55,11 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalDataDict, NestedTensors +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems, + MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + ProcessorInputs, PromptReplacement) from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -229,9 +231,9 @@ class Qwen2VisionAttention(nn.Module): def __init__( self, - embed_dim: Optional[int] = None, - num_heads: Optional[int] = None, - projection_size: Optional[int] = None, + embed_dim: int, + num_heads: int, + projection_size: int, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: @@ -264,7 +266,7 @@ def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, - rotary_pos_emb: torch.Tensor = None, + rotary_pos_emb: torch.Tensor, ) -> torch.Tensor: # [s, b, c] --> [s, b, head * 3 * head_dim] x, _ = self.qkv(x) @@ -347,7 +349,7 @@ def __init__( num_heads: int, mlp_ratio: float, act_layer: Type[nn.Module] = QuickGELU, - norm_layer: Type[nn.Module] = None, + norm_layer: Optional[Callable[[int], nn.Module]] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", ) -> None: @@ -384,7 +386,7 @@ def __init__( self, patch_size: int = 14, temporal_patch_size: int = 2, - in_chans: int = 3, + in_channels: int = 3, embed_dim: int = 1152, ) -> None: super().__init__() @@ -392,8 +394,8 @@ def __init__( self.temporal_patch_size = temporal_patch_size self.embed_dim = embed_dim - kernel_size = [temporal_patch_size, patch_size, patch_size] - self.proj = nn.Conv3d(in_chans, + kernel_size = (temporal_patch_size, patch_size, patch_size) + self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, @@ -413,7 +415,7 @@ def __init__( self, d_model: int, context_dim: int, - norm_layer: Type[nn.Module] = None, + norm_layer: Optional[Callable[[int], nn.Module]] = None, spatial_merge_size: int = 2, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -489,15 +491,15 @@ def __init__( ) -> None: super().__init__() - patch_size: int = vision_config.patch_size - temporal_patch_size: int = vision_config.temporal_patch_size - spatial_merge_size: int = vision_config.spatial_merge_size - in_chans: int = vision_config.in_chans - hidden_size: int = vision_config.hidden_size - embed_dim: int = vision_config.embed_dim - depth: int = vision_config.depth - num_heads: int = vision_config.num_heads - mlp_ratio: float = vision_config.mlp_ratio + patch_size = vision_config.patch_size + temporal_patch_size = vision_config.temporal_patch_size + spatial_merge_size = vision_config.spatial_merge_size + in_channels = vision_config.in_channels + hidden_size = vision_config.hidden_size + embed_dim = vision_config.embed_dim + depth = vision_config.depth + num_heads = vision_config.num_heads + mlp_ratio = vision_config.mlp_ratio self.spatial_merge_size = spatial_merge_size self.num_heads = num_heads @@ -506,7 +508,7 @@ def __init__( self.patch_embed = Qwen2VisionPatchEmbed( patch_size=patch_size, temporal_patch_size=temporal_patch_size, - in_chans=in_chans, + in_channels=in_channels, embed_dim=embed_dim, ) @@ -733,8 +735,12 @@ def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems": if k == "video": # Special case since even a single item can be a list multi_data[k] = ( # type: ignore[index] - v if (isinstance(v, (dict, torch.Tensor)) # type: ignore[assignment] - or is_list_of(v, list)) else [v] + v if ( + isinstance(v, (dict, torch.Tensor)) # type: ignore[assignment] + or is_list_of(v, list) + or isinstance(v[0], (np.ndarray, torch.Tensor)) + and v[0].ndim == 4 + ) else [v] ) elif k in ("image", "audio"): multi_data[k] = ( # type: ignore[index] @@ -754,6 +760,12 @@ def get_item_counts(self) -> Mapping[str, int]: for m, items in self.items() } + def has_embedding_inputs(self) -> bool: + return any( + isinstance(items, dict) or any( + isinstance(item, torch.Tensor) for item in items) + for items in self.values()) + class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): @@ -784,7 +796,7 @@ def _get_hf_processor( return hf_processor - def _get_processor_data( + def _get_hf_mm_data( self, mm_items: MultiModalDataItems, ) -> tuple[dict[str, Any], dict[str, Any]]: @@ -805,7 +817,7 @@ def _get_processor_data( and v[0].ndim == 2): # Pass through embedding inputs (multi) passthrough_data[f"{k}_embeds"] = v - else: + elif len(v) > 0: # Map keys to plural form, e.g.: image -> images processor_data[f"{k}s"] = v else: @@ -816,8 +828,8 @@ def _get_processor_data( def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() image_processor = _get_image_processor(hf_processor) @@ -831,7 +843,9 @@ def _get_prompt_replacements( merge_length = image_processor.merge_size**2 def get_replacement_qwen2vl(item_idx: int, modality: str): - grid_thw = hf_inputs[f"{modality}_grid_thw"][item_idx] + grid_thw = out_mm_kwargs[f"{modality}_grid_thw"][item_idx] + assert isinstance(grid_thw, torch.Tensor) + num_tokens = grid_thw.prod() // merge_length return placeholder[modality] * num_tokens @@ -844,11 +858,40 @@ def get_replacement_qwen2vl(item_idx: int, modality: str): ) for modality in ("image", "video") ] + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + image_grid_thw = hf_inputs.get("image_grid_thw", torch.empty((0, 3))) + image_slice_idxs = [0] + image_grid_thw.prod(-1).cumsum_(0).tolist() + image_slices = [ + slice(image_slice_idxs[i], image_slice_idxs[i + 1]) + for i in range(len(image_grid_thw)) + ] + + video_grid_thw = hf_inputs.get("video_grid_thw", torch.empty((0, 3))) + video_slice_idxs = [0] + video_grid_thw.prod(-1).cumsum_(0).tolist() + video_slices = [ + slice(video_slice_idxs[i], video_slice_idxs[i + 1]) + for i in range(len(video_grid_thw)) + ] + + return dict( + pixel_values=MultiModalFieldConfig.flat("image", image_slices), + image_embeds=MultiModalFieldConfig.flat("image", image_slices), + image_grid_thw=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.flat( + "video", video_slices), + video_embeds=MultiModalFieldConfig.flat("video", video_slices), + video_grid_thw=MultiModalFieldConfig.batched("video"), + ) + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - num_images = mm_counts["image"] + num_images = mm_counts.get("image", 0) hf_processor = self._get_hf_processor() image_token: str = hf_processor.image_token image_processor = _get_image_processor(hf_processor) @@ -869,7 +912,6 @@ def _get_dummy_mm_inputs( return ProcessorInputs( prompt_text=image_token * num_images, mm_data=data, - mm_processor_kwargs={}, ) @@ -950,9 +992,7 @@ def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): return None return quant_config - def _validate_and_reshape_mm_tensor(self, - mm_input: Union[torch.Tensor, - List[torch.Tensor]], + def _validate_and_reshape_mm_tensor(self, mm_input: object, name: str) -> torch.Tensor: if not isinstance(mm_input, (torch.Tensor, list)): raise ValueError(f"Incorrect type of {name}. " @@ -962,7 +1002,8 @@ def _validate_and_reshape_mm_tensor(self, return mm_input if mm_input.ndim != 3: raise ValueError(f"{name} should be 2D or batched 3D tensor. " - f"Got ndim: {mm_input.ndim}") + f"Got ndim: {mm_input.ndim} " + f"(shape={mm_input.shape})") return torch.concat(list(mm_input)) else: return torch.concat(mm_input) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 509ad9e580ddf..7b4aeeec5f403 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -23,10 +23,11 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, NestedTensors +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig, + MultiModalKwargs, NestedTensors) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement) + ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig from vllm.utils import is_list_of @@ -72,11 +73,19 @@ def get_ultravox_max_audio_tokens(ctx: InputContext): class UltravoxMultiModalProcessor(BaseMultiModalProcessor): + def _get_hf_processor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> ProcessorMixin: + return self.ctx.get_hf_processor() + def _get_feature_extractor(self) -> WhisperFeatureExtractor: hf_processor = self._get_hf_processor() return hf_processor.audio_processor.feature_extractor # type: ignore - def _get_processor_data( + def _get_hf_mm_data( self, mm_items: MultiModalDataItems, ) -> tuple[dict[str, Any], dict[str, Any]]: @@ -84,33 +93,41 @@ def _get_processor_data( feature_extractor = self._get_feature_extractor() mm_items.resample_audios(feature_extractor.sampling_rate) - return super()._get_processor_data(mm_items) + return super()._get_hf_mm_data(mm_items) def _call_hf_processor( self, - hf_processor: ProcessorMixin, prompt: str, - processor_data: Mapping[str, object], - mm_processor_kwargs: Mapping[str, object], + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], ) -> BatchFeature: - processor_data = dict(processor_data) - audios = processor_data.pop("audios", []) + # Text-only input not supported in composite processor + if not mm_data: + tokenizer = self._get_tokenizer() + + prompt_ids = tokenizer.encode( + prompt, + add_special_tokens=False, # type: ignore + ) + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + mm_data = dict(mm_data) + audios = mm_data.pop("audios", []) if not audios: return super()._call_hf_processor( - hf_processor, prompt=prompt, - processor_data=processor_data, - mm_processor_kwargs=mm_processor_kwargs, + mm_data=mm_data, + mm_kwargs=mm_kwargs, ) feature_extractor = self._get_feature_extractor() - mm_processor_kwargs = dict( - **mm_processor_kwargs, + mm_kwargs = dict( + **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, ) - # Already resampled by _get_processor_data + # Already resampled by _get_hf_mm_data assert is_list_of(audios, np.ndarray) # Ultravox processor doesn't support multiple inputs, @@ -119,13 +136,12 @@ def _call_hf_processor( shared_outputs = {} for audio in audios: # NOTE: Ultravox processor accepts "audio" instead of "audios" - item_processor_data = dict(**processor_data, audio=audio) + item_processor_data = dict(**mm_data, audio=audio) item_outputs = super()._call_hf_processor( - hf_processor, prompt=prompt, - processor_data=item_processor_data, - mm_processor_kwargs=mm_processor_kwargs, + mm_data=item_processor_data, + mm_kwargs=mm_kwargs, ) audio_features.append(item_outputs.pop("audio_values")[0]) @@ -139,17 +155,28 @@ def _call_hf_processor( ) return BatchFeature(combined_outputs) + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + audio_features=MultiModalFieldConfig.batched("audio"), + audio_token_len=MultiModalFieldConfig.batched("audio"), + audio_embeds=MultiModalFieldConfig.batched("audio"), + ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() placeholder = hf_processor.audio_token_replacement # type: ignore def get_replacement_ultravox(item_idx: int): - audio_token_len = hf_inputs["audio_token_len"][item_idx] + audio_token_len = out_mm_kwargs["audio_token_len"][item_idx] return placeholder * audio_token_len return [ @@ -168,14 +195,13 @@ def _get_dummy_mm_inputs( sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate - audio_count = mm_counts["audio"] + audio_count = mm_counts.get("audio", 0) audio = np.zeros(audio_len) data = {"audio": [audio] * audio_count} return ProcessorInputs( prompt_text="<|audio|>" * audio_count, mm_data=data, - mm_processor_kwargs={}, ) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 10488e24b30cc..cdda6f8052794 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -297,35 +297,37 @@ def from_seq_group( ``MultiModalPlaceholderMap`` that relates the multi-modal embedding vectors to their corresponding placeholders. - Consider the following scenarios: + Examples: - Prompt: |AAAA BBBB What's in these images?| - Positions: |.................................| + .. code-block:: - images = [A, B] - src_ranges = [(0, 4), (4, 8)] - dest_ranges = [(0, 4), (5, 9)] + Prompt: |AAAA BBBB What's in these images?| + Positions: |.................................| - Prompt: |AAAA BBBB What's in these images?| - Positions: | ..... | + images = [A, B] + src_ranges = [(0, 4), (4, 8)] + dest_ranges = [(0, 4), (5, 9)] - images = [A, B] - src_ranges = [(2, 4), (4, 6)] - dest_ranges = [(0, 2), (3, 5)] + Prompt: |AAAA BBBB What's in these images?| + Positions: | ..... | - Prompt: |AAAA BBBB What's in these images?| - Positions: | ......... | + images = [A, B] + src_ranges = [(2, 4), (4, 6)] + dest_ranges = [(0, 2), (3, 5)] - images = [B] - src_ranges = [(0, 4)] - dest_ranges = [(0, 4)] + Prompt: |AAAA BBBB What's in these images?| + Positions: | ......... | - Prompt: |AAAA BBBB What's in these images?| - Positions: | .......................| + images = [B] + src_ranges = [(0, 4)] + dest_ranges = [(0, 4)] - images = [] - src_ranges = [] - dest_ranges = [] + Prompt: |AAAA BBBB What's in these images?| + Positions: | .......................| + + images = [] + src_ranges = [] + dest_ranges = [] """ seq_mm_data = seq_group.multi_modal_data seq_mm_placeholders = seq_group.multi_modal_placeholders diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 9ecae2c1ca2bf..1fbda6e0b8750 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -1,12 +1,16 @@ +from abc import ABC, abstractmethod from collections import UserDict, defaultdict -from typing import (Any, Dict, List, Literal, Mapping, Sequence, Tuple, - TypedDict, TypeVar, Union, cast, final) +from collections.abc import Mapping, Sequence +from dataclasses import dataclass +from typing import (Any, Literal, NamedTuple, TypedDict, TypeVar, Union, cast, + final) import numpy as np import torch import torch.types from PIL.Image import Image -from typing_extensions import NotRequired, TypeAlias +from transformers import BatchFeature +from typing_extensions import NotRequired, TypeAlias, assert_never from vllm.utils import JSONTree, is_list_of, json_map_leaves @@ -44,7 +48,7 @@ """ # yapf: enable -MultiModalData: TypeAlias = Union[_T, List[_T]] +MultiModalData: TypeAlias = Union[_T, list[_T]] """ Either a single data item, or a list of data items. @@ -79,13 +83,135 @@ class MultiModalDataBuiltins(TypedDict, total=False): """ +class ImageSize(NamedTuple): + width: int + height: int + + +class MultiModalDataItems(UserDict[str, list[Any]]): + """ + As :class:`MultiModalDataDict`, but normalized such that each entry + corresponds to a list. + """ + + @staticmethod + def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems": + """ + Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. + """ + multi_data = MultiModalDataItems() + + for k, v in data.items(): + # TODO: Make a separate modality for embedding inputs + # to avoid confusion + # yapf: disable + if k == "video": + # Special case since even a single item can be a list + multi_data[k] = ( # type: ignore[index] + v if ( + isinstance(v, torch.Tensor) + or is_list_of(v, list) + or isinstance(v[0], (np.ndarray, torch.Tensor)) + and v[0].ndim == 4 + ) else [v] + ) + elif k in ("image", "audio"): + multi_data[k] = ( # type: ignore[index] + v if isinstance(v, (torch.Tensor, list)) else [v] + ) + else: + multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] + # yapf: enable + + return multi_data + + # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to + # `self.images` doesn't update this dictionary, which may be confusing + # We annotate the getter methods as `Sequence` to prevent others from + # trying to update the list in this way + @property + def images(self) -> Sequence[ImageItem]: + return self.get("image", []) + + @property + def videos(self) -> Sequence[VideoItem]: + return self.get("video", []) + + @property + def audios(self) -> Sequence[AudioItem]: + return self.get("audio", []) + + def get_item_counts(self) -> Mapping[str, int]: + return {m: len(items) for m, items in self.items()} + + def has_embedding_inputs(self) -> bool: + return any( + any(isinstance(item, torch.Tensor) for item in items) + for items in self.values()) + + def get_image_size(self, item_idx: int) -> ImageSize: + image = self.images[item_idx] + + if isinstance(image, Image): + return ImageSize(*image.size) + if isinstance(image, (np.ndarray, torch.Tensor)): + _, h, w = image.shape + return ImageSize(w, h) + + assert_never(image) + + def get_audio_with_sr( + self, + item_idx: int, + *, + default_sr: float, + ) -> tuple[np.ndarray, float]: + audio = self.audios[item_idx] + + if isinstance(audio, tuple): + return audio + if isinstance(audio, list): + return np.array(audio), default_sr + if isinstance(audio, np.ndarray): + return audio, default_sr + + assert_never(audio) + + def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None: + """ + If :code:`drop_sr=True`, the audio items in this dictionary are updated + to be NumPy arrays which implicitly means that their sampling rate is + the same as the model's expected sampling rate; otherwise, they remain + as :code:`(audio, new_sr)` tuples. + """ + # Avoid circular import + from .audio import resample_audio + + if not self.audios: + return + + new_audios = [] + for item_idx in range(len(self.audios)): + audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr) + audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr) + + new_audios.append(audio if drop_sr else (audio, new_sr)) + + self["audio"] = new_audios + + class PlaceholderRange(TypedDict): """ Placeholder location information for multi-modal data. - For example: - Prompt: AAAA BBBB What is in these images? + Example: + + Prompt: :code:`AAAA BBBB What is in these images?` + Images A and B will have: + + .. code-block:: + A: { "offset": 0, "length": 4 } B: { "offset": 5, "length": 4 } """ @@ -97,25 +223,256 @@ class PlaceholderRange(TypedDict): """The length of the placeholder.""" -NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor, - Tuple[torch.Tensor, ...]] +NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor, + tuple[torch.Tensor, ...]] """ Uses a list instead of a tensor if the dimensions of each element do not match. """ -BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] + +def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: + """Equality check between :data:`NestedTensors` objects.""" + if isinstance(a, torch.Tensor): + return isinstance(b, torch.Tensor) and bool((a == b).all().item()) + elif isinstance(b, torch.Tensor): + return isinstance(a, torch.Tensor) and bool((b == a).all().item()) + + if isinstance(a, list): + return (isinstance(b, list) + and all(nested_tensors_equal(a_, b_) for a_, b_ in zip(a, b))) + if isinstance(b, list): + return (isinstance(a, list) + and all(nested_tensors_equal(b_, a_) for b_, a_ in zip(b, a))) + + # Both a and b are scalars + return a == b + + +BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via :meth:`MultiModalKwargs.batch`. """ +@dataclass(frozen=True) +class MultiModalFieldItem: + """ + Contains metadata and data in :class:`MultiModalKwargs` + corresponding to a data item in :class:`MultiModalDataItems`. + """ + field: "BaseMultiModalField" + data: NestedTensors + + def __eq__(self, other: object) -> bool: + if not isinstance(other, self.__class__): + return False + + return (self.field == other.field + and nested_tensors_equal(self.data, other.data)) + + +@dataclass(frozen=True) +class BaseMultiModalField(ABC): + """Abstract base class for a field in :class:`MultiModalKwargs`.""" + key: str + modality: str + + @abstractmethod + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + raise NotImplementedError + + def _build_item(self, data: NestedTensors) -> MultiModalFieldItem: + return MultiModalFieldItem(self, data) + + def reduce(self, batch: list[MultiModalFieldItem]) -> MultiModalFieldItem: + """Merge multiple instances of :class:`MultiModalFieldItem` together.""" + fields = [item.field for item in batch] + if len(set(fields)) > 1: + raise ValueError(f"Cannot merge different {fields=}") + + data = self._reduce_data([item.data for item in batch]) + + return self._build_item(data) + + +@dataclass(frozen=True) +class MultiModalBatchedField(BaseMultiModalField): + """ + A :class:`BaseMultiModalField` implementation where an item is obtained by + directly indexing into the first dimension of the underlying data. + """ + + def build_items(self, batch: NestedTensors) -> list[MultiModalFieldItem]: + return [self._build_item(item) for item in batch] + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + first_shape = batch[0].shape + if all(item.shape == first_shape for item in batch): + return torch.stack(batch) + + return batch + + +@dataclass(frozen=True) +class MultiModalFlatField(BaseMultiModalField): + """ + A :class:`BaseMultiModalField` implementation where an item is obtained by + slicing along the first dimension of the underlying data. + """ + + def build_items( + self, + batch: NestedTensors, + slices: Sequence[slice], + ) -> list[MultiModalFieldItem]: + return [self._build_item(batch[slice_]) for slice_ in slices] + + def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: + if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): + first_shape = batch[0].shape + if all(item.shape[1:] == first_shape[1:] for item in batch): + return torch.concat(batch) + + return [elem for item in batch for elem in item] + + +class MultiModalFieldConfig: + + @staticmethod + def batched(modality: str): + return MultiModalFieldConfig( + field_cls=MultiModalBatchedField, + modality=modality, + ) + + @staticmethod + def flat(modality: str, slices: Sequence[slice]): + return MultiModalFieldConfig( + field_cls=MultiModalFlatField, + modality=modality, + slices=slices, + ) + + def __init__( + self, + field_cls: type[BaseMultiModalField], + modality: str, + **field_config: Any, + ) -> None: + super().__init__() + + self._field_cls = field_cls + self._modality = modality + self._field_config = field_config + + def build_items( + self, + key: str, + batch: NestedTensors, + ) -> list[MultiModalFieldItem]: + field = self._field_cls(key=key, modality=self._modality) + return field.build_items(batch, **self._field_config) # type: ignore + + class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to :meth:`~torch.nn.Module.forward`. + + The metadata :code:`items_by_key` defines how to split batched keyword + arguments corresponding to each data item in :class:`MultiModalDataItems`: + + - For a keyword argument, we can access the :code:`i` th item in the batch + via :code:`items_by_key[key][i]`. + - We can gather the keyword arguments belonging to a modality by finding + the keys with items that belong to that modality, then accessing + the :code:`i` th item in the batch for each such key. + + Example: + + .. code-block:: python + + # All items belong to the "image" modality + items_by_key={ + "pixel_values": [a, b, c, d], # "image" modality + "image_grid_thw": [e, f, g, h], # "image" modality + "pixel_values_video": [h, i, j], # "video" modality + "video_grid_thw": [k, l, m], # "video" modality + } + + - The keyword arguments belonging to the first image are + :code:`{"pixel_values": a, "image_grid_thw": e}`. + - The keyword arguments belonging to the second video are + :code:`{"pixel_values_video": i, "video_grid_thw": l}`. """ + @staticmethod + def from_hf_inputs( + hf_inputs: BatchFeature, + config_by_key: Mapping[str, MultiModalFieldConfig], + *, + enable_sanity_checks: bool = False, + ): + # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key` + # We assume that those fields are not used in vLLM + items_by_key = { + key: config.build_items(key, batch) + for key, config in config_by_key.items() + if (batch := hf_inputs.get(key)) is not None + } + + return MultiModalKwargs.from_items_by_key( + items_by_key, + enable_sanity_checks=enable_sanity_checks, + ) + + @staticmethod + def from_items_by_key( + items_by_key: Mapping[str, list[MultiModalFieldItem]], + *, + enable_sanity_checks: bool = False, + ) -> "MultiModalKwargs": + data = { + key: items[0].field.reduce(items).data + for key, items in items_by_key.items() + } + + return MultiModalKwargs(data, + items_by_key=items_by_key, + enable_sanity_checks=enable_sanity_checks) + + def __init__( + self, + data: Mapping[str, NestedTensors], + *, + items_by_key: Mapping[str, list[MultiModalFieldItem]] = {}, + enable_sanity_checks: bool = False, + ) -> None: + super().__init__(data) + + # Shallow copy to avoid footgun in case a defaultdict is passed in + self._items_by_key = dict(items_by_key) + + keys_by_modality = defaultdict[str, set[str]](set) + for key, items in items_by_key.items(): + for item in items: + keys_by_modality[item.field.modality].add(key) + + self._keys_by_modality = dict(keys_by_modality) + + if enable_sanity_checks: + for modality, keys in keys_by_modality.items(): + items_in_modality = {k: items_by_key[k] for k in keys} + batch_sizes = {k: len(v) for k, v in items_in_modality.items()} + batch_size = next(iter(batch_sizes.values()), 0) + assert all(bs == batch_size + for bs in batch_sizes.values()), dict( + modality=modality, + batch_sizes=batch_sizes, + items_by_key=items_by_key) + @staticmethod def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: """ @@ -139,7 +496,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: # Only tensors (not lists) can be stacked. return stacked - tensors_ = cast(List[torch.Tensor], stacked) + tensors_ = cast(list[torch.Tensor], stacked) if any(t.shape != tensors_[0].shape for t in tensors_): # The tensors have incompatible shapes and can't be stacked. return tensors_ @@ -147,7 +504,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: return torch.stack(tensors_) @staticmethod - def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: + def batch(inputs_list: list["MultiModalKwargs"]) -> BatchedTensorInputs: """ Batch multiple inputs together into a dictionary. @@ -162,7 +519,7 @@ def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: # We need to consider the case where each item in the batch # contains different modalities (i.e. different keys). - item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) + item_lists = defaultdict[str, list[NestedTensors]](list) for inputs in inputs_list: for k, v in inputs.items(): @@ -188,6 +545,57 @@ def as_kwargs( return cast(BatchedTensorInputs, json_mapped) + def __eq__(self, other: object) -> bool: + if not isinstance(other, self.__class__): + return False + if self._items_by_key != other._items_by_key: + return False + + ks = self.keys() + return (ks == other.keys() + and all(nested_tensors_equal(self[k], other[k]) for k in ks)) + + def get_item(self, key: str, item_index: int) -> MultiModalFieldItem: + return self._items_by_key[key][item_index] + + def get_items_by_modality( + self, + modality: str, + item_index: int, + ) -> Mapping[str, MultiModalFieldItem]: + """ + Get the keyword arguments corresponding to an item identified by + its modality and index. + """ + keys_to_gather = self._keys_by_modality[modality] + + return { + key: self.get_item(key, item_index) + for key in keys_to_gather if key in self + } + + @staticmethod + def from_items_by_modality( + items_by_modality: Mapping[str, list[Mapping[str, + MultiModalFieldItem]]], + *, + enable_sanity_checks: bool = False, + ) -> "MultiModalKwargs": + """ + Construct a new :class:`MultiModalKwargs` from multiple items returned + by :meth:`get_fields_by_modality`. + """ + items_by_key = defaultdict[str, list[MultiModalFieldItem]](list) + for fields in items_by_modality.values(): + for field in fields: + for k, v in field.items(): + items_by_key[k].append(v) + + return MultiModalKwargs.from_items_by_key( + items_by_key, + enable_sanity_checks=enable_sanity_checks, + ) + MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] """ @@ -207,16 +615,16 @@ class MultiModalInputsV2(TypedDict): prompt: str """The processed prompt text.""" - prompt_token_ids: List[int] + prompt_token_ids: list[int] """The processed token IDs which includes placeholder tokens.""" - token_type_ids: NotRequired[List[int]] + token_type_ids: NotRequired[list[int]] """The token type IDs of the prompt.""" mm_kwargs: MultiModalKwargs """Keyword arguments to be directly passed to the model after batching.""" - mm_hashes: NotRequired[List[str]] + mm_hashes: NotRequired[list[str]] """The hashes of the multi-modal data.""" mm_placeholders: MultiModalPlaceholderDict diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 6baf19d675d50..3ece0762e3228 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,6 +1,6 @@ +import pickle import re from abc import ABC, abstractmethod -from collections import UserDict from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass, field from functools import lru_cache @@ -8,19 +8,18 @@ import numpy as np import torch +from blake3 import blake3 from PIL.Image import Image from transformers import BatchFeature, ProcessorMixin -from typing_extensions import assert_never from vllm.inputs import DummyData, InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import flatten_2d_lists, full_groupby, is_list_of +from vllm.utils import LRUCache, flatten_2d_lists, full_groupby, is_list_of -from .audio import resample_audio -from .inputs import (AudioItem, ImageItem, MultiModalDataDict, - MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, - VideoItem) +from .inputs import (MultiModalDataDict, MultiModalDataItems, + MultiModalFieldConfig, MultiModalFieldItem, + MultiModalInputsV2, MultiModalKwargs, PlaceholderRange) logger = init_logger(__name__) @@ -201,111 +200,6 @@ def get_replacement(self, item_idx: int) -> _BoundPromptSequence: return bound_replacement -class ImageSize(NamedTuple): - width: int - height: int - - -class MultiModalDataItems(UserDict[str, list[Any]]): - """ - As :class:`MultiModalDataDict`, but normalized such that each entry - corresponds to a list. - """ - - @staticmethod - def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems": - """ - Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. - """ - multi_data = MultiModalDataItems() - - for k, v in data.items(): - # TODO: Make a separate modality for embedding inputs - # to avoid confusion - # yapf: disable - if k == "video": - # Special case since even a single item can be a list - multi_data[k] = ( # type: ignore[index] - v if (isinstance(v, torch.Tensor) - or is_list_of(v, list)) else [v] - ) - elif k in ("image", "audio"): - multi_data[k] = ( # type: ignore[index] - v if isinstance(v, (torch.Tensor, list)) else [v] - ) - else: - multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] - # yapf: enable - - return multi_data - - # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to - # `self.images` doesn't update this dictionary, which may be confusing - # We annotate the getter methods as `Sequence` to prevent others from - # trying to update the list in this way - @property - def images(self) -> Sequence[ImageItem]: - return self.get("image", []) - - @property - def videos(self) -> Sequence[VideoItem]: - return self.get("video", []) - - @property - def audios(self) -> Sequence[AudioItem]: - return self.get("audio", []) - - def get_item_counts(self) -> Mapping[str, int]: - return {m: len(items) for m, items in self.items()} - - def get_image_size(self, item_idx: int) -> ImageSize: - image = self.images[item_idx] - - if isinstance(image, Image): - return ImageSize(*image.size) - if isinstance(image, (np.ndarray, torch.Tensor)): - _, h, w = image.shape - return ImageSize(w, h) - - assert_never(image) - - def get_audio_with_sr( - self, - item_idx: int, - *, - default_sr: float, - ) -> tuple[np.ndarray, float]: - audio = self.audios[item_idx] - - if isinstance(audio, tuple): - return audio - if isinstance(audio, list): - return np.array(audio), default_sr - if isinstance(audio, np.ndarray): - return audio, default_sr - - assert_never(audio) - - def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None: - """ - If :code:`drop_sr=True`, the audio items in this dictionary are updated - to be NumPy arrays which implicitly means that their sampling rate is - the same as the model's expected sampling rate; otherwise, they remain - as :code:`(audio, new_sr)` tuples. - """ - if not self.audios: - return - - new_audios = [] - for item_idx in range(len(self.audios)): - audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr) - audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr) - - new_audios.append(audio if drop_sr else (audio, new_sr)) - - self["audio"] = new_audios - - class _TokenMatch(NamedTuple): start_idx: int end_idx: int @@ -583,11 +477,124 @@ def iter_placeholders( ) -class ProcessorInputs(NamedTuple): - """Keyword arguments to :meth:`BaseMultiModalProcessor`""" +@dataclass +class ProcessorInputs: + """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" prompt_text: str mm_data: MultiModalDataDict - mm_processor_kwargs: Mapping[str, object] + hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) + + +class ProcessingCache: + + def __init__(self, capacity: int) -> None: + super().__init__() + + # DEBUG: Set to None to disable + self.debug_cache_hit_ratio_steps: Optional[int] = None + + self._cache = LRUCache[str, Mapping[str, + MultiModalFieldItem]](capacity) + + def _maybe_log_cache_stats(self) -> None: + steps = self.debug_cache_hit_ratio_steps + if not steps: + return + + cache_stats = self._cache.stat() + if cache_stats.total % steps == 0: + logger.debug("ProcessingCache: hit_ratio = %.2f", + cache_stats.hit_ratio) + + def _serialize_item(self, obj: object) -> bytes: + # Simple cases + if isinstance(obj, str): + return obj.encode("utf-8") + if isinstance(obj, bytes): + return obj + if isinstance(obj, Image): + return obj.tobytes() + + # Convertible to NumPy arrays + if isinstance(obj, torch.Tensor): + obj = obj.numpy() + if isinstance(obj, (int, float)): + obj = np.array(obj) + if isinstance(obj, np.ndarray): + return obj.tobytes() + + logger.warning( + "No serialization method found for %s. " + "Falling back to pickle.", type(obj)) + + return pickle.dumps(obj) + + def _item_to_bytes( + self, + key: str, + obj: object, + ) -> Iterable[tuple[bytes, bytes]]: + # Recursive cases + if isinstance(obj, (list, tuple)): + for i, elem in enumerate(obj): + yield from self._item_to_bytes(f"{key}.{i}", elem) + elif isinstance(obj, dict): + for k, v in obj.items(): + yield from self._item_to_bytes(f"{key}.{k}", v) + else: + key_bytes = self._serialize_item(key) + value_bytes = self._serialize_item(obj) + yield key_bytes, value_bytes + + def _hash_kwargs(self, **kwargs: object) -> str: + hasher = blake3() + + for k, v in kwargs.items(): + for k_bytes, v_bytes in self._item_to_bytes(k, v): + hasher.update(k_bytes) + hasher.update(v_bytes) + + return hasher.hexdigest() + + def get( + self, + model_id: str, + modality: str, + input_item: object, + input_kwargs: Mapping[str, object], + ) -> Optional[Mapping[str, MultiModalFieldItem]]: + """ + Get a processed multi-modal item from the cache + according to its dependencies, including: + + - The model ID + - The modality of the item + - The original data item passed to the HF processor + - The configuration options of the HF processor + """ + self._maybe_log_cache_stats() + + cache_key = self._hash_kwargs(model_id=model_id, + **{modality: input_item}, + **input_kwargs) + return self._cache.get(cache_key) + + def put( + self, + model_id: str, + modality: str, + input_item: object, + input_kwargs: Mapping[str, object], + output_kwargs: Mapping[str, MultiModalFieldItem], + ) -> None: + """ + Put a processed multi-modal item into the cache + according to its dependencies (see :meth:`get`). + """ + cache_key = self._hash_kwargs(model_id=model_id, + **{modality: input_item}, + **input_kwargs) + self._cache.put(cache_key, output_kwargs) class BaseMultiModalProcessor(ABC): @@ -595,18 +602,24 @@ class BaseMultiModalProcessor(ABC): Abstract base class to process multi-modal inputs to be used in vLLM. """ - def __init__(self, ctx: InputProcessingContext) -> None: + def __init__(self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: super().__init__() self.ctx = ctx + self.cache = cache + self.enable_sanity_checks = enable_sanity_checks def __call__( self, prompt: str, mm_data: MultiModalDataDict, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - return self.apply(prompt, mm_data, mm_processor_kwargs) + return self.apply(prompt, mm_data, hf_processor_mm_kwargs) def _get_hf_processor(self) -> ProcessorMixin: """ @@ -624,12 +637,21 @@ def _get_mm_items( ) -> MultiModalDataItems: return MultiModalDataItems.from_dict(mm_data) + @abstractmethod + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + """Given the HF-processed data, output the metadata of each field.""" + raise NotImplementedError + @abstractmethod def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_inputs: BatchFeature, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: """ Given the original multi-modal items for this modality @@ -651,7 +673,7 @@ def _find_placeholders( return list( iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts)) - def _get_processor_data( + def _get_hf_mm_data( self, mm_items: MultiModalDataItems, ) -> tuple[dict[str, Any], dict[str, Any]]: @@ -669,7 +691,7 @@ def _get_processor_data( and v[0].ndim == 2): # Pass through embedding inputs (multi) passthrough_data[f"{k}_embeds"] = v - else: + elif len(v) > 0: # Map keys to plural form, e.g.: image -> images processor_data[f"{k}s"] = v else: @@ -679,39 +701,181 @@ def _get_processor_data( def _call_hf_processor( self, - hf_processor: ProcessorMixin, prompt: str, - processor_data: Mapping[str, object], - mm_processor_kwargs: Mapping[str, object], + # Not to be confused with `mm_data` in `self.apply`. + # This refers to the data to be passed to HF processor. + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], ) -> BatchFeature: return self.ctx.call_hf_processor( - hf_processor, - prompt, - processor_data, - mm_processor_kwargs, + self._get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + mm_kwargs, ) def _apply_hf_processor( self, - prompt: str, + prompt_text: str, mm_items: MultiModalDataItems, - mm_processor_kwargs: Mapping[str, object], - ) -> BatchFeature: - # some mm_processor_kwargs may be used in processor initialization - # instead of processor call - hf_processor = self._get_hf_processor(**mm_processor_kwargs) + hf_processor_mm_kwargs: Mapping[str, object], + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the full prompt text and multi-modal data. + """ + processor_data, passthrough_data = self._get_hf_mm_data(mm_items) + + processed_data = self._call_hf_processor( + prompt=prompt_text, + mm_data=processor_data, + mm_kwargs=hf_processor_mm_kwargs, + ) + processed_data.update(passthrough_data) - processor_data, passthrough_data = self._get_processor_data(mm_items) + prompt_ids, = processed_data.pop("input_ids").tolist() - hf_inputs = self._call_hf_processor( - hf_processor, - prompt=prompt, - processor_data=processor_data, - mm_processor_kwargs=mm_processor_kwargs, + mm_kwargs = MultiModalKwargs.from_hf_inputs( + processed_data, + self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), + enable_sanity_checks=self.enable_sanity_checks, ) - hf_inputs.update(passthrough_data) - return hf_inputs + return prompt_ids, mm_kwargs + + def _apply_hf_processor_missing( + self, + prompt_text: str, + mm_missing_data_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ): + """ + Apply the HF processor on the full prompt text, but only on the + multi-modal data that are missing from the cache. + + Note: We pass prompt text and multi-modal data into the HF processor + in separate calls to avoid HF prompt replacement being done for + cached items; instead, we rely on our own prompt replacement logic + for the full text. + """ + mm_missing_counts = mm_missing_data_items.get_item_counts() + + prompt_ids, _ = self._apply_hf_processor( + prompt_text=prompt_text, + mm_items=MultiModalDataItems({}), + hf_processor_mm_kwargs={}, + ) + + # Some HF processors (e.g. Qwen2-VL) expect corresponding + # multi-modal tokens to be in the prompt text + dummy_inputs = self._get_dummy_mm_inputs(mm_missing_counts) + + _, mm_missing_kwargs = self._apply_hf_processor( + prompt_text=dummy_inputs.prompt_text, + mm_items=mm_missing_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + return prompt_ids, mm_missing_kwargs + + def _cached_apply_hf_processor( + self, + prompt_text: str, + mm_data_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> tuple[list[int], MultiModalKwargs]: + """ + Apply the HF processor on the full prompt text, + caching the results and reusing cached results. + """ + cache = self.cache + model_id = self.ctx.model_config.model + + if cache is None or mm_data_items.has_embedding_inputs(): + return self._apply_hf_processor( + prompt_text=prompt_text, + mm_items=mm_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + mm_maybe_cached_field_items = { + modality: [ + cache.get(model_id, modality, item, hf_processor_mm_kwargs) + for item in items + ] + for modality, items in mm_data_items.items() + } + + mm_missing_idxs = { + modality: [idx for idx, out in enumerate(fields) if out is None] + for modality, fields in mm_maybe_cached_field_items.items() + } + mm_missing_data = { + modality: [mm_data_items[modality][idx] for idx in idxs] + for modality, idxs in mm_missing_idxs.items() + } + mm_missing_data_items = self._get_mm_items(mm_missing_data) + + prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing( + prompt_text=prompt_text, + mm_missing_data_items=mm_missing_data_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + mm_missing_next_idx = { + modality: 0 + for modality in mm_missing_data_items + } + + mm_merged_field_items = dict[str, list[Mapping[str, + MultiModalFieldItem]]]() + for modality, modal_items_lst in mm_maybe_cached_field_items.items(): + merged_modal_items_lst = list[Mapping[str, MultiModalFieldItem]]() + + for idx, modal_items in enumerate(modal_items_lst): + if modal_items is None: + modal_items = mm_missing_kwargs.get_items_by_modality( + modality, + mm_missing_next_idx[modality], + ) + + cache.put( + model_id, + modality, + mm_data_items[modality][idx], + hf_processor_mm_kwargs, + modal_items, + ) + + mm_missing_next_idx[modality] += 1 + + merged_modal_items_lst.append(modal_items) + + mm_merged_field_items[modality] = merged_modal_items_lst + + if self.enable_sanity_checks: + mm_missing_counts = mm_missing_data_items.get_item_counts() + assert all( + item_count == mm_missing_counts[modality] + for modality, item_count in mm_missing_next_idx.items()), dict( + mm_missing_next_idx=mm_missing_next_idx, + mm_missing_counts=mm_missing_counts) + + mm_kwargs = MultiModalKwargs.from_items_by_modality( + mm_merged_field_items, + enable_sanity_checks=self.enable_sanity_checks, + ) + + if self.enable_sanity_checks: + mm_item_counts = mm_data_items.get_item_counts() + + for modality, item_count in mm_item_counts.items(): + for item_idx in range(item_count): + try: + mm_kwargs.get_items_by_modality(modality, item_idx) + except Exception as e: + # Make it easy to set a breakpoint in the debugger + raise e + + return prompt_ids, mm_kwargs def _bind_prompt_replacements( self, @@ -730,6 +894,10 @@ def _apply_prompt_replacements( tokenizer = self._get_tokenizer() token_matches = find_token_matches(token_ids, prompt_repls) + mm_match_counts = { + modality: len(matches) + for modality, matches in full_groupby_modality(token_matches) + } # If the search text does not represent a special token, # it may have different token IDs in the prompt, because @@ -742,8 +910,8 @@ def _apply_prompt_replacements( # of the search text in the prompt, we instead perform string # replacement on the decoded token IDs, then encode them back. if all( - len(matches) >= mm_item_counts[modality] - for modality, matches in full_groupby_modality(token_matches) + mm_match_counts.get(modality, 0) >= item_count + for modality, item_count in mm_item_counts.items() ): # yapf: disable token_ids = replace_token_matches( token_ids, @@ -775,7 +943,7 @@ def apply( self, prompt_text: str, mm_data: MultiModalDataDict, - mm_processor_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: """ Process multi-modal inputs to be used in vLLM. @@ -792,20 +960,24 @@ def apply( """ mm_items = self._get_mm_items(mm_data) - hf_inputs = self._apply_hf_processor(prompt_text, mm_items, - mm_processor_kwargs) - prompt_ids, = hf_inputs.pop("input_ids").tolist() - mm_kwargs = MultiModalKwargs(hf_inputs) + prompt_ids, mm_kwargs = self._cached_apply_hf_processor( + prompt_text, + mm_items, + hf_processor_mm_kwargs, + ) - prompt_repls = self._get_prompt_replacements(mm_items, hf_inputs, - mm_processor_kwargs) - all_prompt_repls = self._bind_prompt_replacements(prompt_repls) + unbound_prompt_repls = self._get_prompt_replacements( + mm_items, + hf_processor_mm_kwargs, + mm_kwargs, + ) + prompt_repls = self._bind_prompt_replacements(unbound_prompt_repls) # If HF processor already inserts placeholder tokens, # there is no need for us to insert them mm_item_counts = mm_items.get_item_counts() - all_placeholders = self._find_placeholders(all_prompt_repls, - prompt_ids, mm_item_counts) + all_placeholders = self._find_placeholders(prompt_repls, prompt_ids, + mm_item_counts) if all_placeholders: tokenizer = self._get_tokenizer() @@ -817,7 +989,7 @@ def apply( all_placeholders, ) = self._apply_prompt_replacements( prompt_ids, - all_prompt_repls, + prompt_repls, mm_item_counts, ) @@ -855,23 +1027,29 @@ def get_dummy_data( from vllm.sequence import SequenceData processor_inputs = self._get_dummy_mm_inputs(mm_counts) - mm_inputs = self.apply(*processor_inputs) + mm_inputs = self.apply( + prompt_text=processor_inputs.prompt_text, + mm_data=processor_inputs.mm_data, + hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, + ) prompt_token_ids = mm_inputs["prompt_token_ids"] placeholders_by_modality = mm_inputs["mm_placeholders"] - total_placeholders_by_modality = dict[str, int]() - for modality, placeholders in placeholders_by_modality.items(): - num_placeholders = sum(item["length"] for item in placeholders) - max_tokens = mm_max_tokens[modality] - - if num_placeholders != max_tokens: - logger.warning( - "The processed dummy data has a total of %d placeholder " - "tokens for the '%s' modality, which is not the expected " - "%d tokens.", num_placeholders, modality, max_tokens) - - total_placeholders_by_modality[modality] = num_placeholders + total_placeholders_by_modality = { + modality: sum(item["length"] for item in placeholders) + for modality, placeholders in placeholders_by_modality.items() + } + expected_placeholders_by_modality = { + modality: mm_max_tokens[modality] + for modality in placeholders_by_modality + } + if total_placeholders_by_modality != expected_placeholders_by_modality: + raise AssertionError( + f"The processed dummy data has a total of " + f"{total_placeholders_by_modality} placeholder tokens, which " + f"is not the expected {expected_placeholders_by_modality} " + "tokens.") total_len = len(prompt_token_ids) if total_len > seq_len: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index ded45a7184b5d..3a5e11867ad9e 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,10 +1,9 @@ import functools from collections import UserDict -from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, +from typing import (TYPE_CHECKING, Any, Dict, Mapping, Optional, Protocol, Sequence, Type, TypeVar) import torch.nn as nn -from typing_extensions import TypeAlias from vllm.inputs import InputProcessingContext from vllm.logger import init_logger @@ -15,7 +14,7 @@ from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc from .image import ImagePlugin from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors -from .processing import BaseMultiModalProcessor +from .processing import BaseMultiModalProcessor, ProcessingCache from .video import VideoPlugin if TYPE_CHECKING: @@ -23,15 +22,22 @@ logger = init_logger(__name__) +# TODO: Tune the MM cache size +MM_CACHE_SIZE = 256 + N = TypeVar("N", bound=Type[nn.Module]) -MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext], - BaseMultiModalProcessor] -""" -Constructs a :class:`MultiModalProcessor` instance from the context. -The processing metadata should be derived from the context. -""" +class MultiModalProcessorFactory(Protocol): + """Constructs a :class:`MultiModalProcessor` instance from the context.""" + + def __call__( + self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + ) -> BaseMultiModalProcessor: + ... class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): @@ -71,6 +77,8 @@ def __init__( self._limits_by_model = _MultiModalLimits() + self._processing_cache = ProcessingCache(MM_CACHE_SIZE) + def register_plugin(self, plugin: MultiModalPlugin) -> None: """ Register a multi-modal plugin so it can be recognized by vLLM. @@ -328,15 +336,18 @@ def wrapper(model_cls: N) -> N: return wrapper - def has_processor(self, model_config: "ModelConfig") -> bool: - """ - Test whether a multi-modal processor is defined for a specific model. - """ + def _get_model_cls(self, model_config: "ModelConfig"): # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture model_cls, _ = get_model_architecture(model_config) - return model_cls in self._processor_factories + return model_cls + + def has_processor(self, model_config: "ModelConfig") -> bool: + """ + Test whether a multi-modal processor is defined for a specific model. + """ + return self._get_model_cls(model_config) in self._processor_factories def create_processor( self, @@ -346,12 +357,11 @@ def create_processor( """ Create a multi-modal processor for a specific model and tokenizer. """ - - # Avoid circular import - from vllm.model_executor.model_loader import get_model_architecture - - model_cls, _ = get_model_architecture(model_config) + model_cls = self._get_model_cls(model_config) processor_factory = self._processor_factories[model_cls] ctx = InputProcessingContext(model_config, tokenizer) - return processor_factory(ctx) + cache = (None if model_config.disable_mm_preprocessor_cache else + self._processing_cache) + + return processor_factory(ctx, cache=cache) diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index f1523667b0466..b12cc83a22970 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -1,25 +1,31 @@ from functools import lru_cache from typing import Any, cast +from transformers.processing_utils import ProcessorMixin + def get_processor( processor_name: str, *args: Any, trust_remote_code: bool = False, + processor_cls: type[ProcessorMixin] = ProcessorMixin, **kwargs: Any, ): """Load a processor for the given model name via HuggingFace.""" # don't put this import at the top level # it will call torch.cuda.device_count() from transformers import AutoProcessor - from transformers.processing_utils import ProcessorMixin + + processor_factory = (AutoProcessor + if processor_cls == ProcessorMixin else processor_cls) try: - processor = AutoProcessor.from_pretrained( + processor = processor_factory.from_pretrained( processor_name, *args, trust_remote_code=trust_remote_code, - **kwargs) + **kwargs, + ) except ValueError as e: # If the error pertains to the processor class not existing or not # currently being imported, suggest using the --trust-remote-code flag. diff --git a/vllm/utils.py b/vllm/utils.py index 3d198887021dc..5eb4e8c4180c4 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -25,11 +25,11 @@ import weakref from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task from collections import OrderedDict, UserDict, defaultdict -from collections.abc import Iterable, Mapping +from collections.abc import Hashable, Iterable, Mapping from dataclasses import dataclass, field from functools import lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, - Dict, Generator, Generic, Hashable, List, Literal, + Dict, Generator, Generic, List, Literal, NamedTuple, Optional, Tuple, Type, TypeVar, Union, overload) from uuid import uuid4 @@ -194,13 +194,29 @@ def reset(self) -> None: self.counter = 0 +class CacheInfo(NamedTuple): + hits: int + total: int + + @property + def hit_ratio(self) -> float: + if self.total == 0: + return 0 + + return self.hits / self.total + + class LRUCache(Generic[_K, _V]): + """Note: This class is not thread safe!""" def __init__(self, capacity: int) -> None: self.cache = OrderedDict[_K, _V]() self.pinned_items = set[_K]() self.capacity = capacity + self._hits = 0 + self._total = 0 + def __contains__(self, key: _K) -> bool: return key in self.cache @@ -218,6 +234,9 @@ def __setitem__(self, key: _K, value: _V) -> None: def __delitem__(self, key: _K) -> None: self.pop(key) + def stat(self) -> CacheInfo: + return CacheInfo(hits=self._hits, total=self._total) + def touch(self, key: _K) -> None: self.cache.move_to_end(key) @@ -226,8 +245,12 @@ def get(self, key: _K, default: Optional[_V] = None) -> Optional[_V]: if key in self.cache: value = self.cache[key] self.cache.move_to_end(key) + + self._hits += 1 else: value = default + + self._total += 1 return value def put(self, key: _K, value: _V) -> None: From 55509c2114718c1292c11348f002461ba44cb23b Mon Sep 17 00:00:00 2001 From: ErezSC42 Date: Fri, 27 Dec 2024 19:58:21 +0200 Subject: [PATCH 007/462] [MODEL] LoRA support for Jamba model (#11209) Signed-off-by: Erez Schwartz --- tests/lora/conftest.py | 24 +++++++++ tests/lora/test_jamba.py | 54 +++++++++++++++++++ .../layers/mamba/mamba_mixer.py | 22 ++++++-- vllm/model_executor/models/jamba.py | 50 ++++++++--------- vllm/model_executor/models/mamba.py | 14 +++-- 5 files changed, 132 insertions(+), 32 deletions(-) create mode 100644 tests/lora/test_jamba.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 8b247fb9b2388..57ebaa424fc59 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -4,6 +4,7 @@ from unittest.mock import MagicMock, patch import pytest +import safetensors import torch import torch.nn as nn from huggingface_hub import snapshot_download @@ -169,6 +170,29 @@ def mixtral_lora_files_all_target_modules(): return snapshot_download(repo_id="dyang415/mixtral-lora-v0") +@pytest.fixture(scope="session") +def jamba_lora_files(): + # some of the adapters have unnecessary weights for serving, + # hence we remove them + def remove_unnecessary_weights(path): + lora_path = f"{adapter_path}/adapter_model.safetensors" + tensors = safetensors.torch.load_file(lora_path) + nonlora_keys = [] + for k in list(tensors.keys()): + if "lora" not in k: + nonlora_keys.append(k) + for k in nonlora_keys: + del tensors[k] + safetensors.torch.save_file(tensors, lora_path) + + adapter_path = snapshot_download( + repo_id= + "hf-100/Jamba-1.5-mini-Spellbound-StoryWriter-0.1-6583896-ckpt53-lora") + + remove_unnecessary_weights(adapter_path) + return adapter_path + + @pytest.fixture(scope="session") def gemma_lora_files(): return snapshot_download(repo_id="wskwon/gemma-7b-test-lora") diff --git a/tests/lora/test_jamba.py b/tests/lora/test_jamba.py new file mode 100644 index 0000000000000..6aa33926cb6b8 --- /dev/null +++ b/tests/lora/test_jamba.py @@ -0,0 +1,54 @@ +from typing import List + +import pytest +import torch + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "ai21labs/AI21-Jamba-1.5-Mini" + +MAX_TOKENS = 40 + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int, + prompts: List[str]) -> List[str]: + + sampling_params = vllm.SamplingParams(temperature=0, max_tokens=MAX_TOKENS) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + # Print the outputs. + generated_texts: List[str] = [] + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text.strip() + generated_texts.append(generated_text) + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + return generated_texts + + +@pytest.mark.parametrize("tp_size", [4]) +def test_jamba_lora(jamba_lora_files, tp_size): + """Original test, the LoRA model has the common target modules, not all""" + if torch.cuda.device_count() < tp_size: + pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") + + prompts = ["Write a story about a sheep and a goat."] + + llm = vllm.LLM( + MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_loras=4, + distributed_executor_backend="ray", + tensor_parallel_size=tp_size, + ) + + expected_jamba_output = [ + """Once upon a time, in a lush green meadow, there lived a sheep named Clara and a goat named Billy. Clara was a gentle creature, always nibbling on the soft grass and humming""" # noqa: E501 + ] + assert do_sample(llm, jamba_lora_files, lora_id=1, + prompts=prompts) == expected_jamba_output diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 10bec75f49fdf..606c796d503cf 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -42,12 +42,14 @@ def __init__(self, use_rms_norm: bool, rms_norm_has_weight: bool = True, rms_norm_eps: float = 1e-5, - activation="silu"): + activation="silu", + is_lora_enabled: bool = False): super().__init__() self.time_step_rank = time_step_rank self.ssm_state_size = ssm_state_size self.use_rms_norm = use_rms_norm self.activation = activation + self.is_lora_enabled = is_lora_enabled self.conv1d = ColumnParallelLinear( input_size=conv_kernel_size, @@ -63,6 +65,7 @@ def __init__(self, self.in_proj = MergedColumnParallelLinear(hidden_size, [intermediate_size] * 2, bias=use_bias) + # selective projection used to make dt, B and C input dependent self.x_proj = RowParallelLinear( intermediate_size, @@ -170,7 +173,13 @@ def forward_cuda(self, hidden_states: torch.Tensor, # 3. State Space Model sequence transformation # 3.a. input varying initialization of time_step, B and C - ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] + + if self.is_lora_enabled: + # lora kernel requires contiguous tensor + ssm_parameters = self.x_proj( + hidden_states.transpose(-2, -1).contiguous())[0] + else: + ssm_parameters = self.x_proj(hidden_states.transpose(-2, -1))[0] time_step, B, C = torch.split( ssm_parameters, @@ -222,6 +231,11 @@ def forward_cuda(self, hidden_states: torch.Tensor, scan_outputs = scan_outputs.transpose(0, 1) # 4. Final linear projection - contextualized_states = self.out_proj(scan_outputs.transpose(-2, - -1))[0] + if self.is_lora_enabled: + # lora kernel requires contiguous tensor + contextualized_states = self.out_proj( + scan_outputs.transpose(-2, -1).contiguous())[0] + else: + contextualized_states = self.out_proj( + scan_outputs.transpose(-2, -1))[0] return contextualized_states diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 91786db5ddc96..890b5530b97d6 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -107,9 +107,11 @@ def __init__(self, layer_idx: int, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - prefix: str = "") -> None: + is_lora_enabled: Optional[bool] = False, + **kwargs) -> None: super().__init__() self.config = config + self.is_lora_enabled = is_lora_enabled self.mamba = MambaMixer(hidden_size= config.hidden_size, ssm_state_size = config.mamba_d_state, conv_kernel_size = config.mamba_d_conv, @@ -120,7 +122,9 @@ def __init__(self, use_bias = config.mamba_proj_bias, use_rms_norm=True, rms_norm_eps=config.rms_norm_eps, - activation=config.hidden_act) + activation=config.hidden_act, + is_lora_enabled = self.is_lora_enabled + ) num_experts = config.layers_num_experts[layer_idx] ffn_layer_class = JambaMoE if num_experts > 1 else JambaMLP @@ -156,14 +160,13 @@ def forward( class JambaAttentionDecoderLayer(nn.Module): - def __init__( - self, - config: JambaConfig, - layer_idx: int, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, + config: JambaConfig, + layer_idx: int, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + **kwargs) -> None: super().__init__() self.hidden_size = config.hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -287,17 +290,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): org_num_embeddings=config.vocab_size, ) + extra_kwargs = {"is_lora_enabled": bool(vllm_config.lora_config)} + def get_layer(prefix: str): layer_idx = int(prefix.rsplit(".", 1)[1]) layer_class = ALL_DECODER_LAYER_TYPES[ config.layers_block_type[layer_idx]] - return layer_class( - config, - layer_idx, - cache_config, - quant_config=quant_config, - prefix=prefix, - ) + return layer_class(config, + layer_idx, + cache_config, + quant_config=quant_config, + prefix=prefix, + **extra_kwargs) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers") @@ -371,14 +375,13 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP, "k_proj", "v_proj", ], + "in_proj": ["in_proj"], } # LoRA specific attributes supported_lora_modules = [ - "qkv_proj", - "o_proj", - "embed_tokens", - "lm_head", + "qkv_proj", "o_proj", "embed_tokens", "lm_head", "up_proj", + "down_proj", "gate_proj", "out_proj", "in_proj", "x_proj" ] embedding_modules = { "embed_tokens": "input_embeddings", @@ -423,9 +426,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) if self.scheduler_config is not None and \ - not self.model_config.enforce_eager: + not self.model_config.enforce_eager: if self.scheduler_config.max_num_seqs > \ - vllm_config.compilation_config.max_capture_size: + vllm_config.compilation_config.max_capture_size: self.max_batch_size = \ vllm_config.compilation_config.max_capture_size else: @@ -446,7 +449,6 @@ def forward(self, inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: - num_mamba_layers = self.model_config.get_num_layers_by_block_type( self.vllm_config.parallel_config, LayerBlockType.mamba) self.mamba_cache = MambaCacheManager( diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 06c8d9723cd01..553bc9c28cb21 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -38,10 +38,12 @@ class MambaDecoderLayer(nn.Module): def __init__(self, config: MambaConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + quant_config: Optional[QuantizationConfig] = None, + is_lora_enabled: Optional[bool] = False) -> None: super().__init__() self.config = config self.is_falcon_mamba = config.model_type == "falcon_mamba" + self.is_lora_enabled = is_lora_enabled mixer_rms_eps = config.mixer_rms_eps if self.is_falcon_mamba else None self.mixer = MambaMixer(hidden_size=config.hidden_size, ssm_state_size=config.state_size, @@ -53,7 +55,8 @@ def __init__(self, use_rms_norm=self.is_falcon_mamba, rms_norm_has_weight=not self.is_falcon_mamba, rms_norm_eps=mixer_rms_eps, - activation=config.hidden_act) + activation=config.hidden_act, + is_lora_enabled=self.is_lora_enabled) self.norm = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) @@ -85,6 +88,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config + is_lora_enabled = bool(lora_config) self.config = config self.padding_idx = config.pad_token_id @@ -101,8 +105,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, - lambda prefix: MambaDecoderLayer( - config, cache_config=cache_config, quant_config=quant_config), + lambda prefix: MambaDecoderLayer(config, + cache_config=cache_config, + quant_config=quant_config, + is_lora_enabled=is_lora_enabled), prefix=f"{prefix}.layers") self.norm_f = RMSNorm(config.hidden_size, From 0240402c4632604c9cd02f7eae4ae36fa990b38f Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 28 Dec 2024 02:48:24 +0800 Subject: [PATCH 008/462] [Misc]Add BNB quantization for MolmoForCausalLM (#11551) Signed-off-by: Jee Jee Li --- vllm/model_executor/model_loader/loader.py | 26 +++++-- vllm/model_executor/models/molmo.py | 90 ++++++++++++++++------ 2 files changed, 83 insertions(+), 33 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index f2d9293b31a83..4bca13cb2f60c 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -11,7 +11,8 @@ import warnings from abc import ABC, abstractmethod from contextlib import contextmanager -from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast +from typing import (Any, Callable, Dict, Generator, Iterable, List, Optional, + Tuple, cast) import gguf import huggingface_hub @@ -706,6 +707,8 @@ def __init__(self, load_config: LoadConfig): # Store all module names (from transformers) that support # BNB quantization. self.target_modules: List[str] = [] + # mapping weight names from transformers to vllm. + self.weight_mapper: Callable = lambda name: name def _get_weight_files( self, @@ -763,9 +766,12 @@ def _prepare_weights(self, model_name_or_path: str, def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool): if use_safetensors: - return safetensors_weights_iterator(hf_weights_files) + iterator = safetensors_weights_iterator(hf_weights_files) else: - return pt_weights_iterator(hf_weights_files) + iterator = pt_weights_iterator(hf_weights_files) + for name, param in iterator: + # mapping weight names from transformers to vllm. + yield self.weight_mapper(name), param def _get_quantized_weights_iterator( self, @@ -782,12 +788,12 @@ def _get_quantized_weights_iterator( try: import bitsandbytes - if bitsandbytes.__version__ < "0.44.0": + if bitsandbytes.__version__ < "0.45.0": raise ImportError("bitsandbytes version is wrong. Please " - "install bitsandbytes>=0.44.0.") + "install bitsandbytes>=0.45.0.") except ImportError as err: - raise ImportError("Please install bitsandbytes>=0.44.0 via " - "`pip install bitsandbytes>=0.44.0` to use " + raise ImportError("Please install bitsandbytes>=0.45.0 via " + "`pip install bitsandbytes>=0.45.0` to use " "bitsandbytes quantizer.") from err hf_weights_files, use_safetensors = self._prepare_weights( @@ -991,7 +997,7 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: if isinstance(module, (LinearBase, )): last_name = name.split(".")[-1] if sub_modules := inverse_stacked_mapping.get(last_name, []): - # Map vllm's names to transformers' names. + # Map vllm's names to transformers's names. for sub_name in sub_modules: self.target_modules.append( name.replace(last_name, sub_name)) @@ -1013,6 +1019,10 @@ def _load_weights(self, model_config: ModelConfig, f"Model {type(model).__name__} does not support BitsAndBytes " "quantization yet.") + # For some models like Molmo, we need to use hf_to_vllm_mapper + # to ensure correct loading of weights. + if hf_to_vllm_mapper := getattr(model, "hf_to_vllm_mapper", None): + self.weight_mapper = lambda name: hf_to_vllm_mapper._map_name(name) # Modules whose weights might have fused on disk # we need their output_sizes to make shard in flight correctly with TP self.maybe_fused_weights_modules: Dict[str, List[int]] = {} diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 8938f62d0c494..5d52d2c3e6b48 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -461,30 +461,71 @@ def forward( return output -class MolmoMLP(nn.Module): +class SwiGLU(nn.Module): + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x, gate = x.chunk(2, dim=-1) + # Note that the order is reversed compared to + # SiluAndMul. + return x * F.silu(gate) + + +class LanuageModelMLP(nn.Module): """Molmo's LLM mlp.""" def __init__(self, config: PretrainedConfig, input_dim: Optional[int] = None, - quant_config: Optional[QuantizationConfig] = None, - proj_name: str = "gate_up_proj") -> None: + quant_config: Optional[QuantizationConfig] = None) -> None: super().__init__() self.hidden_size = config.hidden_size self.intermediate_size = config.intermediate_size // 2 - # Molmo's LLM proj weights are already merged into the disk, while - # image_projector proj is separate. If the same proj_name were used, it - # would create ambiguity and make it difficult to support BNB and LoRA. - self.proj_name = proj_name - setattr( - self, proj_name, - MergedColumnParallelLinear( - input_dim or self.hidden_size, - [self.intermediate_size] * 2, - bias=False, - quant_config=quant_config, - )) + self.gate_up_proj = MergedColumnParallelLinear( + input_dim or self.hidden_size, + [self.intermediate_size] * 2, + bias=False, + quant_config=quant_config, + ) + # Activation function. + self.act_fn = SwiGLU() + # Feed-forward output projection. + self.down_proj = RowParallelLinear( + self.intermediate_size, + self.hidden_size, + bias=False, + quant_config=quant_config, + ) + + def forward( + self, + x: torch.Tensor, + ) -> torch.Tensor: + gate_up, _ = self.gate_up_proj(x) + x = self.act_fn(gate_up) + x, _ = self.down_proj(x) + return x + + +class ImageProjectorMLP(nn.Module): + """Molmo's image_projector mlp.""" + + def __init__( + self, + config: PretrainedConfig, + input_dim: Optional[int] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size // 2 + + self.merged_linear = MergedColumnParallelLinear( + input_dim or self.hidden_size, + [self.intermediate_size] * 2, + bias=False, + quant_config=quant_config, + ) # Activation function. self.act_fn = SiluAndMul() @@ -500,7 +541,7 @@ def forward( self, x: torch.Tensor, ) -> torch.Tensor: - gate_up, _ = getattr(self, self.proj_name)(x) + gate_up, _ = self.merged_linear(x) x = self.act_fn(gate_up) x, _ = self.down_proj(x) return x @@ -523,9 +564,7 @@ def __init__( prefix=f"{prefix}.self_attn") # MLP block. - self.mlp = MolmoMLP(config, - quant_config=quant_config, - proj_name="gate_up_proj") + self.mlp = LanuageModelMLP(config, quant_config=quant_config) # LayerNorm assert config.layer_norm_type == "rms" @@ -617,11 +656,10 @@ def __init__( vision_config, nlayers=len(self.vit_layers), quant_config=quant_config) - self.image_projector = MolmoMLP( + self.image_projector = ImageProjectorMLP( config, input_dim=vision_config.image_emb_dim, quant_config=quant_config, - proj_name="merged_linear", ) image_dim = vision_config.image_emb_dim * len(self.vit_layers) @@ -842,10 +880,6 @@ def load_weights(self, weights: Iterable[Tuple[str, loaded_params: Set[str] = set() for name, loaded_weight in weights: - if "gate_up_proj" in name: - up_proj, gate_proj = loaded_weight.chunk(2, dim=0) - loaded_weight = torch.cat([gate_proj, up_proj], dim=0) - if name.endswith(".bias") and name not in params_dict: continue if is_pp_missing_parameter(name, self): @@ -1157,6 +1191,12 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): }, ) + # BitandBytes specific attributes + bitsandbytes_stacked_params_mapping = { + "gate_proj": ("merged_linear", 0), + "up_proj": ("merged_linear", 1), + } + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config From dde1fa18c9f9ba992a8300a300543d6c18d5f08d Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 28 Dec 2024 03:45:13 +0800 Subject: [PATCH 009/462] [Misc] Improve BNB loader to handle mixture of sharded and merged weights with same suffix (#11566) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/model_loader/loader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 4bca13cb2f60c..a9c1fa7221217 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -1001,8 +1001,11 @@ def _get_bnb_target_modules(self, model: nn.Module) -> None: for sub_name in sub_modules: self.target_modules.append( name.replace(last_name, sub_name)) - else: - self.target_modules.append(name) + # Add original module name even if the module has stacked map, + # in case model has a mixture of disk-merged and disk-splitted + # weights with same last name. + self.target_modules.append(name) + assert (self.target_modules ), "vllm currently does not support BNB quantization for" f" {type(model).__name__}" From ac797994039ba9e6ed0c2b3a503099cb122a936e Mon Sep 17 00:00:00 2001 From: Selali Date: Fri, 27 Dec 2024 12:12:11 -0800 Subject: [PATCH 010/462] [Bugfix] Fix for ROCM compressed tensor support (#11561) --- .../schemes/compressed_tensors_w8a8_fp8.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 73cc8ce0d2a4b..1d4e4bd52adaa 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -41,10 +41,12 @@ def process_weights_after_loading(self, layer) -> None: ) if current_platform.is_rocm(): + input_scale = getattr(layer, 'input_scale', None) + weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( weight=weight, weight_scale=max_w_scale, - input_scale=layer.input_scale) + input_scale=input_scale) if input_scale is not None: layer.input_scale = Parameter(input_scale, requires_grad=False) @@ -57,11 +59,13 @@ def process_weights_after_loading(self, layer) -> None: weight = layer.weight if current_platform.is_rocm(): + input_scale = getattr(layer, 'input_scale', None) + weight, weight_scale, input_scale = \ normalize_e4m3fn_to_e4m3fnuz( weight=weight, weight_scale=layer.weight_scale, - input_scale=layer.input_scale) + input_scale=input_scale) if input_scale is not None: layer.input_scale = Parameter(input_scale, requires_grad=False) @@ -76,7 +80,7 @@ def process_weights_after_loading(self, layer) -> None: raise ValueError(f"Unknown quantization strategy {self.strategy}") # INPUT SCALE - if self.is_static_input_scheme: + if self.is_static_input_scheme and hasattr(layer, 'input_scale'): layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False) else: From a60731247fba82fae5e71af7a19ea0df96de1caa Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Sat, 28 Dec 2024 08:31:10 +0800 Subject: [PATCH 011/462] [Doc] Update mllama example based on official doc (#11567) Signed-off-by: Chen Zhang --- examples/offline_inference_vision_language.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index d5a71862656e7..77af914a6ef02 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -308,7 +308,20 @@ def run_mllama(question: str, modality: str): disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) - prompt = f"<|image|><|begin_of_text|>{question}" + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [{ + "role": + "user", + "content": [{ + "type": "image" + }, { + "type": "text", + "text": f"{question}" + }] + }] + prompt = tokenizer.apply_chat_template(messages, + add_generation_prompt=True, + tokenize=False) stop_token_ids = None return llm, prompt, stop_token_ids From df04dffade84c87cafd74de4c39e6fd7cb95c24f Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 27 Dec 2024 20:45:08 -0500 Subject: [PATCH 012/462] [V1] [4/N] API Server: ZMQ/MP Utilities (#11541) --- docs/requirements-docs.txt | 1 + tests/v1/engine/test_engine_core.py | 13 +-- tests/v1/engine/test_engine_core_client.py | 10 +- vllm/entrypoints/openai/api_server.py | 11 +- vllm/executor/multiproc_worker_utils.py | 22 +--- vllm/utils.py | 90 ++++++++++++++++- vllm/v1/engine/async_llm.py | 6 +- vllm/v1/engine/core.py | 111 ++++----------------- vllm/v1/engine/core_client.py | 92 ++++++++--------- vllm/v1/engine/llm_engine.py | 6 +- vllm/v1/executor/multiproc_executor.py | 11 +- vllm/v1/utils.py | 89 +++++++++++------ 12 files changed, 247 insertions(+), 215 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index 4859c8ac08bea..25a700033cc9e 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -19,3 +19,4 @@ openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entr fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args requests +zmq diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index c529cd21f384b..954cec734b956 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -7,7 +7,6 @@ from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform -from vllm.usage.usage_lib import UsageContext from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core import EngineCore @@ -43,13 +42,11 @@ def test_engine_core(monkeypatch): m.setenv("VLLM_USE_V1", "1") """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config( - usage_context=UsageContext.UNKNOWN_CONTEXT) + vllm_config = engine_args.create_engine_config() executor_class = AsyncLLM._get_executor_cls(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, - executor_class=executor_class, - usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class=executor_class) """Test basic request lifecycle.""" # First request. @@ -151,13 +148,11 @@ def test_engine_core_advanced_sampling(monkeypatch): m.setenv("VLLM_USE_V1", "1") """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) - vllm_config = engine_args.create_engine_config( - usage_context=UsageContext.UNKNOWN_CONTEXT) + vllm_config = engine_args.create_engine_config() executor_class = AsyncLLM._get_executor_cls(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, - executor_class=executor_class, - usage_context=UsageContext.UNKNOWN_CONTEXT) + executor_class=executor_class) """Test basic request lifecycle.""" # First request. request: EngineCoreRequest = make_request() diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 2f1cbec607a91..729975e4ea8c4 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -86,11 +86,10 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( - vllm_config, - executor_class, - UsageContext.UNKNOWN_CONTEXT, multiprocess_mode=multiprocessing_mode, asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, ) MAX_TOKENS = 20 @@ -158,11 +157,10 @@ async def test_engine_core_client_asyncio(monkeypatch): usage_context=UsageContext.UNKNOWN_CONTEXT) executor_class = AsyncLLM._get_executor_cls(vllm_config) client = EngineCoreClient.make_client( - vllm_config, - executor_class, - UsageContext.UNKNOWN_CONTEXT, multiprocess_mode=True, asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, ) MAX_TOKENS = 20 diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 2e45b474237f9..094cc15a317e9 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -68,7 +68,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address, set_ulimit) + is_valid_ipv6_address, kill_process_tree, set_ulimit) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -737,6 +737,15 @@ def signal_handler(*_) -> None: signal.signal(signal.SIGTERM, signal_handler) + # The child processes will send SIGQUIT to this process when + # any error happens. This process then clean up the whole tree. + # TODO(rob): move this into AsyncLLM.__init__ once we remove + # the context manager below. + def sigquit_handler(signum, frame): + kill_process_tree(os.getpid()) + + signal.signal(signal.SIGQUIT, sigquit_handler) + async with build_async_engine_client(args) as engine_client: app = build_app(args) diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py index c4d90f0856f86..bc32826529eef 100644 --- a/vllm/executor/multiproc_worker_utils.py +++ b/vllm/executor/multiproc_worker_utils.py @@ -1,5 +1,4 @@ import asyncio -import multiprocessing import os import sys import threading @@ -13,10 +12,9 @@ import torch -import vllm.envs as envs from vllm.logger import init_logger from vllm.triton_utils.importing import HAS_TRITON -from vllm.utils import cuda_is_initialized +from vllm.utils import _check_multiproc_method, get_mp_context if HAS_TRITON: from vllm.triton_utils import maybe_set_triton_cache_manager @@ -274,24 +272,6 @@ def write_with_prefix(s: str): file.write = write_with_prefix # type: ignore[method-assign] -def _check_multiproc_method(): - if (cuda_is_initialized() - and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): - logger.warning("CUDA was previously initialized. We must use " - "the `spawn` multiprocessing start method. Setting " - "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " - "See https://docs.vllm.ai/en/latest/getting_started/" - "debugging.html#python-multiprocessing " - "for more information.") - os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" - - -def get_mp_context(): - _check_multiproc_method() - mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD - return multiprocessing.get_context(mp_method) - - def set_multiprocessing_worker_envs(parallel_config): """ Set up environment variables that should be used when there are workers in a multiprocessing environment. This should be called by the parent diff --git a/vllm/utils.py b/vllm/utils.py index 5eb4e8c4180c4..2b46c1fef0d09 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -10,6 +10,7 @@ import importlib.util import inspect import ipaddress +import multiprocessing import os import re import resource @@ -20,6 +21,7 @@ import tempfile import threading import time +import traceback import uuid import warnings import weakref @@ -29,8 +31,9 @@ from dataclasses import dataclass, field from functools import lru_cache, partial, wraps from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable, - Dict, Generator, Generic, List, Literal, NamedTuple, - Optional, Tuple, Type, TypeVar, Union, overload) + Dict, Generator, Generic, Iterator, List, Literal, + NamedTuple, Optional, Tuple, Type, TypeVar, Union, + overload) from uuid import uuid4 import numpy as np @@ -39,6 +42,8 @@ import torch import torch.types import yaml +import zmq +import zmq.asyncio from packaging.version import Version from torch.library import Library from typing_extensions import ParamSpec, TypeIs, assert_never @@ -1844,7 +1849,7 @@ def memory_profiling( result.non_kv_cache_memory_in_bytes = result.non_torch_increase_in_bytes + result.torch_peak_increase_in_bytes + result.weights_memory_in_bytes # noqa -# Adapted from: https://github.com/sgl-project/sglang/blob/f46f394f4d4dbe4aae85403dec006199b34d2840/python/sglang/srt/utils.py#L630 # noqa: E501Curre +# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L630 # noqa: E501 def set_ulimit(target_soft_limit=65535): resource_type = resource.RLIMIT_NOFILE current_soft, current_hard = resource.getrlimit(resource_type) @@ -1859,3 +1864,82 @@ def set_ulimit(target_soft_limit=65535): "with error %s. This can cause fd limit errors like" "`OSError: [Errno 24] Too many open files`. Consider " "increasing with ulimit -n", current_soft, e) + + +# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/utils.py#L28 # noqa: E501 +def get_exception_traceback(): + etype, value, tb = sys.exc_info() + err_str = "".join(traceback.format_exception(etype, value, tb)) + return err_str + + +# Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501 +def make_zmq_socket( + ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] + path: str, + type: Any, +) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] + """Make a ZMQ socket with the proper bind/connect semantics.""" + + mem = psutil.virtual_memory() + socket = ctx.socket(type) + + # Calculate buffer size based on system memory + total_mem = mem.total / 1024**3 + available_mem = mem.available / 1024**3 + # For systems with substantial memory (>32GB total, >16GB available): + # - Set a large 0.5GB buffer to improve throughput + # For systems with less memory: + # - Use system default (-1) to avoid excessive memory consumption + if total_mem > 32 and available_mem > 16: + buf_size = int(0.5 * 1024**3) # 0.5GB in bytes + else: + buf_size = -1 # Use system default buffer size + + if type == zmq.constants.PULL: + socket.setsockopt(zmq.constants.RCVHWM, 0) + socket.setsockopt(zmq.constants.RCVBUF, buf_size) + socket.connect(path) + elif type == zmq.constants.PUSH: + socket.setsockopt(zmq.constants.SNDHWM, 0) + socket.setsockopt(zmq.constants.SNDBUF, buf_size) + socket.bind(path) + else: + raise ValueError(f"Unknown Socket Type: {type}") + + return socket + + +@contextlib.contextmanager +def zmq_socket_ctx( + path: str, + type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] + """Context manager for a ZMQ socket""" + + ctx = zmq.Context(io_threads=2) # type: ignore[attr-defined] + try: + yield make_zmq_socket(ctx, path, type) + + except KeyboardInterrupt: + logger.debug("Got Keyboard Interrupt.") + + finally: + ctx.destroy(linger=0) + + +def _check_multiproc_method(): + if (cuda_is_initialized() + and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"): + logger.warning("CUDA was previously initialized. We must use " + "the `spawn` multiprocessing start method. Setting " + "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " + "See https://docs.vllm.ai/en/latest/getting_started/" + "debugging.html#python-multiprocessing " + "for more information.") + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + +def get_mp_context(): + _check_multiproc_method() + mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD + return multiprocessing.get_context(mp_method) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ba2b8377759d6..da3da6dad6436 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -75,11 +75,11 @@ def __init__( # EngineCore (starts the engine in background process). self.engine_core = EngineCoreClient.make_client( - vllm_config=vllm_config, - executor_class=executor_class, - usage_context=usage_context, multiprocess_mode=True, asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=self.log_stats, ) self.output_handler: Optional[asyncio.Task] = None diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 0aef61fc7f680..5840541d774ba 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -3,20 +3,19 @@ import signal import threading import time -from dataclasses import dataclass -from multiprocessing.process import BaseProcess +from multiprocessing.connection import Connection from typing import List, Tuple, Type +import psutil import zmq import zmq.asyncio from msgspec import msgpack from vllm.config import CacheConfig, VllmConfig -from vllm.executor.multiproc_worker_utils import get_mp_context from vllm.logger import init_logger from vllm.transformers_utils.config import ( maybe_register_config_serialize_by_value) -from vllm.usage.usage_lib import UsageContext +from vllm.utils import get_exception_traceback, zmq_socket_ctx from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, @@ -25,14 +24,13 @@ from vllm.v1.executor.abstract import Executor from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder -from vllm.v1.utils import make_zmq_socket from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) POLLING_TIMEOUT_MS = 5000 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 -LOGGING_TIME_S = POLLING_TIMEOUT_S +LOGGING_TIME_S = 5 class EngineCore: @@ -42,9 +40,10 @@ def __init__( self, vllm_config: VllmConfig, executor_class: Type[Executor], - usage_context: UsageContext, + log_stats: bool = False, ): assert vllm_config.model_config.runner_type != "pooling" + self.log_stats = log_stats logger.info("Initializing an LLM engine (v%s) with config: %s", VLLM_VERSION, vllm_config) @@ -134,29 +133,19 @@ def profile(self, is_start: bool = True): self.model_executor.profile(is_start) -@dataclass -class EngineCoreProcHandle: - proc: BaseProcess - ready_path: str - input_path: str - output_path: str - - class EngineCoreProc(EngineCore): """ZMQ-wrapper for running EngineCore in background process.""" - READY_STR = "READY" - def __init__( self, - vllm_config: VllmConfig, - executor_class: Type[Executor], - usage_context: UsageContext, input_path: str, output_path: str, - ready_path: str, + ready_pipe: Connection, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False, ): - super().__init__(vllm_config, executor_class, usage_context) + super().__init__(vllm_config, executor_class, log_stats) # Background Threads and Queues for IO. These enable us to # overlap ZMQ socket IO with GPU since they release the GIL, @@ -173,68 +162,7 @@ def __init__( daemon=True).start() # Send Readiness signal to EngineClient. - with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: - ready_socket.send_string(EngineCoreProc.READY_STR) - - @staticmethod - def wait_for_startup( - proc: BaseProcess, - ready_path: str, - ) -> None: - """Wait until the EngineCore is ready.""" - - try: - sync_ctx = zmq.Context() # type: ignore[attr-defined] - socket = sync_ctx.socket(zmq.constants.PULL) - socket.connect(ready_path) - - # Wait for EngineCore to send EngineCoreProc.READY_STR. - while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: - logger.debug("Waiting for EngineCoreProc to startup.") - - if not proc.is_alive(): - raise RuntimeError("EngineCoreProc failed to start.") - - message = socket.recv_string() - assert message == EngineCoreProc.READY_STR - - except BaseException as e: - logger.exception(e) - raise e - - finally: - sync_ctx.destroy(linger=0) - - @staticmethod - def make_engine_core_process( - vllm_config: VllmConfig, - executor_class: Type[Executor], - usage_context: UsageContext, - input_path: str, - output_path: str, - ready_path: str, - ) -> EngineCoreProcHandle: - context = get_mp_context() - - process_kwargs = { - "input_path": input_path, - "output_path": output_path, - "ready_path": ready_path, - "vllm_config": vllm_config, - "executor_class": executor_class, - "usage_context": usage_context, - } - # Run EngineCore busy loop in background process. - proc = context.Process(target=EngineCoreProc.run_engine_core, - kwargs=process_kwargs) - proc.start() - - # Wait for startup - EngineCoreProc.wait_for_startup(proc, ready_path) - return EngineCoreProcHandle(proc=proc, - ready_path=ready_path, - input_path=input_path, - output_path=output_path) + ready_pipe.send({"status": "READY"}) @staticmethod def run_engine_core(*args, **kwargs): @@ -258,6 +186,7 @@ def signal_handler(signum, frame): signal.signal(signal.SIGTERM, signal_handler) signal.signal(signal.SIGINT, signal_handler) + parent_process = psutil.Process().parent() engine_core = None try: engine_core = EngineCoreProc(*args, **kwargs) @@ -266,9 +195,10 @@ def signal_handler(signum, frame): except SystemExit: logger.debug("EngineCore interrupted.") - except BaseException as e: - logger.exception(e) - raise e + except Exception: + traceback = get_exception_traceback() + logger.error("EngineCore hit an exception: %s", traceback) + parent_process.send_signal(signal.SIGQUIT) finally: if engine_core is not None: @@ -309,6 +239,9 @@ def run_busy_loop(self): def _log_stats(self): """Log basic stats every LOGGING_TIME_S""" + if not self.log_stats: + return + now = time.time() if now - self._last_logging_time > LOGGING_TIME_S: @@ -339,7 +272,7 @@ def process_input_socket(self, input_path: str): decoder_add_req = PickleEncoder() decoder_abort_req = PickleEncoder() - with make_zmq_socket(input_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: while True: # (RequestType, RequestData) type_frame, data_frame = socket.recv_multipart(copy=False) @@ -367,7 +300,7 @@ def process_output_socket(self, output_path: str): # Reuse send buffer. buffer = bytearray() - with make_zmq_socket(output_path, zmq.constants.PUSH) as socket: + with zmq_socket_ctx(output_path, zmq.constants.PUSH) as socket: while True: engine_core_outputs = self.output_queue.get() outputs = EngineCoreOutputs(outputs=engine_core_outputs) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index d56fcbdb1e7c4..beb5d57c20c83 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,19 +1,19 @@ -import os -import weakref -from typing import List, Optional +from typing import List, Optional, Type import msgspec import zmq import zmq.asyncio +from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import get_open_zmq_ipc_path, kill_process_tree +from vllm.utils import get_open_zmq_ipc_path from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) -from vllm.v1.engine.core import (EngineCore, EngineCoreProc, - EngineCoreProcHandle) +from vllm.v1.engine.core import EngineCore, EngineCoreProc +from vllm.v1.executor.abstract import Executor from vllm.v1.serial_utils import PickleEncoder +from vllm.v1.utils import BackgroundProcHandle logger = init_logger(__name__) @@ -31,10 +31,11 @@ class EngineCoreClient: @staticmethod def make_client( - *args, multiprocess_mode: bool, asyncio_mode: bool, - **kwargs, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False, ) -> "EngineCoreClient": # TODO: support this for debugging purposes. @@ -44,12 +45,12 @@ def make_client( "is not currently supported.") if multiprocess_mode and asyncio_mode: - return AsyncMPClient(*args, **kwargs) + return AsyncMPClient(vllm_config, executor_class, log_stats) if multiprocess_mode and not asyncio_mode: - return SyncMPClient(*args, **kwargs) + return SyncMPClient(vllm_config, executor_class, log_stats) - return InprocClient(*args, **kwargs) + return InprocClient(vllm_config, executor_class, log_stats) def shutdown(self): pass @@ -128,9 +129,10 @@ class MPClient(EngineCoreClient): def __init__( self, - *args, asyncio_mode: bool, - **kwargs, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False, ): # Serialization setup. self.encoder = PickleEncoder() @@ -143,7 +145,6 @@ def __init__( self.ctx = zmq.Context() # type: ignore[attr-defined] # Path for IPC. - ready_path = get_open_zmq_ipc_path() output_path = get_open_zmq_ipc_path() input_path = get_open_zmq_ipc_path() @@ -156,47 +157,40 @@ def __init__( self.input_socket.bind(input_path) # Start EngineCore in background process. - self.proc_handle: Optional[EngineCoreProcHandle] - self.proc_handle = EngineCoreProc.make_engine_core_process( - *args, - input_path= - input_path, # type: ignore[misc] # MyPy incorrectly flags duplicate keywords - output_path=output_path, # type: ignore[misc] - ready_path=ready_path, # type: ignore[misc] - **kwargs, - ) - self._finalizer = weakref.finalize(self, self.shutdown) + self.proc_handle: Optional[BackgroundProcHandle] + self.proc_handle = BackgroundProcHandle( + input_path=input_path, + output_path=output_path, + process_name="EngineCore", + target_fn=EngineCoreProc.run_engine_core, + process_kwargs={ + "vllm_config": vllm_config, + "executor_class": executor_class, + "log_stats": log_stats, + }) def shutdown(self): # Shut down the zmq context. self.ctx.destroy(linger=0) if hasattr(self, "proc_handle") and self.proc_handle: - # Shutdown the process if needed. - if self.proc_handle.proc.is_alive(): - self.proc_handle.proc.terminate() - self.proc_handle.proc.join(5) - - if self.proc_handle.proc.is_alive(): - kill_process_tree(self.proc_handle.proc.pid) - - # Remove zmq ipc socket files - ipc_sockets = [ - self.proc_handle.ready_path, self.proc_handle.output_path, - self.proc_handle.input_path - ] - for ipc_socket in ipc_sockets: - socket_file = ipc_socket.replace("ipc://", "") - if os and os.path.exists(socket_file): - os.remove(socket_file) + self.proc_handle.shutdown() self.proc_handle = None class SyncMPClient(MPClient): """Synchronous client for multi-proc EngineCore.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, asyncio_mode=False, **kwargs) + def __init__(self, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False): + super().__init__( + asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=log_stats, + ) def get_output(self) -> List[EngineCoreOutput]: @@ -225,8 +219,16 @@ def profile(self, is_start: bool = True) -> None: class AsyncMPClient(MPClient): """Asyncio-compatible client for multi-proc EngineCore.""" - def __init__(self, *args, **kwargs): - super().__init__(*args, asyncio_mode=True, **kwargs) + def __init__(self, + vllm_config: VllmConfig, + executor_class: Type[Executor], + log_stats: bool = False): + super().__init__( + asyncio_mode=True, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=log_stats, + ) async def get_output_async(self) -> List[EngineCoreOutput]: diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index b58f62778ffe9..fc323184abc8f 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -72,11 +72,11 @@ def __init__( # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( - vllm_config, - executor_class, - usage_context, multiprocess_mode=multiprocess_mode, asyncio_mode=False, + vllm_config=vllm_config, + executor_class=executor_class, + log_stats=False, ) @classmethod diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 128101aa6956d..ed64e7741390d 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -17,13 +17,12 @@ from vllm.distributed.device_communicators.shm_broadcast import (Handle, MessageQueue) from vllm.executor.multiproc_worker_utils import ( - _add_prefix, get_mp_context, set_multiprocessing_worker_envs) + _add_prefix, set_multiprocessing_worker_envs) from vllm.logger import init_logger -from vllm.utils import (get_distributed_init_method, get_open_port, - get_open_zmq_ipc_path) +from vllm.utils import (get_distributed_init_method, get_mp_context, + get_open_port, get_open_zmq_ipc_path, zmq_socket_ctx) from vllm.v1.executor.abstract import Executor from vllm.v1.outputs import ModelRunnerOutput -from vllm.v1.utils import make_zmq_socket from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -250,7 +249,7 @@ def __init__( worker_response_mq_handle = self.worker_response_mq.export_handle() # Send Readiness signal to EngineCore process. - with make_zmq_socket(ready_path, zmq.constants.PUSH) as ready_socket: + with zmq_socket_ctx(ready_path, zmq.constants.PUSH) as ready_socket: payload = pickle.dumps(worker_response_mq_handle, protocol=pickle.HIGHEST_PROTOCOL) ready_socket.send_string(WorkerProc.READY_STR) @@ -352,7 +351,7 @@ def wait_for_startup( ready_path: str, ) -> Optional[Handle]: """Wait until the Worker is ready.""" - with make_zmq_socket(ready_path, zmq.constants.PULL) as socket: + with zmq_socket_ctx(ready_path, zmq.constants.PULL) as socket: # Wait for Worker to send READY. while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index e802c6439b740..19e0dd17237c9 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,11 +1,11 @@ +import os +import weakref from collections.abc import Sequence -from contextlib import contextmanager -from typing import (Any, Generic, Iterator, List, Optional, TypeVar, Union, - overload) - -import zmq +from typing import (Any, Callable, Dict, Generic, List, Optional, TypeVar, + Union, overload) from vllm.logger import init_logger +from vllm.utils import get_mp_context, kill_process_tree logger = init_logger(__name__) @@ -77,27 +77,58 @@ def __len__(self): return len(self._x) -@contextmanager -def make_zmq_socket( - path: str, - type: Any) -> Iterator[zmq.Socket]: # type: ignore[name-defined] - """Context manager for a ZMQ socket""" - - ctx = zmq.Context() # type: ignore[attr-defined] - try: - socket = ctx.socket(type) - - if type == zmq.constants.PULL: - socket.connect(path) - elif type == zmq.constants.PUSH: - socket.bind(path) - else: - raise ValueError(f"Unknown Socket Type: {type}") - - yield socket - - except KeyboardInterrupt: - logger.debug("Worker had Keyboard Interrupt.") - - finally: - ctx.destroy(linger=0) +class BackgroundProcHandle: + """ + Utility class to handle creation, readiness, and shutdown + of background processes used by the AsyncLLM and LLMEngine. + """ + + def __init__( + self, + input_path: str, + output_path: str, + process_name: str, + target_fn: Callable, + process_kwargs: Dict[Any, Any], + ): + self._finalizer = weakref.finalize(self, self.shutdown) + + context = get_mp_context() + reader, writer = context.Pipe(duplex=False) + + assert ("ready_pipe" not in process_kwargs + and "input_path" not in process_kwargs + and "output_path" not in process_kwargs) + process_kwargs["ready_pipe"] = writer + process_kwargs["input_path"] = input_path + process_kwargs["output_path"] = output_path + self.input_path = input_path + self.output_path = output_path + + # Run Detokenizer busy loop in background process. + self.proc = context.Process(target=target_fn, kwargs=process_kwargs) + self.proc.start() + + # Wait for startup. + if reader.recv()["status"] != "READY": + raise RuntimeError(f"{process_name} initialization failed. " + "See root cause above.") + + def __del__(self): + self.shutdown() + + def shutdown(self): + # Shutdown the process if needed. + if hasattr(self, "proc") and self.proc.is_alive(): + self.proc.terminate() + self.proc.join(5) + + if self.proc.is_alive(): + kill_process_tree(self.proc.pid) + + # Remove zmq ipc socket files + ipc_sockets = [self.output_path, self.input_path] + for ipc_socket in ipc_sockets: + socket_file = ipc_socket.replace("ipc://", "") + if os and os.path.exists(socket_file): + os.remove(socket_file) From b5cbe8eeb30e86c8477d91c66f5c7a10e4ee754b Mon Sep 17 00:00:00 2001 From: Rajveer Bachkaniwala <46040700+rajveerb@users.noreply.github.com> Date: Fri, 27 Dec 2024 22:34:46 -0500 Subject: [PATCH 013/462] [Bugfix] Last token measurement fix (#11376) Signed-off-by: rajveerb <46040700+rajveerb@users.noreply.github.com> Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- vllm/engine/llm_engine.py | 8 ++++++-- vllm/sequence.py | 24 ++++++++++++++---------- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 39f59e55da1f7..1db3e59ff3bae 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1124,6 +1124,8 @@ def _process_model_outputs(self, seq_group = scheduled_seq_group.seq_group seq_group.maybe_set_first_token_time(now) + if not seq_group.is_prefill(): + seq_group.set_last_token_time(now) request_output = RequestOutputFactory.create( seq_group, self.seq_id_to_seq_group, @@ -1166,6 +1168,8 @@ def _process_model_outputs(self, seq_group = scheduled_seq_group.seq_group seq_group.maybe_set_first_token_time(now) + if not seq_group.is_prefill(): + seq_group.set_last_token_time(now) request_output = RequestOutputFactory.create( seq_group, self.seq_id_to_seq_group, @@ -1686,7 +1690,7 @@ def _get_stats(self, # If the seq_group just finished the prefill state # get TTFT. if not seq_group.is_prefill(): - latency = seq_group.get_last_latency(now) + latency = seq_group.get_last_token_latency() time_to_first_tokens_iter.append(latency) # One generation token per finished prefill. @@ -1694,7 +1698,7 @@ def _get_stats(self, seq_group.num_seqs()) else: # TPOTs. - latency = seq_group.get_last_latency(now) + latency = seq_group.get_last_token_latency() time_per_output_tokens_iter.append(latency) if seq_group.state.current_step == 0: # For async_output_proc, the do_log_stats() diff --git a/vllm/sequence.py b/vllm/sequence.py index cc3d96fc93a79..34f910d47b7d9 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -667,6 +667,7 @@ def __init__( first_scheduled_time=None, first_token_time=None, time_in_queue=None) + self.last_token_latency = 0.0 self.lora_request = lora_request self.prompt_logprobs: Optional[PromptLogprobs] = None self.state = SequenceGroupState() @@ -762,18 +763,21 @@ def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int, assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill self.init_multi_step(num_steps=num_lookahead_slots + 1) - def get_last_latency(self, now: float) -> float: + def set_last_token_time(self, now: float) -> None: """Sets the last token time for Request level timings.""" - # If still in prefill phase, raise Error. - if self.is_prefill(): - raise ValueError( - "seq_group.get_last_latency() should not be called " - "if the seq_group is in prefill phase.") - - # Otherwise return token latency. - latency = now - self.metrics.last_token_time + # If still in prefill phase, assertion fails. + assert not self.is_prefill(), ( + "seq_group.set_last_token_time() should not be called " + "if the seq_group is in prefill phase.") + self.last_token_latency = now - self.metrics.last_token_time self.metrics.last_token_time = now - return latency + + def get_last_token_latency(self) -> float: + """Returns the latency of the last token.""" + assert not self.is_prefill(), ( + "seq_group.get_last_token_latency() should not be called " + "if the seq_group is in prefill phase.") + return self.last_token_latency def maybe_set_first_token_time(self, time: float) -> None: """Sets the first token time for Request level timings.""" From d34be24bb196cb0cce167257c97449f0cd6858f7 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Sat, 28 Dec 2024 14:14:10 +0800 Subject: [PATCH 014/462] [Model] Support InternLM2 Reward models (#11571) Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Cyrus Leung --- docs/source/models/supported_models.md | 5 +++ tests/models/registry.py | 2 + vllm/model_executor/models/internlm2.py | 60 ++++++++++++++++++++++++- vllm/model_executor/models/registry.py | 1 + 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 7acafda50793c..fa7102cd88063 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -450,6 +450,11 @@ of the whole prompt are extracted from the normalized hidden state corresponding - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`InternLM2ForRewardModel` + - InternLM2-based + - :code:`internlm/internlm2-1_8b-reward`, :code:`internlm/internlm2-7b-reward`, etc. + - ✅︎ + - ✅︎ * - :code:`LlamaForCausalLM` - Llama-based - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. diff --git a/tests/models/registry.py b/tests/models/registry.py index f5a37420a2909..e5dfb2822745d 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -140,6 +140,8 @@ class _HfExamplesInfo: "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"), + "InternLM2ForRewardModel": _HfExamplesInfo("internlm/internlm2-1_8b-reward", + trust_remote_code=True), "JambaForSequenceClassification": _HfExamplesInfo("ai21labs/Jamba-tiny-reward-dev"), # noqa: E501 "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 41b9f110d771f..28c23edd4c8e8 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -18,14 +18,16 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, @@ -433,3 +435,59 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + +class InternLM2ForRewardModel(InternLM2ForCausalLM): + + def __init__( + self, + *, + vllm_config: VllmConfig, + prefix: str = "", + model_type: Type[InternLM2Model] = InternLM2Model, + ): + super().__init__(vllm_config=vllm_config, + prefix=prefix, + model_type=model_type) + + for attr in ("output", "logits_processor", "sampler"): + delattr(self, attr) + + config = vllm_config.model_config.hf_config + self.v_head = RowParallelLinear( + config.hidden_size, + 1, + bias=False, + input_is_parallel=False, + prefix=maybe_prefix(prefix, "v_head"), + ) + + pooler_config = vllm_config.model_config.pooler_config + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.ALL, + normalize=False, + softmax=False, + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model(input_ids, positions, kv_caches, + attn_metadata, intermediate_tensors, + inputs_embeds) + logits, _ = self.v_head(hidden_states) + return logits + + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 89992de7e238d..67268eb4bb85f 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -113,6 +113,7 @@ "Gemma2Model": ("gemma2", "Gemma2ForCausalLM"), "GlmForCausalLM": ("glm", "GlmForCausalLM"), "GritLM": ("gritlm", "GritLM"), + "InternLM2ForRewardModel": ("internlm2", "InternLM2ForRewardModel"), "JambaForSequenceClassification": ("jamba", "JambaForSequenceClassification"), # noqa: E501 "LlamaModel": ("llama", "LlamaForCausalLM"), **{ From b7dcc003dc1ace7605946d52b7e077ba1d3bbe86 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 28 Dec 2024 02:54:23 -0800 Subject: [PATCH 015/462] [Model] Remove hardcoded image tokens ids from Pixtral (#11582) Signed-off-by: Roger Wang --- vllm/model_executor/models/pixtral.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index f3d66c2313198..22d29f5bbc50c 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -45,13 +45,6 @@ except ImportError: USE_XFORMERS_OPS = False -# These token ids cannot be retrieved from model config -# so we hardcode them here. -PIXTRAL_12B_IMAGE_BREAK_ID = 12 -PIXTRAL_12B_IMAGE_END_ID = 13 -PIXTRAL_LARGE_IMAGE_BREAK_ID = 14 -PIXTRAL_LARGE_IMAGE_END_ID = 15 - def get_max_pixtral_image_tokens(ctx: InputContext): tokenizer = cached_get_tokenizer( @@ -201,6 +194,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): if key in dataclass_fields } + if not ("image_break_token_id" in vision_args + and "image_end_token_id" in vision_args): + raise ValueError( + "'image_break_token_id' and 'image_end_token_id' not found " + "in the vision_encoder arguments. Please download the latest " + "version of 'params.json' from the model repository.") + self.vision_args = VisionEncoderArgs(**vision_args) # init MistralForCausalLM @@ -240,9 +240,8 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: # NOTE: Image embeddings are split into separate tensors for each image # by the indices of `[IMG_END]` token. - image_end_condition = (image_tokens == PIXTRAL_12B_IMAGE_END_ID) | ( - image_tokens == PIXTRAL_LARGE_IMAGE_END_ID) - split_indices = torch.where(image_end_condition)[0] + 1 + image_end_mask = image_tokens == self.vision_args.image_end_token_id + split_indices = torch.where(image_end_mask)[0] + 1 if len(split_indices) <= 1: # Do not split, return as tensor of shape [1, fs, hs] return image_embeds.unsqueeze(0) @@ -265,10 +264,8 @@ def get_input_embeddings( inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, [ self.vision_args.image_token_id, - PIXTRAL_12B_IMAGE_END_ID, - PIXTRAL_12B_IMAGE_BREAK_ID, - PIXTRAL_LARGE_IMAGE_BREAK_ID, - PIXTRAL_LARGE_IMAGE_END_ID, + self.vision_args.image_break_token_id, + self.vision_args.image_end_token_id, ]) return inputs_embeds @@ -409,6 +406,8 @@ class VisionEncoderArgs: num_attention_heads: int rope_theta: float # for rope-2D image_token_id: int + image_break_token_id: int + image_end_token_id: int adapter_bias: bool = True From 59d6bb4c863e511e58799efac847065c28c52c8b Mon Sep 17 00:00:00 2001 From: hj-wei Date: Sat, 28 Dec 2024 19:17:35 +0800 Subject: [PATCH 016/462] [Hardware][AMD]: Replace HIPCC version with more precise ROCm version (#11515) Signed-off-by: hjwei --- setup.py | 52 +++++++++++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/setup.py b/setup.py index 61d2d710aa20e..ba6953dbdc174 100644 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +import ctypes import importlib.util import logging import os @@ -13,7 +14,7 @@ from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext from setuptools_scm import get_version -from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME def load_module_from_path(module_name, path): @@ -379,25 +380,31 @@ def _build_custom_ops() -> bool: return _is_cuda() or _is_hip() or _is_cpu() -def get_hipcc_rocm_version(): - # Run the hipcc --version command - result = subprocess.run(['hipcc', '--version'], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True) +def get_rocm_version(): + # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so + # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21 + try: + librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so" + if not librocm_core_file.is_file(): + return None + librocm_core = ctypes.CDLL(librocm_core_file) + VerErrors = ctypes.c_uint32 + get_rocm_core_version = librocm_core.getROCmVersion + get_rocm_core_version.restype = VerErrors + get_rocm_core_version.argtypes = [ + ctypes.POINTER(ctypes.c_uint32), + ctypes.POINTER(ctypes.c_uint32), + ctypes.POINTER(ctypes.c_uint32), + ] + major = ctypes.c_uint32() + minor = ctypes.c_uint32() + patch = ctypes.c_uint32() - # Check if the command was executed successfully - if result.returncode != 0: - print("Error running 'hipcc --version'") + if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor), + ctypes.byref(patch)) == 0): + return "%d.%d.%d" % (major.value, minor.value, patch.value) return None - - # Extract the version using a regular expression - match = re.search(r'HIP version: (\S+)', result.stdout) - if match: - # Return the version string - return match.group(1) - else: - print("Could not find HIP version in the output") + except Exception: return None @@ -479,11 +486,10 @@ def get_vllm_version() -> str: if "sdist" not in sys.argv: version += f"{sep}cu{cuda_version_str}" elif _is_hip(): - # Get the HIP version - hipcc_version = get_hipcc_rocm_version() - if hipcc_version != MAIN_CUDA_VERSION: - rocm_version_str = hipcc_version.replace(".", "")[:3] - version += f"{sep}rocm{rocm_version_str}" + # Get the Rocm Version + rocm_version = get_rocm_version() or torch.version.hip + if rocm_version and rocm_version != MAIN_CUDA_VERSION: + version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}" elif _is_neuron(): # Get the Neuron version neuron_version = str(get_neuronxcc_version()) From 42bb201fd6f79d6ed2e28e0263ffa891cd993c4c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 28 Dec 2024 22:33:12 +0900 Subject: [PATCH 017/462] [V1][Minor] Set pin_memory=False for token_ids_cpu tensor (#11581) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_input_batch.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 6c4d300ec6efe..e79145300fe06 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -57,11 +57,13 @@ def __init__( # TODO(woosuk): This buffer could be too large if max_model_len is big. # Find a way to reduce the CPU memory usage. + # This buffer is not directly transferred to the GPU, so it does not + # need to be pinned. self.token_ids_cpu_tensor = torch.zeros( (max_num_reqs, max_model_len), device="cpu", dtype=torch.int32, - pin_memory=pin_memory, + pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) From d427e5cfda8d2536b81e6021128e71b2dbc281aa Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 28 Dec 2024 21:53:59 +0800 Subject: [PATCH 018/462] [Doc] Minor documentation fixes (#11580) Signed-off-by: DarkLight1337 --- docs/source/contributing/dockerfile/dockerfile.md | 6 +++--- docs/source/contributing/overview.md | 2 +- docs/source/getting_started/arm-installation.md | 2 +- docs/source/getting_started/cpu-installation.md | 4 ++-- docs/source/getting_started/gaudi-installation.md | 8 +++++--- docs/source/getting_started/neuron-installation.md | 2 +- docs/source/getting_started/quickstart.md | 4 ++-- docs/source/getting_started/tpu-installation.md | 2 +- docs/source/models/supported_models.md | 6 +++--- docs/source/serving/deploying_with_cerebrium.md | 6 +++--- docs/source/serving/deploying_with_dstack.md | 2 +- docs/source/serving/distributed_serving.md | 6 +++--- docs/source/serving/runai_model_streamer.md | 2 +- 13 files changed, 27 insertions(+), 25 deletions(-) diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/source/contributing/dockerfile/dockerfile.md index 6535414a7dca4..7ffec83333d7d 100644 --- a/docs/source/contributing/dockerfile/dockerfile.md +++ b/docs/source/contributing/dockerfile/dockerfile.md @@ -11,11 +11,11 @@ Below is a visual representation of the multi-stage Dockerfile. The build graph The edges of the build graph represent: -- FROM ... dependencies (with a solid line and a full arrow head) +- `FROM ...` dependencies (with a solid line and a full arrow head) -- COPY --from=... dependencies (with a dashed line and an empty arrow head) +- `COPY --from=...` dependencies (with a dashed line and an empty arrow head) -- RUN --mount=(.\*)from=... dependencies (with a dotted line and an empty diamond arrow head) +- `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head) > ```{figure} ../../assets/dev/dockerfile-stages-dependency.png > :align: center diff --git a/docs/source/contributing/overview.md b/docs/source/contributing/overview.md index 9dac41cff0bcb..c960790f47a13 100644 --- a/docs/source/contributing/overview.md +++ b/docs/source/contributing/overview.md @@ -34,7 +34,7 @@ pytest tests/ ``` ```{note} -Currently, the repository does not pass the `mypy` tests. +Currently, the repository is not fully checked by `mypy`. ``` # Contribution Guidelines diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/arm-installation.md index de807e198b4f6..799b597b3ad5d 100644 --- a/docs/source/getting_started/arm-installation.md +++ b/docs/source/getting_started/arm-installation.md @@ -20,7 +20,7 @@ Contents: ## Requirements - **Operating System**: Linux or macOS -- **Compiler**: gcc/g++ >= 12.3.0 (optional, but recommended) +- **Compiler**: `gcc/g++ >= 12.3.0` (optional, but recommended) - **Instruction Set Architecture (ISA)**: NEON support is required (arm-backend-quick-start-dockerfile)= diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/cpu-installation.md index b6f181ace6274..c3d3f715ed804 100644 --- a/docs/source/getting_started/cpu-installation.md +++ b/docs/source/getting_started/cpu-installation.md @@ -24,7 +24,7 @@ Table of contents: ## Requirements - OS: Linux -- Compiler: gcc/g++>=12.3.0 (optional, recommended) +- Compiler: `gcc/g++>=12.3.0` (optional, recommended) - Instruction set architecture (ISA) requirement: AVX512 (optional, recommended) (cpu-backend-quick-start-dockerfile)= @@ -69,7 +69,7 @@ $ VLLM_TARGET_DEVICE=cpu python setup.py install ```{note} - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. -- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. +- If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building. ``` (env-intro)= diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md index acf42f210dffb..447bf98084a5d 100644 --- a/docs/source/getting_started/gaudi-installation.md +++ b/docs/source/getting_started/gaudi-installation.md @@ -167,6 +167,8 @@ Currently in vLLM for HPU we support four execution modes, depending on selected In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode. ``` +(gaudi-bucketing-mechanism)= + ### Bucketing mechanism Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution. @@ -185,7 +187,7 @@ INFO 08-01 21:37:59 hpu_model_runner.py:504] Decode bucket config (min, step, ma INFO 08-01 21:37:59 hpu_model_runner.py:509] Generated 48 decode buckets: [(1, 128), (1, 256), (1, 384), (1, 512), (1, 640), (1, 768), (1, 896), (1, 1024), (1, 1152), (1, 1280), (1, 1408), (1, 1536), (1, 1664), (1, 1792), (1, 1920), (1, 2048), (2, 128), (2, 256), (2, 384), (2, 512), (2, 640), (2, 768), (2, 896), (2, 1024), (2, 1152), (2, 1280), (2, 1408), (2, 1536), (2, 1664), (2, 1792), (2, 1920), (2, 2048), (4, 128), (4, 256), (4, 384), (4, 512), (4, 640), (4, 768), (4, 896), (4, 1024), (4, 1152), (4, 1280), (4, 1408), (4, 1536), (4, 1664), (4, 1792), (4, 1920), (4, 2048)] ``` -`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling - `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. +`min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, interval between `min` and `step` has special handling -- `min` gets multiplied by consecutive powers of two, until `step` gets reached. We call this the ramp-up phase and it is used for handling lower batch sizes with minimum wastage, while allowing larger padding on larger batch sizes. Example (with ramp-up) @@ -214,7 +216,7 @@ If a request exceeds maximum bucket size in any dimension, it will be processed As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket. ```{note} -Bucketing is transparent to a client - padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. +Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests. ``` ### Warmup @@ -235,7 +237,7 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB ``` -This example uses the same buckets as in *Bucketing mechanism* section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. +This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations. ```{tip} Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment. diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/neuron-installation.md index d6de5760cc82c..baaeeb9f53a10 100644 --- a/docs/source/getting_started/neuron-installation.md +++ b/docs/source/getting_started/neuron-installation.md @@ -26,7 +26,7 @@ Installation steps: (build-from-source-neuron)= ```{note} -The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with vLLM >= 0.5.3. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. +The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel. ``` ## Build from source diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 165e5df146dcd..9c8b7e4f592c9 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -114,7 +114,7 @@ $ "temperature": 0 $ }' ``` -Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` python package: +Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the `openai` Python package: ```python from openai import OpenAI @@ -151,7 +151,7 @@ $ ] $ }' ``` -Alternatively, you can use the `openai` python package: +Alternatively, you can use the `openai` Python package: ```python from openai import OpenAI diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md index f2a949e7247d8..17eded4a51fec 100644 --- a/docs/source/getting_started/tpu-installation.md +++ b/docs/source/getting_started/tpu-installation.md @@ -103,7 +103,7 @@ Connect to your TPU using SSH: gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE ``` -Install Miniconda +Install Miniconda: ```bash wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index fa7102cd88063..f6e00fa71a310 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -435,7 +435,7 @@ despite being described otherwise on its model card. ``` If your model is not in the above list, we will try to automatically convert the model using -:func:`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings +{func}`vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings of the whole prompt are extracted from the normalized hidden state corresponding to the last token. #### Reward Modeling (`--task reward`) @@ -468,7 +468,7 @@ of the whole prompt are extracted from the normalized hidden state corresponding ``` If your model is not in the above list, we will try to automatically convert the model using -:func:`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. +{func}`vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly. ```{important} For process-supervised reward models such as {code}`peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly, @@ -500,7 +500,7 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 ``` If your model is not in the above list, we will try to automatically convert the model using -:func:`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. +{func}`vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token. #### Sentence Pair Scoring (`--task score`) diff --git a/docs/source/serving/deploying_with_cerebrium.md b/docs/source/serving/deploying_with_cerebrium.md index 4863936236119..950064c8c1b10 100644 --- a/docs/source/serving/deploying_with_cerebrium.md +++ b/docs/source/serving/deploying_with_cerebrium.md @@ -33,7 +33,7 @@ docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" vllm = "latest" ``` -Next, let us add our code to handle inference for the LLM of your choice(`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your main.py\`: +Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`: ```python from vllm import LLM, SamplingParams @@ -55,13 +55,13 @@ def run(prompts: list[str], temperature: float = 0.8, top_p: float = 0.95): return {"results": results} ``` -Then, run the following code to deploy it to the cloud +Then, run the following code to deploy it to the cloud: ```console $ cerebrium deploy ``` -If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case /run) +If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case` /run`) ```python curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \ diff --git a/docs/source/serving/deploying_with_dstack.md b/docs/source/serving/deploying_with_dstack.md index 65ef1c0016208..381f5f786ca2c 100644 --- a/docs/source/serving/deploying_with_dstack.md +++ b/docs/source/serving/deploying_with_dstack.md @@ -25,7 +25,7 @@ $ cd vllm-dstack $ dstack init ``` -Next, to provision a VM instance with LLM of your choice(`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: +Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`: ```yaml type: service diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index c0a4b23f6dc70..7446b7c84cf46 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -8,7 +8,7 @@ Before going into the details of distributed inference and serving, let's first - **Single GPU (no distributed inference)**: If your model fits in a single GPU, you probably don't need to use distributed inference. Just use the single GPU to run the inference. - **Single-Node Multi-GPU (tensor parallel inference)**: If your model is too large to fit in a single GPU, but it can fit in a single node with multiple GPUs, you can use tensor parallelism. The tensor parallel size is the number of GPUs you want to use. For example, if you have 4 GPUs in a single node, you can set the tensor parallel size to 4. -- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. +- **Multi-Node Multi-GPU (tensor parallel plus pipeline parallel inference)**: If your model is too large to fit in a single node, you can use tensor parallel together with pipeline parallelism. The tensor parallel size is the number of GPUs you want to use in each node, and the pipeline parallel size is the number of nodes you want to use. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2. In short, you should increase the number of GPUs and the number of nodes until you have enough GPU memory to hold the model. The tensor parallel size should be the number of GPUs in each node, and the pipeline parallel size should be the number of nodes. @@ -77,7 +77,7 @@ Then you get a ray cluster of containers. Note that you need to keep the shells Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. -After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: +After that, on any node, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: ```console $ vllm serve /path/to/the/model/in/the/container \ @@ -85,7 +85,7 @@ $ --tensor-parallel-size 8 \ $ --pipeline-parallel-size 2 ``` -You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8GPUs per node), you can set the tensor parallel size to 16: +You can also use tensor parallel without pipeline parallel, just set the tensor parallel size to the number of GPUs in the cluster. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 16: ```console $ vllm serve /path/to/the/model/in/the/container \ diff --git a/docs/source/serving/runai_model_streamer.md b/docs/source/serving/runai_model_streamer.md index 1b5756a95075a..d4269050ff574 100644 --- a/docs/source/serving/runai_model_streamer.md +++ b/docs/source/serving/runai_model_streamer.md @@ -41,7 +41,7 @@ For reading from S3, it will be the number of client instances the host is openi $ vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}' ``` -You can controls the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. +You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size. You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit). ```console From 328841d00294fb8226f0368cc380350b3d671d77 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 29 Dec 2024 00:55:42 +0800 Subject: [PATCH 019/462] [bugfix] interleaving sliding window for cohere2 model (#11583) Signed-off-by: youkaichao --- docs/source/models/supported_models.md | 2 +- tests/models/test_initialization.py | 4 - vllm/config.py | 2 +- vllm/model_executor/models/commandr.py | 10 +- vllm/transformers_utils/config.py | 7 +- vllm/transformers_utils/configs/__init__.py | 2 + vllm/transformers_utils/configs/cohere2.py | 192 ++++++++++++++++++++ 7 files changed, 206 insertions(+), 13 deletions(-) create mode 100644 vllm/transformers_utils/configs/cohere2.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index f6e00fa71a310..e11befbb8dd30 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -112,7 +112,7 @@ See [this page](#generative-models) for more information on how to use generativ - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. - ✅︎ - ✅︎ - * - :code:`CohereForCausalLM`,:code:`Cohere2ForCausalLM` + * - :code:`CohereForCausalLM`, :code:`Cohere2ForCausalLM` - Command-R - :code:`CohereForAI/c4ai-command-r-v01`, :code:`CohereForAI/c4ai-command-r7b-12-2024`, etc. - ✅︎ diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py index a4eea7f035c91..3b728f2744fca 100644 --- a/tests/models/test_initialization.py +++ b/tests/models/test_initialization.py @@ -1,7 +1,6 @@ from unittest.mock import patch import pytest -import transformers from transformers import PretrainedConfig from vllm import LLM @@ -12,9 +11,6 @@ @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) def test_can_initialize(model_arch): model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) - if (model_arch == "Cohere2ForCausalLM" - and transformers.__version__ < "4.48.0"): - pytest.skip(reason="Model introduced in HF >= 4.48.0") if not model_info.is_available_online: pytest.skip("Model is not available online") diff --git a/vllm/config.py b/vllm/config.py index ac767bbe14be4..6ae1d4d944447 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -301,7 +301,7 @@ def __init__(self, sliding_window = getattr(self.hf_text_config, "sliding_window", None) has_interleaved_attention = (sliding_window is not None) and ( isinstance(sliding_window, list) or - (self.hf_text_config.model_type in ["gemma2"])) + (self.hf_text_config.model_type in ["gemma2", "cohere2"])) if (not self.disable_sliding_window and has_interleaved_attention): if envs.VLLM_ATTENTION_BACKEND == "XFORMERS": diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index c846e42f1b0c3..d22d1f3171463 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -172,16 +172,18 @@ def __init__( is_neox_style=False, ) - sliding_window = getattr(config, "sliding_window", None) - # Model v2 has sliding windows, v1 does not - self.v1 = sliding_window is None + # Model v2 has interleaved sliding windows, v1 does not + interleaved_sliding_window = getattr(config, + "interleaved_sliding_window", + None) + self.v1 = interleaved_sliding_window is None layer_idx = extract_layer_index(prefix) layer_has_sliding_window = ( getattr(config, "sliding_window_pattern", False) and (layer_idx + 1) % self.config.sliding_window_pattern != 0) - self.sliding_window = (sliding_window + self.sliding_window = (interleaved_sliding_window if layer_has_sliding_window else None) self.attn = Attention(self.num_heads, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 4529cf27ef565..58417980e7b47 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -22,9 +22,9 @@ from vllm.logger import init_logger # yapf conflicts with isort for this block # yapf: disable -from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, - EAGLEConfig, ExaoneConfig, - H2OVLChatConfig, +from vllm.transformers_utils.configs import (ChatGLMConfig, Cohere2Config, + DbrxConfig, EAGLEConfig, + ExaoneConfig, H2OVLChatConfig, InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, @@ -52,6 +52,7 @@ _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = { "chatglm": ChatGLMConfig, + "cohere2": Cohere2Config, "dbrx": DbrxConfig, "mpt": MPTConfig, "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index c24433cd436b4..a41a35c88b3a1 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -1,4 +1,5 @@ from vllm.transformers_utils.configs.chatglm import ChatGLMConfig +from vllm.transformers_utils.configs.cohere2 import Cohere2Config from vllm.transformers_utils.configs.dbrx import DbrxConfig from vllm.transformers_utils.configs.eagle import EAGLEConfig from vllm.transformers_utils.configs.exaone import ExaoneConfig @@ -22,6 +23,7 @@ __all__ = [ "ChatGLMConfig", + "Cohere2Config", "DbrxConfig", "MPTConfig", "RWConfig", diff --git a/vllm/transformers_utils/configs/cohere2.py b/vllm/transformers_utils/configs/cohere2.py new file mode 100644 index 0000000000000..1509330fc2179 --- /dev/null +++ b/vllm/transformers_utils/configs/cohere2.py @@ -0,0 +1,192 @@ +# ruff: noqa + +# Adapted from +# https://github.com/huggingface/transformers/blob/main/src/transformers/models/cohere2/configuration_cohere2.py +from transformers import PretrainedConfig +from transformers.modeling_rope_utils import rope_config_validation + + +class Cohere2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere + model according to the specified arguments, defining the model architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Instantiating a configuration + with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model. + + + Args: + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the Cohere model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CohereModel`] + hidden_size (`int`, *optional*, defaults to 8192): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 22528): + Dimension of the MLP representations. + logit_scale (`float`, *optional*, defaults to 0.0625): + The scaling factor for the output logits. + num_hidden_layers (`int`, *optional*, defaults to 40): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 64): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 8192): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 5): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 255001): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `True`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + sliding_window (`int`, *optional*, defaults to 4096): + Size of the sliding window attention context. + sliding_window_pattern (`int`, *optional*, defaults to 4): + Pattern for the sliding window attention. + cache_implementation (`str`, *optional*, defaults to `"hybrid"`): the cache type to be used with `generate`. + + ```python + >>> from transformers import Cohere2Model, Cohere2Config + + >>> # Initializing a Cohere Nextmodel configuration + >>> configuration = Cohere2Config() + + >>> # Initializing a model from the Cohere2 configuration + >>> model = Cohere2Model(configuration) # doctest: +SKIP + + >>> # Accessing the model configuration + >>> configuration = model.config # doctest: +SKIP + ``` + """ + + model_type = "cohere2" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=256000, + hidden_size=8192, + intermediate_size=22528, + logit_scale=0.0625, + num_hidden_layers=40, + num_attention_heads=64, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=8192, + initializer_range=0.02, + layer_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=5, + eos_token_id=255001, + tie_word_embeddings=True, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + sliding_window=4096, + sliding_window_pattern=4, + cache_implementation="hybrid", + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.logit_scale = logit_scale + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.sliding_window = sliding_window + self.sliding_window_pattern = sliding_window_pattern + # Need to specify head_dim in the config so it can be used in the attention forward functions + self.head_dim = hidden_size // num_attention_heads + self.cache_implementation = cache_implementation + + # Validate the correctness of rotary position embeddings parameters + rope_config_validation(self) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + +__all__ = ["Cohere2Config"] From 4fb8e329fd6f51d576bcf4b7e8907e0d83c4b5cf Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sat, 28 Dec 2024 15:51:57 -0500 Subject: [PATCH 020/462] [V1] [5/N] API Server: unify `Detokenizer` and `EngineCore` input (#11545) Signed-off-by: rshaw@neuralmagic.com --- tests/v1/engine/test_detokenizer.py | 57 ++++++++++++++++++----------- vllm/v1/engine/__init__.py | 16 +------- vllm/v1/engine/async_llm.py | 14 ++++--- vllm/v1/engine/detokenizer.py | 21 ++++++----- vllm/v1/engine/llm_engine.py | 12 +++--- vllm/v1/engine/processor.py | 23 ++---------- 6 files changed, 66 insertions(+), 77 deletions(-) diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py index 07f343666cb5e..aeae697ca32b0 100644 --- a/tests/v1/engine/test_detokenizer.py +++ b/tests/v1/engine/test_detokenizer.py @@ -3,9 +3,9 @@ import pytest from transformers import AutoTokenizer -from vllm.sampling_params import RequestOutputKind -from vllm.v1.engine import EngineCoreOutput -from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest +from vllm.sampling_params import RequestOutputKind, SamplingParams +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest +from vllm.v1.engine.detokenizer import Detokenizer TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) @@ -71,16 +71,22 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind): # Make N requests. requests = [ - DetokenizerRequest( - request_id=f"request-{idx}", - prompt=prompt, - prompt_token_ids=prompt_tokens, - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=request_output_kind, - stop=[], - include_stop_str_in_output=False, - ) for idx, ( + EngineCoreRequest(request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False)) + for idx, ( prompt, prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] @@ -133,18 +139,25 @@ def test_stop_string(include_stop_str_in_output: bool): # Make N requests. requests = [ - DetokenizerRequest( + EngineCoreRequest( request_id=f"request-{idx}", prompt=prompt, prompt_token_ids=prompt_tokens, - skip_special_tokens=False, - spaces_between_special_tokens=False, - output_kind=RequestOutputKind.DELTA, - stop=STOP_STRINGS, - include_stop_str_in_output=include_stop_str_in_output, - ) for idx, ( - prompt, - prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + arrival_time=0, + mm_inputs=None, + mm_hashes=None, + mm_placeholders=None, + eos_token_id=None, + lora_request=None, + sampling_params=SamplingParams( + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + )) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) ] # Add requests to the detokenizer. diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index cc0c7ea23469a..f70464fc88298 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -6,21 +6,7 @@ from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderDict -from vllm.sampling_params import RequestOutputKind, SamplingParams - - -@dataclass -class DetokenizerRequest: - - request_id: str - prompt: Optional[str] - prompt_token_ids: List[int] - skip_special_tokens: bool - spaces_between_special_tokens: bool - output_kind: RequestOutputKind - - stop: List[str] - include_stop_str_in_output: bool +from vllm.sampling_params import SamplingParams @dataclass diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index da3da6dad6436..213ddaa023dbc 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -158,16 +158,18 @@ async def add_request( raise ValueError(f"Request id {request_id} already running.") self.rid_to_queue[request_id] = asyncio.Queue() - # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. - detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, - trace_headers, prompt_adapter_request, priority) + # 2) Convert Input --> Request. + request = self.processor.process_inputs(request_id, prompt, params, + arrival_time, lora_request, + trace_headers, + prompt_adapter_request, + priority) # 3) Add the request to Detokenizer (this process). - self.detokenizer.add_request(detokenizer_req) + self.detokenizer.add_request(request) # 4) Add the EngineCoreRequest to EngineCore (separate process). - await self.engine_core.add_request_async(engine_core_req) + await self.engine_core.add_request_async(request) if self.log_requests: logger.info("Added request %s.", request_id) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 02f34e2b54dd5..65be9e58e03c8 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -8,7 +8,7 @@ from vllm.transformers_utils.detokenizer_utils import ( AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput +from vllm.v1.engine import EngineCoreOutput, EngineCoreRequest logger = init_logger(__name__) @@ -55,19 +55,19 @@ def output_token_ids(self) -> List[int]: def from_new_request( cls, tokenizer: AnyTokenizer, - request: DetokenizerRequest, + request: EngineCoreRequest, ) -> "IncrementalDetokenizer": tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( tokenizer=tokenizer, prompt_ids=request.prompt_token_ids, - skip_special_tokens=request.skip_special_tokens, + skip_special_tokens=request.sampling_params.skip_special_tokens, ) - stops = request.stop + stops = request.sampling_params.stop # Number of chars to hold back when stop strings are to be excluded # from streamed output. - if stops and not request.include_stop_str_in_output: + if stops and not request.sampling_params.include_stop_str_in_output: stop_buffer_length = max(len(s) for s in stops) - 1 else: stop_buffer_length = 0 @@ -79,13 +79,14 @@ def from_new_request( # NOTE(Nick): could we take ownership of it though? token_ids=request.prompt_token_ids.copy(), stop=stops, - include_stop_str_in_output=request.include_stop_str_in_output, + include_stop_str_in_output=request.sampling_params. + include_stop_str_in_output, prefix_offset=prefix_offset, read_offset=read_offset, - skip_special_tokens=request.skip_special_tokens, - spaces_between_special_tokens=request. + skip_special_tokens=request.sampling_params.skip_special_tokens, + spaces_between_special_tokens=request.sampling_params. spaces_between_special_tokens, - output_kind=request.output_kind, + output_kind=request.sampling_params.output_kind, request_id=request.request_id, prompt=request.prompt, prompt_token_ids=request.prompt_token_ids, @@ -227,7 +228,7 @@ def abort_requests( def add_request( self, - request: DetokenizerRequest, + request: EngineCoreRequest, ): """Add new request to the Detokenizer.""" diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index fc323184abc8f..a19109559eabf 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -152,15 +152,17 @@ def add_request( ) -> None: # 1) Process raw inputs into the request. - detokenizer_req, engine_core_req = self.processor.process_inputs( - request_id, prompt, params, arrival_time, lora_request, - trace_headers, prompt_adapter_request, priority) + request = self.processor.process_inputs(request_id, prompt, params, + arrival_time, lora_request, + trace_headers, + prompt_adapter_request, + priority) # 2) Add the request to Detokenizer. - self.detokenizer.add_request(detokenizer_req) + self.detokenizer.add_request(request) # 3) Add the request to EngineCore. - self.engine_core.add_request(engine_core_req) + self.engine_core.add_request(request) def step(self) -> List[RequestOutput]: diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 6ee8732bc902c..5b5a5a61cea7d 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -1,5 +1,5 @@ import time -from typing import Mapping, Optional, Tuple, Union +from typing import Mapping, Optional, Union from vllm.config import CacheConfig, LoRAConfig, ModelConfig from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, @@ -13,7 +13,7 @@ from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup -from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest +from vllm.v1.engine import EngineCoreRequest from vllm.v1.engine.mm_input_mapper import MMHasher, MMInputMapperClient @@ -62,7 +62,7 @@ def process_inputs( trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, - ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: + ) -> EngineCoreRequest: # TODO(woosuk): Support pooling models. # TODO(woosuk): Check max_logprobs @@ -123,20 +123,7 @@ def process_inputs( decoder_inputs.multi_modal_data, mm_hashes, decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs) - # Make Request for Detokenizer. - detokenizer_request = DetokenizerRequest( - request_id, - decoder_inputs.prompt, - decoder_inputs.prompt_token_ids, - sampling_params.skip_special_tokens, - sampling_params.spaces_between_special_tokens, - sampling_params.output_kind, - sampling_params.stop, - sampling_params.include_stop_str_in_output, - ) - - # Make Request for EngineCore. - engine_core_request = EngineCoreRequest( + return EngineCoreRequest( request_id, decoder_inputs.prompt, decoder_inputs.prompt_token_ids, @@ -149,8 +136,6 @@ def process_inputs( lora_request, ) - return detokenizer_request, engine_core_request - def _validate_model_inputs(self, inputs: ProcessorInputs): if is_encoder_decoder_inputs(inputs): # For encoder-decoder multimodal models, the max_prompt_len From 32b4c63f02b2ab28a49a040b1d170a903a5cd9dc Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 29 Dec 2024 15:56:22 +0800 Subject: [PATCH 021/462] [Doc] Convert list tables to MyST (#11594) Signed-off-by: DarkLight1337 --- docs/source/getting_started/debugging.md | 2 +- .../getting_started/gaudi-installation.md | 39 +- .../getting_started/tpu-installation.md | 53 +- docs/source/models/supported_models.md | 1206 ++++++++--------- .../source/quantization/supported_hardware.md | 227 ++-- docs/source/serving/deploying_with_helm.md | 409 +++--- 6 files changed, 961 insertions(+), 975 deletions(-) diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/debugging.md index 3b0029f2e88ce..19eb699572a08 100644 --- a/docs/source/getting_started/debugging.md +++ b/docs/source/getting_started/debugging.md @@ -197,4 +197,4 @@ if __name__ == '__main__': ## Known Issues - In `v0.5.2`, `v0.5.3`, and `v0.5.3.post1`, there is a bug caused by [zmq](https://github.com/zeromq/pyzmq/issues/2000) , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of `vllm` to include the [fix](gh-pr:6759). -- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable ``NCCL_CUMEM_ENABLE=0`` to disable NCCL's ``cuMem`` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) . +- To circumvent a NCCL [bug](https://github.com/NVIDIA/nccl/issues/1234) , all vLLM processes will set an environment variable `NCCL_CUMEM_ENABLE=0` to disable NCCL's `cuMem` allocator. It does not affect performance but only gives memory benefits. When external processes want to set up a NCCL connection with vLLM's processes, they should also set this environment variable, otherwise, inconsistent environment setup will cause NCCL to hang or crash, as observed in the [RLHF integration](https://github.com/OpenRLHF/OpenRLHF/pull/604) and the [discussion](gh-issue:5723#issuecomment-2554389656) . diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/gaudi-installation.md index 447bf98084a5d..1f2ee62860dec 100644 --- a/docs/source/getting_started/gaudi-installation.md +++ b/docs/source/getting_started/gaudi-installation.md @@ -141,26 +141,25 @@ Gaudi2 devices. Configurations that are not listed may or may not work. Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag. -```{eval-rst} -.. list-table:: vLLM execution modes - :widths: 25 25 50 - :header-rows: 1 - - * - ``PT_HPU_LAZY_MODE`` - - ``enforce_eager`` - - execution mode - * - 0 - - 0 - - torch.compile - * - 0 - - 1 - - PyTorch eager mode - * - 1 - - 0 - - HPU Graphs - * - 1 - - 1 - - PyTorch lazy mode +```{list-table} vLLM execution modes +:widths: 25 25 50 +:header-rows: 1 + +* - `PT_HPU_LAZY_MODE` + - `enforce_eager` + - execution mode +* - 0 + - 0 + - torch.compile +* - 0 + - 1 + - PyTorch eager mode +* - 1 + - 0 + - HPU Graphs +* - 1 + - 1 + - PyTorch lazy mode ``` ```{warning} diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/tpu-installation.md index 17eded4a51fec..4d3ac541c90ce 100644 --- a/docs/source/getting_started/tpu-installation.md +++ b/docs/source/getting_started/tpu-installation.md @@ -68,33 +68,32 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \ --service-account SERVICE_ACCOUNT ``` -```{eval-rst} -.. list-table:: Parameter descriptions - :header-rows: 1 - - * - Parameter name - - Description - * - QUEUED_RESOURCE_ID - - The user-assigned ID of the queued resource request. - * - TPU_NAME - - The user-assigned name of the TPU which is created when the queued - resource request is allocated. - * - PROJECT_ID - - Your Google Cloud project - * - ZONE - - The GCP zone where you want to create your Cloud TPU. The value you use - depends on the version of TPUs you are using. For more information, see - `TPU regions and zones `_ - * - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, for example - `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, - see `TPU versions `_. - * - RUNTIME_VERSION - - The TPU VM runtime version to use. For more information see `TPU VM images `_. - * - SERVICE_ACCOUNT - - The email address for your service account. You can find it in the IAM - Cloud Console under *Service Accounts*. For example: - `tpu-service-account@.iam.gserviceaccount.com` +```{list-table} Parameter descriptions +:header-rows: 1 + +* - Parameter name + - Description +* - QUEUED_RESOURCE_ID + - The user-assigned ID of the queued resource request. +* - TPU_NAME + - The user-assigned name of the TPU which is created when the queued + resource request is allocated. +* - PROJECT_ID + - Your Google Cloud project +* - ZONE + - The GCP zone where you want to create your Cloud TPU. The value you use + depends on the version of TPUs you are using. For more information, see + `TPU regions and zones `_ +* - ACCELERATOR_TYPE + - The TPU version you want to use. Specify the TPU version, for example + `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, + see `TPU versions `_. +* - RUNTIME_VERSION + - The TPU VM runtime version to use. For more information see `TPU VM images `_. +* - SERVICE_ACCOUNT + - The email address for your service account. You can find it in the IAM + Cloud Console under *Service Accounts*. For example: + `tpu-service-account@.iam.gserviceaccount.com` ``` Connect to your TPU using SSH: diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index e11befbb8dd30..518505abeb2a9 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -72,291 +72,290 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`AquilaForCausalLM` - - Aquila, Aquila2 - - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. - - ✅︎ - - ✅︎ - * - :code:`ArcticForCausalLM` - - Arctic - - :code:`Snowflake/snowflake-arctic-base`, :code:`Snowflake/snowflake-arctic-instruct`, etc. - - - - ✅︎ - * - :code:`BaiChuanForCausalLM` - - Baichuan2, Baichuan - - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc. - - ✅︎ - - ✅︎ - * - :code:`BloomForCausalLM` - - BLOOM, BLOOMZ, BLOOMChat - - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. - - - - ✅︎ - * - :code:`BartForConditionalGeneration` - - BART - - :code:`facebook/bart-base`, :code:`facebook/bart-large-cnn`, etc. - - - - - * - :code:`ChatGLMModel` - - ChatGLM - - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. - - ✅︎ - - ✅︎ - * - :code:`CohereForCausalLM`, :code:`Cohere2ForCausalLM` - - Command-R - - :code:`CohereForAI/c4ai-command-r-v01`, :code:`CohereForAI/c4ai-command-r7b-12-2024`, etc. - - ✅︎ - - ✅︎ - * - :code:`DbrxForCausalLM` - - DBRX - - :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc. - - - - ✅︎ - * - :code:`DeciLMForCausalLM` - - DeciLM - - :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc. - - - - ✅︎ - * - :code:`DeepseekForCausalLM` - - DeepSeek - - :code:`deepseek-ai/deepseek-llm-67b-base`, :code:`deepseek-ai/deepseek-llm-7b-chat` etc. - - - - ✅︎ - * - :code:`DeepseekV2ForCausalLM` - - DeepSeek-V2 - - :code:`deepseek-ai/DeepSeek-V2`, :code:`deepseek-ai/DeepSeek-V2-Chat` etc. - - - - ✅︎ - * - :code:`DeepseekV3ForCausalLM` - - DeepSeek-V3 - - :code:`deepseek-ai/DeepSeek-V3-Base`, :code:`deepseek-ai/DeepSeek-V3` etc. - - - - ✅︎ - * - :code:`ExaoneForCausalLM` - - EXAONE-3 - - :code:`LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`FalconForCausalLM` - - Falcon - - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. - - - - ✅︎ - * - :code:`FalconMambaForCausalLM` - - FalconMamba - - :code:`tiiuae/falcon-mamba-7b`, :code:`tiiuae/falcon-mamba-7b-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`GemmaForCausalLM` - - Gemma - - :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc. - - ✅︎ - - ✅︎ - * - :code:`Gemma2ForCausalLM` - - Gemma2 - - :code:`google/gemma-2-9b`, :code:`google/gemma-2-27b`, etc. - - ✅︎ - - ✅︎ - * - :code:`GlmForCausalLM` - - GLM-4 - - :code:`THUDM/glm-4-9b-chat-hf`, etc. - - ✅︎ - - ✅︎ - * - :code:`GPT2LMHeadModel` - - GPT-2 - - :code:`gpt2`, :code:`gpt2-xl`, etc. - - - - ✅︎ - * - :code:`GPTBigCodeForCausalLM` - - StarCoder, SantaCoder, WizardCoder - - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. - - ✅︎ - - ✅︎ - * - :code:`GPTJForCausalLM` - - GPT-J - - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. - - - - ✅︎ - * - :code:`GPTNeoXForCausalLM` - - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM - - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. - - - - ✅︎ - * - :code:`GraniteForCausalLM` - - Granite 3.0, Granite 3.1, PowerLM - - :code:`ibm-granite/granite-3.0-2b-base`, :code:`ibm-granite/granite-3.1-8b-instruct`, :code:`ibm/PowerLM-3b`, etc. - - ✅︎ - - ✅︎ - * - :code:`GraniteMoeForCausalLM` - - Granite 3.0 MoE, PowerMoE - - :code:`ibm-granite/granite-3.0-1b-a400m-base`, :code:`ibm-granite/granite-3.0-3b-a800m-instruct`, :code:`ibm/PowerMoE-3b`, etc. - - ✅︎ - - ✅︎ - * - :code:`GritLM` - - GritLM - - :code:`parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ - * - :code:`InternLMForCausalLM` - - InternLM - - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. - - ✅︎ - - ✅︎ - * - :code:`InternLM2ForCausalLM` - - InternLM2 - - :code:`internlm/internlm2-7b`, :code:`internlm/internlm2-chat-7b`, etc. - - ✅︎ - - ✅︎ - * - :code:`JAISLMHeadModel` - - Jais - - :code:`inceptionai/jais-13b`, :code:`inceptionai/jais-13b-chat`, :code:`inceptionai/jais-30b-v3`, :code:`inceptionai/jais-30b-chat-v3`, etc. - - - - ✅︎ - * - :code:`JambaForCausalLM` - - Jamba - - :code:`ai21labs/AI21-Jamba-1.5-Large`, :code:`ai21labs/AI21-Jamba-1.5-Mini`, :code:`ai21labs/Jamba-v0.1`, etc. - - ✅︎ - - ✅︎ - * - :code:`LlamaForCausalLM` - - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi - - :code:`meta-llama/Meta-Llama-3.1-405B-Instruct`, :code:`meta-llama/Meta-Llama-3.1-70B`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-70b-hf`, :code:`01-ai/Yi-34B`, etc. - - ✅︎ - - ✅︎ - * - :code:`MambaForCausalLM` - - Mamba - - :code:`state-spaces/mamba-130m-hf`, :code:`state-spaces/mamba-790m-hf`, :code:`state-spaces/mamba-2.8b-hf`, etc. - - - - ✅︎ - * - :code:`MiniCPMForCausalLM` - - MiniCPM - - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, :code:`openbmb/MiniCPM-S-1B-sft`, etc. - - ✅︎ - - ✅︎ - * - :code:`MiniCPM3ForCausalLM` - - MiniCPM3 - - :code:`openbmb/MiniCPM3-4B`, etc. - - ✅︎ - - ✅︎ - * - :code:`MistralForCausalLM` - - Mistral, Mistral-Instruct - - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. - - ✅︎ - - ✅︎ - * - :code:`MixtralForCausalLM` - - Mixtral-8x7B, Mixtral-8x7B-Instruct - - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc. - - ✅︎ - - ✅︎ - * - :code:`MPTForCausalLM` - - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter - - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. - - - - ✅︎ - * - :code:`NemotronForCausalLM` - - Nemotron-3, Nemotron-4, Minitron - - :code:`nvidia/Minitron-8B-Base`, :code:`mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. - - ✅︎ - - ✅︎ - * - :code:`OLMoForCausalLM` - - OLMo - - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc. - - - - ✅︎ - * - :code:`OLMo2ForCausalLM` - - OLMo2 - - :code:`allenai/OLMo2-7B-1124`, etc. - - - - ✅︎ - * - :code:`OLMoEForCausalLM` - - OLMoE - - :code:`allenai/OLMoE-1B-7B-0924`, :code:`allenai/OLMoE-1B-7B-0924-Instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`OPTForCausalLM` - - OPT, OPT-IML - - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. - - - - ✅︎ - * - :code:`OrionForCausalLM` - - Orion - - :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc. - - - - ✅︎ - * - :code:`PhiForCausalLM` - - Phi - - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. - - ✅︎ - - ✅︎ - * - :code:`Phi3ForCausalLM` - - Phi-3 - - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, :code:`microsoft/Phi-3-medium-128k-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`Phi3SmallForCausalLM` - - Phi-3-Small - - :code:`microsoft/Phi-3-small-8k-instruct`, :code:`microsoft/Phi-3-small-128k-instruct`, etc. - - - - ✅︎ - * - :code:`PhiMoEForCausalLM` - - Phi-3.5-MoE - - :code:`microsoft/Phi-3.5-MoE-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`PersimmonForCausalLM` - - Persimmon - - :code:`adept/persimmon-8b-base`, :code:`adept/persimmon-8b-chat`, etc. - - - - ✅︎ - * - :code:`QWenLMHeadModel` - - Qwen - - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2ForCausalLM` - - Qwen2 - - :code:`Qwen/QwQ-32B-Preview`, :code:`Qwen/Qwen2-7B-Instruct`, :code:`Qwen/Qwen2-7B`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2MoeForCausalLM` - - Qwen2MoE - - :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. - - - - ✅︎ - * - :code:`StableLmForCausalLM` - - StableLM - - :code:`stabilityai/stablelm-3b-4e1t`, :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc. - - - - ✅︎ - * - :code:`Starcoder2ForCausalLM` - - Starcoder2 - - :code:`bigcode/starcoder2-3b`, :code:`bigcode/starcoder2-7b`, :code:`bigcode/starcoder2-15b`, etc. - - - - ✅︎ - * - :code:`SolarForCausalLM` - - Solar Pro - - :code:`upstage/solar-pro-preview-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`TeleChat2ForCausalLM` - - TeleChat2 - - :code:`TeleAI/TeleChat2-3B`, :code:`TeleAI/TeleChat2-7B`, :code:`TeleAI/TeleChat2-35B`, etc. - - ✅︎ - - ✅︎ - * - :code:`XverseForCausalLM` - - XVERSE - - :code:`xverse/XVERSE-7B-Chat`, :code:`xverse/XVERSE-13B-Chat`, :code:`xverse/XVERSE-65B-Chat`, etc. - - ✅︎ - - ✅︎ +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 + +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `AquilaForCausalLM` + - Aquila, Aquila2 + - `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc. + - ✅︎ + - ✅︎ +* - `ArcticForCausalLM` + - Arctic + - `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc. + - + - ✅︎ +* - `BaiChuanForCausalLM` + - Baichuan2, Baichuan + - `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc. + - ✅︎ + - ✅︎ +* - `BloomForCausalLM` + - BLOOM, BLOOMZ, BLOOMChat + - `bigscience/bloom`, `bigscience/bloomz`, etc. + - + - ✅︎ +* - `BartForConditionalGeneration` + - BART + - `facebook/bart-base`, `facebook/bart-large-cnn`, etc. + - + - +* - `ChatGLMModel` + - ChatGLM + - `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc. + - ✅︎ + - ✅︎ +* - `CohereForCausalLM`, `Cohere2ForCausalLM` + - Command-R + - `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc. + - ✅︎ + - ✅︎ +* - `DbrxForCausalLM` + - DBRX + - `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc. + - + - ✅︎ +* - `DeciLMForCausalLM` + - DeciLM + - `Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc. + - + - ✅︎ +* - `DeepseekForCausalLM` + - DeepSeek + - `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc. + - + - ✅︎ +* - `DeepseekV2ForCausalLM` + - DeepSeek-V2 + - `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc. + - + - ✅︎ +* - `DeepseekV3ForCausalLM` + - DeepSeek-V3 + - `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc. + - + - ✅︎ +* - `ExaoneForCausalLM` + - EXAONE-3 + - `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc. + - ✅︎ + - ✅︎ +* - `FalconForCausalLM` + - Falcon + - `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc. + - + - ✅︎ +* - `FalconMambaForCausalLM` + - FalconMamba + - `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc. + - ✅︎ + - ✅︎ +* - `GemmaForCausalLM` + - Gemma + - `google/gemma-2b`, `google/gemma-7b`, etc. + - ✅︎ + - ✅︎ +* - `Gemma2ForCausalLM` + - Gemma2 + - `google/gemma-2-9b`, `google/gemma-2-27b`, etc. + - ✅︎ + - ✅︎ +* - `GlmForCausalLM` + - GLM-4 + - `THUDM/glm-4-9b-chat-hf`, etc. + - ✅︎ + - ✅︎ +* - `GPT2LMHeadModel` + - GPT-2 + - `gpt2`, `gpt2-xl`, etc. + - + - ✅︎ +* - `GPTBigCodeForCausalLM` + - StarCoder, SantaCoder, WizardCoder + - `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc. + - ✅︎ + - ✅︎ +* - `GPTJForCausalLM` + - GPT-J + - `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc. + - + - ✅︎ +* - `GPTNeoXForCausalLM` + - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM + - `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. + - + - ✅︎ +* - `GraniteForCausalLM` + - Granite 3.0, Granite 3.1, PowerLM + - `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc. + - ✅︎ + - ✅︎ +* - `GraniteMoeForCausalLM` + - Granite 3.0 MoE, PowerMoE + - `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc. + - ✅︎ + - ✅︎ +* - `GritLM` + - GritLM + - `parasail-ai/GritLM-7B-vllm`. + - ✅︎ + - ✅︎ +* - `InternLMForCausalLM` + - InternLM + - `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc. + - ✅︎ + - ✅︎ +* - `InternLM2ForCausalLM` + - InternLM2 + - `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc. + - ✅︎ + - ✅︎ +* - `JAISLMHeadModel` + - Jais + - `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc. + - + - ✅︎ +* - `JambaForCausalLM` + - Jamba + - `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc. + - ✅︎ + - ✅︎ +* - `LlamaForCausalLM` + - Llama 3.1, Llama 3, Llama 2, LLaMA, Yi + - `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc. + - ✅︎ + - ✅︎ +* - `MambaForCausalLM` + - Mamba + - `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc. + - + - ✅︎ +* - `MiniCPMForCausalLM` + - MiniCPM + - `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc. + - ✅︎ + - ✅︎ +* - `MiniCPM3ForCausalLM` + - MiniCPM3 + - `openbmb/MiniCPM3-4B`, etc. + - ✅︎ + - ✅︎ +* - `MistralForCausalLM` + - Mistral, Mistral-Instruct + - `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc. + - ✅︎ + - ✅︎ +* - `MixtralForCausalLM` + - Mixtral-8x7B, Mixtral-8x7B-Instruct + - `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc. + - ✅︎ + - ✅︎ +* - `MPTForCausalLM` + - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter + - `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc. + - + - ✅︎ +* - `NemotronForCausalLM` + - Nemotron-3, Nemotron-4, Minitron + - `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc. + - ✅︎ + - ✅︎ +* - `OLMoForCausalLM` + - OLMo + - `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc. + - + - ✅︎ +* - `OLMo2ForCausalLM` + - OLMo2 + - `allenai/OLMo2-7B-1124`, etc. + - + - ✅︎ +* - `OLMoEForCausalLM` + - OLMoE + - `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc. + - ✅︎ + - ✅︎ +* - `OPTForCausalLM` + - OPT, OPT-IML + - `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc. + - + - ✅︎ +* - `OrionForCausalLM` + - Orion + - `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc. + - + - ✅︎ +* - `PhiForCausalLM` + - Phi + - `microsoft/phi-1_5`, `microsoft/phi-2`, etc. + - ✅︎ + - ✅︎ +* - `Phi3ForCausalLM` + - Phi-3 + - `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc. + - ✅︎ + - ✅︎ +* - `Phi3SmallForCausalLM` + - Phi-3-Small + - `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc. + - + - ✅︎ +* - `PhiMoEForCausalLM` + - Phi-3.5-MoE + - `microsoft/Phi-3.5-MoE-instruct`, etc. + - ✅︎ + - ✅︎ +* - `PersimmonForCausalLM` + - Persimmon + - `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc. + - + - ✅︎ +* - `QWenLMHeadModel` + - Qwen + - `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc. + - ✅︎ + - ✅︎ +* - `Qwen2ForCausalLM` + - Qwen2 + - `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc. + - ✅︎ + - ✅︎ +* - `Qwen2MoeForCausalLM` + - Qwen2MoE + - `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc. + - + - ✅︎ +* - `StableLmForCausalLM` + - StableLM + - `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc. + - + - ✅︎ +* - `Starcoder2ForCausalLM` + - Starcoder2 + - `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc. + - + - ✅︎ +* - `SolarForCausalLM` + - Solar Pro + - `upstage/solar-pro-preview-instruct`, etc. + - ✅︎ + - ✅︎ +* - `TeleChat2ForCausalLM` + - TeleChat2 + - `TeleAI/TeleChat2-3B`, `TeleAI/TeleChat2-7B`, `TeleAI/TeleChat2-35B`, etc. + - ✅︎ + - ✅︎ +* - `XverseForCausalLM` + - XVERSE + - `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. + - ✅︎ + - ✅︎ ``` ```{note} @@ -374,51 +373,50 @@ you should explicitly specify the task type to ensure that the model is used in #### Text Embedding (`--task embed`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`BertModel` - - BERT-based - - :code:`BAAI/bge-base-en-v1.5`, etc. - - - - - * - :code:`Gemma2Model` - - Gemma2-based - - :code:`BAAI/bge-multilingual-gemma2`, etc. - - - - ✅︎ - * - :code:`GritLM` - - GritLM - - :code:`parasail-ai/GritLM-7B-vllm`. - - ✅︎ - - ✅︎ - * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. - - Llama-based - - :code:`intfloat/e5-mistral-7b-instruct`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` - - Qwen2-based - - :code:`ssmits/Qwen2-7B-Instruct-embed-base` (see note), :code:`Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. - - ✅︎ - - ✅︎ - * - :code:`RobertaModel`, :code:`RobertaForMaskedLM` - - RoBERTa-based - - :code:`sentence-transformers/all-roberta-large-v1`, :code:`sentence-transformers/all-roberta-large-v1`, etc. - - - - - * - :code:`XLMRobertaModel` - - XLM-RoBERTa-based - - :code:`intfloat/multilingual-e5-large`, etc. - - - - +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 + +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `BertModel` + - BERT-based + - `BAAI/bge-base-en-v1.5`, etc. + - + - +* - `Gemma2Model` + - Gemma2-based + - `BAAI/bge-multilingual-gemma2`, etc. + - + - ✅︎ +* - `GritLM` + - GritLM + - `parasail-ai/GritLM-7B-vllm`. + - ✅︎ + - ✅︎ +* - `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. + - Llama-based + - `intfloat/e5-mistral-7b-instruct`, etc. + - ✅︎ + - ✅︎ +* - `Qwen2Model`, `Qwen2ForCausalLM` + - Qwen2-based + - `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc. + - ✅︎ + - ✅︎ +* - `RobertaModel`, `RobertaForMaskedLM` + - RoBERTa-based + - `sentence-transformers/all-roberta-large-v1`, `sentence-transformers/all-roberta-large-v1`, etc. + - + - +* - `XLMRobertaModel` + - XLM-RoBERTa-based + - `intfloat/multilingual-e5-large`, etc. + - + - ``` ```{note} @@ -440,31 +438,30 @@ of the whole prompt are extracted from the normalized hidden state corresponding #### Reward Modeling (`--task reward`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`InternLM2ForRewardModel` - - InternLM2-based - - :code:`internlm/internlm2-1_8b-reward`, :code:`internlm/internlm2-7b-reward`, etc. - - ✅︎ - - ✅︎ - * - :code:`LlamaForCausalLM` - - Llama-based - - :code:`peiyi9979/math-shepherd-mistral-7b-prm`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2ForRewardModel` - - Qwen2-based - - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - - ✅︎ - - ✅︎ +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 + +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `InternLM2ForRewardModel` + - InternLM2-based + - `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. + - ✅︎ + - ✅︎ +* - `LlamaForCausalLM` + - Llama-based + - `peiyi9979/math-shepherd-mistral-7b-prm`, etc. + - ✅︎ + - ✅︎ +* - `Qwen2ForRewardModel` + - Qwen2-based + - `Qwen/Qwen2.5-Math-RM-72B`, etc. + - ✅︎ + - ✅︎ ``` If your model is not in the above list, we will try to automatically convert the model using @@ -477,26 +474,25 @@ e.g.: {code}`--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 1 #### Classification (`--task classify`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`JambaForSequenceClassification` - - Jamba - - :code:`ai21labs/Jamba-tiny-reward-dev`, etc. - - ✅︎ - - ✅︎ - * - :code:`Qwen2ForSequenceClassification` - - Qwen2-based - - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. - - ✅︎ - - ✅︎ +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 + +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `JambaForSequenceClassification` + - Jamba + - `ai21labs/Jamba-tiny-reward-dev`, etc. + - ✅︎ + - ✅︎ +* - `Qwen2ForSequenceClassification` + - Qwen2-based + - `jason9693/Qwen2.5-1.5B-apeach`, etc. + - ✅︎ + - ✅︎ ``` If your model is not in the above list, we will try to automatically convert the model using @@ -504,31 +500,30 @@ If your model is not in the above list, we will try to automatically convert the #### Sentence Pair Scoring (`--task score`) -```{eval-rst} -.. list-table:: - :widths: 25 25 50 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`BertForSequenceClassification` - - BERT-based - - :code:`cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. - - - - - * - :code:`RobertaForSequenceClassification` - - RoBERTa-based - - :code:`cross-encoder/quora-roberta-base`, etc. - - - - - * - :code:`XLMRobertaForSequenceClassification` - - XLM-RoBERTa-based - - :code:`BAAI/bge-reranker-v2-m3`, etc. - - - - +```{list-table} +:widths: 25 25 50 5 5 +:header-rows: 1 + +* - Architecture + - Models + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `BertForSequenceClassification` + - BERT-based + - `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. + - + - +* - `RobertaForSequenceClassification` + - RoBERTa-based + - `cross-encoder/quora-roberta-base`, etc. + - + - +* - `XLMRobertaForSequenceClassification` + - XLM-RoBERTa-based + - `BAAI/bge-reranker-v2-m3`, etc. + - + - ``` (supported-mm-models)= @@ -558,186 +553,182 @@ See [this page](#generative-models) for more information on how to use generativ #### Text Generation (`--task generate`) -```{eval-rst} -.. list-table:: - :widths: 25 25 15 20 5 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Inputs - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - - V1 - * - :code:`AriaForConditionalGeneration` - - Aria - - T + I - - :code:`rhymes-ai/Aria` - - - - ✅︎ - - - * - :code:`Blip2ForConditionalGeneration` - - BLIP-2 - - T + I\ :sup:`E` - - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. - - - - ✅︎ - - - * - :code:`ChameleonForConditionalGeneration` - - Chameleon - - T + I - - :code:`facebook/chameleon-7b` etc. - - - - ✅︎ - - - * - :code:`FuyuForCausalLM` - - Fuyu - - T + I - - :code:`adept/fuyu-8b` etc. - - - - ✅︎ - - - * - :code:`ChatGLMModel` - - GLM-4V - - T + I - - :code:`THUDM/glm-4v-9b` etc. - - ✅︎ - - ✅︎ - - - * - :code:`H2OVLChatModel` - - H2OVL - - T + I\ :sup:`E+` - - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - - - ✅︎ - - - * - :code:`Idefics3ForConditionalGeneration` - - Idefics3 - - T + I - - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. - - ✅︎ - - - - - * - :code:`InternVLChatModel` - - InternVL 2.5, Mono-InternVL, InternVL 2.0 - - T + I\ :sup:`E+` - - :code:`OpenGVLab/InternVL2_5-4B`, :code:`OpenGVLab/Mono-InternVL-2B`, :code:`OpenGVLab/InternVL2-4B`, etc. - - - - ✅︎ - - ✅︎ - * - :code:`LlavaForConditionalGeneration` - - LLaVA-1.5 - - T + I\ :sup:`E+` - - :code:`llava-hf/llava-1.5-7b-hf`, :code:`TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. - - - - ✅︎ - - ✅︎ - * - :code:`LlavaNextForConditionalGeneration` - - LLaVA-NeXT - - T + I\ :sup:`E+` - - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - - - ✅︎ - - - * - :code:`LlavaNextVideoForConditionalGeneration` - - LLaVA-NeXT-Video - - T + V - - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. - - - - ✅︎ - - - * - :code:`LlavaOnevisionForConditionalGeneration` - - LLaVA-Onevision - - T + I\ :sup:`+` + V\ :sup:`+` - - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - - - ✅︎ - - - * - :code:`MiniCPMV` - - MiniCPM-V - - T + I\ :sup:`E+` - - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - - ✅︎ - - ✅︎ - - - * - :code:`MllamaForConditionalGeneration` - - Llama 3.2 - - T + I\ :sup:`+` - - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - - - - - - - * - :code:`MolmoForCausalLM` - - Molmo - - T + I - - :code:`allenai/Molmo-7B-D-0924`, :code:`allenai/Molmo-72B-0924`, etc. - - - - ✅︎ - - ✅︎ - * - :code:`NVLM_D_Model` - - NVLM-D 1.0 - - T + I\ :sup:`E+` - - :code:`nvidia/NVLM-D-72B`, etc. - - - - ✅︎ - - ✅︎ - * - :code:`PaliGemmaForConditionalGeneration` - - PaliGemma, PaliGemma 2 - - T + I\ :sup:`E` - - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, :code:`google/paligemma2-3b-ft-docci-448`, etc. - - - - ✅︎ - - - * - :code:`Phi3VForCausalLM` - - Phi-3-Vision, Phi-3.5-Vision - - T + I\ :sup:`E+` - - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. - - - - ✅︎ - - ✅︎ - * - :code:`PixtralForConditionalGeneration` - - Pixtral - - T + I\ :sup:`+` - - :code:`mistralai/Pixtral-12B-2409`, :code:`mistral-community/pixtral-12b` etc. - - - - ✅︎ - - ✅︎ - * - :code:`QWenLMHeadModel` - - Qwen-VL - - T + I\ :sup:`E+` - - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - - ✅︎ - - ✅︎ - - - * - :code:`Qwen2AudioForConditionalGeneration` - - Qwen2-Audio - - T + A\ :sup:`+` - - :code:`Qwen/Qwen2-Audio-7B-Instruct` - - - - ✅︎ - - - * - :code:`Qwen2VLForConditionalGeneration` - - Qwen2-VL - - T + I\ :sup:`E+` + V\ :sup:`E+` - - :code:`Qwen/QVQ-72B-Preview`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - - ✅︎ - - ✅︎ - - - * - :code:`UltravoxModel` - - Ultravox - - T + A\ :sup:`E+` - - :code:`fixie-ai/ultravox-v0_3` - - - - ✅︎ - - +```{list-table} +:widths: 25 25 15 20 5 5 5 +:header-rows: 1 + +* - Architecture + - Models + - Inputs + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) + - [V1](gh-issue:8779) +* - `AriaForConditionalGeneration` + - Aria + - T + I + - `rhymes-ai/Aria` + - + - ✅︎ + - +* - `Blip2ForConditionalGeneration` + - BLIP-2 + - T + IE + - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. + - + - ✅︎ + - +* - `ChameleonForConditionalGeneration` + - Chameleon + - T + I + - `facebook/chameleon-7b` etc. + - + - ✅︎ + - +* - `FuyuForCausalLM` + - Fuyu + - T + I + - `adept/fuyu-8b` etc. + - + - ✅︎ + - +* - `ChatGLMModel` + - GLM-4V + - T + I + - `THUDM/glm-4v-9b` etc. + - ✅︎ + - ✅︎ + - +* - `H2OVLChatModel` + - H2OVL + - T + IE+ + - `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. + - + - ✅︎ + - +* - `Idefics3ForConditionalGeneration` + - Idefics3 + - T + I + - `HuggingFaceM4/Idefics3-8B-Llama3` etc. + - ✅︎ + - + - +* - `InternVLChatModel` + - InternVL 2.5, Mono-InternVL, InternVL 2.0 + - T + IE+ + - `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. + - + - ✅︎ + - ✅︎ +* - `LlavaForConditionalGeneration` + - LLaVA-1.5 + - T + IE+ + - `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. + - + - ✅︎ + - ✅︎ +* - `LlavaNextForConditionalGeneration` + - LLaVA-NeXT + - T + IE+ + - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. + - + - ✅︎ + - +* - `LlavaNextVideoForConditionalGeneration` + - LLaVA-NeXT-Video + - T + V + - `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. + - + - ✅︎ + - +* - `LlavaOnevisionForConditionalGeneration` + - LLaVA-Onevision + - T + I+ + V+ + - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. + - + - ✅︎ + - +* - `MiniCPMV` + - MiniCPM-V + - T + IE+ + - `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc. + - ✅︎ + - ✅︎ + - +* - `MllamaForConditionalGeneration` + - Llama 3.2 + - T + I+ + - `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc. + - + - + - +* - `MolmoForCausalLM` + - Molmo + - T + I + - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. + - + - ✅︎ + - ✅︎ +* - `NVLM_D_Model` + - NVLM-D 1.0 + - T + IE+ + - `nvidia/NVLM-D-72B`, etc. + - + - ✅︎ + - ✅︎ +* - `PaliGemmaForConditionalGeneration` + - PaliGemma, PaliGemma 2 + - T + IE + - `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. + - + - ✅︎ + - +* - `Phi3VForCausalLM` + - Phi-3-Vision, Phi-3.5-Vision + - T + IE+ + - `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct` etc. + - + - ✅︎ + - ✅︎ +* - `PixtralForConditionalGeneration` + - Pixtral + - T + I+ + - `mistralai/Pixtral-12B-2409`, `mistral-community/pixtral-12b` etc. + - + - ✅︎ + - ✅︎ +* - `QWenLMHeadModel` + - Qwen-VL + - T + IE+ + - `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc. + - ✅︎ + - ✅︎ + - +* - `Qwen2AudioForConditionalGeneration` + - Qwen2-Audio + - T + A+ + - `Qwen/Qwen2-Audio-7B-Instruct` + - + - ✅︎ + - +* - `Qwen2VLForConditionalGeneration` + - Qwen2-VL + - T + IE+ + VE+ + - `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc. + - ✅︎ + - ✅︎ + - +* - `UltravoxModel` + - Ultravox + - T + AE+ + - `fixie-ai/ultravox-v0_3` + - + - ✅︎ + - ``` -```{eval-rst} -:sup:`E` Pre-computed embeddings can be inputted for this modality. - -:sup:`+` Multiple items can be inputted per text prompt for this modality. -``` +E Pre-computed embeddings can be inputted for this modality. ++ Multiple items can be inputted per text prompt for this modality. ````{important} To enable multiple multi-modal items per text prompt, you have to set {code}`limit_mm_per_prompt` (offline inference) @@ -787,38 +778,37 @@ To get the best results, you should use pooling models that are specifically tra The following table lists those that are tested in vLLM. -```{eval-rst} -.. list-table:: - :widths: 25 25 15 25 5 5 - :header-rows: 1 - - * - Architecture - - Models - - Inputs - - Example HF Models - - :ref:`LoRA ` - - :ref:`PP ` - * - :code:`LlavaNextForConditionalGeneration` - - LLaVA-NeXT-based - - T / I - - :code:`royokong/e5-v` - - - - ✅︎ - * - :code:`Phi3VForCausalLM` - - Phi-3-Vision-based - - T + I - - :code:`TIGER-Lab/VLM2Vec-Full` - - 🚧 - - ✅︎ - * - :code:`Qwen2VLForConditionalGeneration` - - Qwen2-VL-based - - T + I - - :code:`MrLight/dse-qwen2-2b-mrl-v1` - - - - ✅︎ +```{list-table} +:widths: 25 25 15 25 5 5 +:header-rows: 1 + +* - Architecture + - Models + - Inputs + - Example HF Models + - [LoRA](#lora-adapter) + - [PP](#distributed-serving) +* - `LlavaNextForConditionalGeneration` + - LLaVA-NeXT-based + - T / I + - `royokong/e5-v` + - + - ✅︎ +* - `Phi3VForCausalLM` + - Phi-3-Vision-based + - T + I + - `TIGER-Lab/VLM2Vec-Full` + - 🚧 + - ✅︎ +* - `Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - `MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ ``` -______________________________________________________________________ +_________________ # Model Support Policy diff --git a/docs/source/quantization/supported_hardware.md b/docs/source/quantization/supported_hardware.md index 843ee21627d78..7330c2f8aa194 100644 --- a/docs/source/quantization/supported_hardware.md +++ b/docs/source/quantization/supported_hardware.md @@ -4,121 +4,120 @@ The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: -```{eval-rst} -.. list-table:: - :header-rows: 1 - :widths: 20 8 8 8 8 8 8 8 8 8 8 +```{list-table} +:header-rows: 1 +:widths: 20 8 8 8 8 8 8 8 8 8 8 - * - Implementation - - Volta - - Turing - - Ampere - - Ada - - Hopper - - AMD GPU - - Intel GPU - - x86 CPU - - AWS Inferentia - - Google TPU - * - AWQ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - GPTQ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - * - Marlin (GPTQ/AWQ/FP8) - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - INT8 (W8A8) - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✅︎ - - ✗ - - ✗ - * - FP8 (W8A8) - - ✗ - - ✗ - - ✗ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - * - AQLM - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - bitsandbytes - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - DeepSpeedFP - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ - * - GGUF - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✅︎ - - ✗ - - ✗ - - ✗ - - ✗ - - ✗ +* - Implementation + - Volta + - Turing + - Ampere + - Ada + - Hopper + - AMD GPU + - Intel GPU + - x86 CPU + - AWS Inferentia + - Google TPU +* - AWQ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ +* - GPTQ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✅︎ + - ✅︎ + - ✗ + - ✗ +* - Marlin (GPTQ/AWQ/FP8) + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +* - INT8 (W8A8) + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✅︎ + - ✗ + - ✗ +* - FP8 (W8A8) + - ✗ + - ✗ + - ✗ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ +* - AQLM + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +* - bitsandbytes + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +* - DeepSpeedFP + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ +* - GGUF + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✅︎ + - ✗ + - ✗ + - ✗ + - ✗ + - ✗ ``` ## Notes: diff --git a/docs/source/serving/deploying_with_helm.md b/docs/source/serving/deploying_with_helm.md index 3b26575827011..7286a0a88968f 100644 --- a/docs/source/serving/deploying_with_helm.md +++ b/docs/source/serving/deploying_with_helm.md @@ -43,209 +43,208 @@ chart **including persistent volumes** and deletes the release. ## Values -```{eval-rst} -.. list-table:: Values - :widths: 25 25 25 25 - :header-rows: 1 - - * - Key - - Type - - Default - - Description - * - autoscaling - - object - - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} - - Autoscaling configuration - * - autoscaling.enabled - - bool - - false - - Enable autoscaling - * - autoscaling.maxReplicas - - int - - 100 - - Maximum replicas - * - autoscaling.minReplicas - - int - - 1 - - Minimum replicas - * - autoscaling.targetCPUUtilizationPercentage - - int - - 80 - - Target CPU utilization for autoscaling - * - configs - - object - - {} - - Configmap - * - containerPort - - int - - 8000 - - Container port - * - customObjects - - list - - [] - - Custom Objects configuration - * - deploymentStrategy - - object - - {} - - Deployment strategy configuration - * - externalConfigs - - list - - [] - - External configuration - * - extraContainers - - list - - [] - - Additional containers configuration - * - extraInit - - object - - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} - - Additional configuration for the init container - * - extraInit.pvcStorage - - string - - "50Gi" - - Storage size of the s3 - * - extraInit.s3modelpath - - string - - "relative_s3_model_path/opt-125m" - - Path of the model on the s3 which hosts model weights and config files - * - extraInit.awsEc2MetadataDisabled - - boolean - - true - - Disables the use of the Amazon EC2 instance metadata service - * - extraPorts - - list - - [] - - Additional ports configuration - * - gpuModels - - list - - ["TYPE_GPU_USED"] - - Type of gpu used - * - image - - object - - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} - - Image configuration - * - image.command - - list - - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] - - Container launch command - * - image.repository - - string - - "vllm/vllm-openai" - - Image repository - * - image.tag - - string - - "latest" - - Image tag - * - livenessProbe - - object - - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} - - Liveness probe configuration - * - livenessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive - * - livenessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server - * - livenessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server - * - livenessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening - * - livenessProbe.initialDelaySeconds - - int - - 15 - - Number of seconds after the container has started before liveness probe is initiated - * - livenessProbe.periodSeconds - - int - - 10 - - How often (in seconds) to perform the liveness probe - * - maxUnavailablePodDisruptionBudget - - string - - "" - - Disruption Budget Configuration - * - readinessProbe - - object - - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} - - Readiness probe configuration - * - readinessProbe.failureThreshold - - int - - 3 - - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready - * - readinessProbe.httpGet - - object - - {"path":"/health","port":8000} - - Configuration of the Kubelet http request on the server - * - readinessProbe.httpGet.path - - string - - "/health" - - Path to access on the HTTP server - * - readinessProbe.httpGet.port - - int - - 8000 - - Name or number of the port to access on the container, on which the server is listening - * - readinessProbe.initialDelaySeconds - - int - - 5 - - Number of seconds after the container has started before readiness probe is initiated - * - readinessProbe.periodSeconds - - int - - 5 - - How often (in seconds) to perform the readiness probe - * - replicaCount - - int - - 1 - - Number of replicas - * - resources - - object - - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} - - Resource configuration - * - resources.limits."nvidia.com/gpu" - - int - - 1 - - Number of gpus used - * - resources.limits.cpu - - int - - 4 - - Number of CPUs - * - resources.limits.memory - - string - - "16Gi" - - CPU memory configuration - * - resources.requests."nvidia.com/gpu" - - int - - 1 - - Number of gpus used - * - resources.requests.cpu - - int - - 4 - - Number of CPUs - * - resources.requests.memory - - string - - "16Gi" - - CPU memory configuration - * - secrets - - object - - {} - - Secrets configuration - * - serviceName - - string - - - - Service name - * - servicePort - - int - - 80 - - Service port - * - labels.environment - - string - - test - - Environment name - * - labels.release - - string - - test - - Release name +```{list-table} +:widths: 25 25 25 25 +:header-rows: 1 + +* - Key + - Type + - Default + - Description +* - autoscaling + - object + - {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80} + - Autoscaling configuration +* - autoscaling.enabled + - bool + - false + - Enable autoscaling +* - autoscaling.maxReplicas + - int + - 100 + - Maximum replicas +* - autoscaling.minReplicas + - int + - 1 + - Minimum replicas +* - autoscaling.targetCPUUtilizationPercentage + - int + - 80 + - Target CPU utilization for autoscaling +* - configs + - object + - {} + - Configmap +* - containerPort + - int + - 8000 + - Container port +* - customObjects + - list + - [] + - Custom Objects configuration +* - deploymentStrategy + - object + - {} + - Deployment strategy configuration +* - externalConfigs + - list + - [] + - External configuration +* - extraContainers + - list + - [] + - Additional containers configuration +* - extraInit + - object + - {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true} + - Additional configuration for the init container +* - extraInit.pvcStorage + - string + - "50Gi" + - Storage size of the s3 +* - extraInit.s3modelpath + - string + - "relative_s3_model_path/opt-125m" + - Path of the model on the s3 which hosts model weights and config files +* - extraInit.awsEc2MetadataDisabled + - boolean + - true + - Disables the use of the Amazon EC2 instance metadata service +* - extraPorts + - list + - [] + - Additional ports configuration +* - gpuModels + - list + - ["TYPE_GPU_USED"] + - Type of gpu used +* - image + - object + - {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} + - Image configuration +* - image.command + - list + - ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"] + - Container launch command +* - image.repository + - string + - "vllm/vllm-openai" + - Image repository +* - image.tag + - string + - "latest" + - Image tag +* - livenessProbe + - object + - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10} + - Liveness probe configuration +* - livenessProbe.failureThreshold + - int + - 3 + - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive +* - livenessProbe.httpGet + - object + - {"path":"/health","port":8000} + - Configuration of the Kubelet http request on the server +* - livenessProbe.httpGet.path + - string + - "/health" + - Path to access on the HTTP server +* - livenessProbe.httpGet.port + - int + - 8000 + - Name or number of the port to access on the container, on which the server is listening +* - livenessProbe.initialDelaySeconds + - int + - 15 + - Number of seconds after the container has started before liveness probe is initiated +* - livenessProbe.periodSeconds + - int + - 10 + - How often (in seconds) to perform the liveness probe +* - maxUnavailablePodDisruptionBudget + - string + - "" + - Disruption Budget Configuration +* - readinessProbe + - object + - {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5} + - Readiness probe configuration +* - readinessProbe.failureThreshold + - int + - 3 + - Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready +* - readinessProbe.httpGet + - object + - {"path":"/health","port":8000} + - Configuration of the Kubelet http request on the server +* - readinessProbe.httpGet.path + - string + - "/health" + - Path to access on the HTTP server +* - readinessProbe.httpGet.port + - int + - 8000 + - Name or number of the port to access on the container, on which the server is listening +* - readinessProbe.initialDelaySeconds + - int + - 5 + - Number of seconds after the container has started before readiness probe is initiated +* - readinessProbe.periodSeconds + - int + - 5 + - How often (in seconds) to perform the readiness probe +* - replicaCount + - int + - 1 + - Number of replicas +* - resources + - object + - {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}} + - Resource configuration +* - resources.limits."nvidia.com/gpu" + - int + - 1 + - Number of gpus used +* - resources.limits.cpu + - int + - 4 + - Number of CPUs +* - resources.limits.memory + - string + - "16Gi" + - CPU memory configuration +* - resources.requests."nvidia.com/gpu" + - int + - 1 + - Number of gpus used +* - resources.requests.cpu + - int + - 4 + - Number of CPUs +* - resources.requests.memory + - string + - "16Gi" + - CPU memory configuration +* - secrets + - object + - {} + - Secrets configuration +* - serviceName + - string + - + - Service name +* - servicePort + - int + - 80 + - Service port +* - labels.environment + - string + - test + - Environment name +* - labels.release + - string + - test + - Release name ``` From dba4d9dec606da028fbb28240e99cabd5a761e6a Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 29 Dec 2024 17:03:49 +0800 Subject: [PATCH 022/462] [v1][bugfix] fix cudagraph with inplace buffer assignment (#11596) Signed-off-by: youkaichao --- vllm/compilation/wrapper.py | 10 +++++++++- vllm/model_executor/layers/rotary_embedding.py | 11 +---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index c10241b483169..e3260a10c02ae 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -28,11 +28,12 @@ def __init__(self, compiled_callable: Optional[Callable] = None, compilation_level: int = 0): + vllm_config = get_current_vllm_config() + self.vllm_config = vllm_config if compiled_callable is None: # default compilation settings # compiling the forward method - vllm_config = get_current_vllm_config() backend = vllm_config.compilation_config.init_backend(vllm_config) compiled_callable = torch.compile( @@ -82,6 +83,13 @@ def bytecode_hook(self, old_code: CodeType, new_code: CodeType): self.compiled_codes.append(new_code) + if self.vllm_config.compilation_config.use_cudagraph and \ + "update" in new_code.co_names: + import depyf + src = depyf.decompile(new_code) + msg = "Assigning / modifying buffers of nn.Module during forward pass is not allowed when using cudagraph inside the compiler because it will cause silent errors. Please use eager mode or fix the code. The following code contains clues about which buffer is being modified (please search for the usage of the function `update`):\n" + src # noqa + raise RuntimeError(msg) + @contextmanager def dispatch_to_code(self, index: int): """Context manager to dispatch to the compiled code. diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 117fe086e5e87..6695d44dfa32b 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -541,19 +541,12 @@ def __init__( short_cache = self._compute_cos_sin_cache( original_max_position_embeddings, short_factor, short_mscale) short_cache = short_cache.to(dtype) - self.register_buffer("short_cos_sin_cache", - short_cache, - persistent=False) long_cache = self._compute_cos_sin_cache(max_position_embeddings, long_factor, long_mscale) long_cache = long_cache.to(dtype) - self.register_buffer("long_cos_sin_cache", - long_cache, - persistent=False) - long_short_cache = torch.cat( - [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0) + long_short_cache = torch.cat([short_cache, long_cache], dim=0) self.register_buffer("long_short_cos_sin_cache", long_short_cache, persistent=False) @@ -593,8 +586,6 @@ def forward( torch.full_like(positions, k)).long() idx = (torch.add(positions, long_prompt_offset) if long_prompt_offset is not None else positions) - self.long_short_cos_sin_cache: torch.Tensor = ( - self.long_short_cos_sin_cache.to(idx.device)) idx = torch.add(idx, offsets) if offsets is not None else idx cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx) From faef77c0d69c5429182f475a57127676e6bcb230 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 29 Dec 2024 10:08:09 -0600 Subject: [PATCH 023/462] [Misc] KV cache transfer connector registry (#11481) Signed-off-by: KuntaiDu --- vllm/config.py | 8 ---- .../kv_transfer/kv_connector/factory.py | 48 +++++++++++++++---- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 6ae1d4d944447..8e556743c8528 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2559,14 +2559,6 @@ def from_cli(cls, cli_value: str) -> "KVTransferConfig": return KVTransferConfig.model_validate_json(cli_value) def model_post_init(self, __context: Any) -> None: - supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"] - if all([ - self.kv_connector is not None, self.kv_connector - not in supported_kv_connector - ]): - raise ValueError(f"Unsupported kv_connector: {self.kv_connector}. " - f"Supported connectors are " - f"{supported_kv_connector}.") if self.kv_role is not None and self.kv_role not in [ "kv_producer", "kv_consumer", "kv_both" diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 3e2bb436d24b5..6372dab726086 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING +import importlib +from typing import TYPE_CHECKING, Callable, Dict, Type from .base import KVConnectorBase @@ -7,14 +8,41 @@ class KVConnectorFactory: + _registry: Dict[str, Callable[[], Type[KVConnectorBase]]] = {} - @staticmethod - def create_connector(rank: int, local_rank: int, + @classmethod + def register_connector(cls, name: str, module_path: str, + class_name: str) -> None: + """Register a connector with a lazy-loading module and class name.""" + if name in cls._registry: + raise ValueError(f"Connector '{name}' is already registered.") + + def loader() -> Type[KVConnectorBase]: + module = importlib.import_module(module_path) + return getattr(module, class_name) + + cls._registry[name] = loader + + @classmethod + def create_connector(cls, rank: int, local_rank: int, config: "VllmConfig") -> KVConnectorBase: - supported_kv_connector = ["PyNcclConnector", "MooncakeConnector"] - if config.kv_transfer_config.kv_connector in supported_kv_connector: - from .simple_connector import SimpleConnector - return SimpleConnector(rank, local_rank, config) - else: - raise ValueError(f"Unsupported connector type: " - f"{config.kv_connector}") + connector_name = config.kv_transfer_config.kv_connector + if connector_name not in cls._registry: + raise ValueError(f"Unsupported connector type: {connector_name}") + + connector_cls = cls._registry[connector_name]() + return connector_cls(rank, local_rank, config) + + +# Register various connectors here. +# The registration should not be done in each individual file, as we want to +# only load the files corresponding to the current connector. +KVConnectorFactory.register_connector( + "PyNcclConnector", + "vllm.distributed.kv_transfer.kv_connector.simple_connector", + "SimpleConnector") + +KVConnectorFactory.register_connector( + "MooncakeConnector", + "vllm.distributed.kv_transfer.kv_connector.simple_connector", + "SimpleConnector") From 0aa38d16f56327622c1689d7510171662757deee Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Sun, 29 Dec 2024 15:16:46 -0500 Subject: [PATCH 024/462] Remove print statement in DeepseekScalingRotaryEmbedding (#11604) --- vllm/model_executor/layers/rotary_embedding.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 6695d44dfa32b..3fcd81a3c4213 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -668,7 +668,6 @@ def _compute_cos_sin_cache(self) -> torch.Tensor: cos = (freqs.cos() * self.mscale) sin = (freqs.sin() * self.mscale) cache = torch.cat((cos, sin), dim=-1) - print("Cache shape", cache.shape) return cache def forward( From 3682e33f9ff9d8baade6112a8e75a77da898f504 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 30 Dec 2024 12:24:12 +0800 Subject: [PATCH 025/462] [v1] fix compilation cache (#11598) Signed-off-by: youkaichao --- tests/compile/piecewise/test_toy_llama.py | 15 +++++++- vllm/compilation/backends.py | 22 ++++++----- vllm/config.py | 45 +++++++++++++++++++++-- vllm/v1/worker/gpu_worker.py | 1 + 4 files changed, 69 insertions(+), 14 deletions(-) diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 07c10a3a18c55..d4ede4d2320a7 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -7,7 +7,7 @@ initialized randomly with a fixed seed. """ from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Any, List, Optional, Tuple import torch from torch import nn @@ -54,6 +54,16 @@ class LlamaConfig: tractable_init: bool = False random_seed: int = 0 + def compute_hash(self) -> str: + factors: List[Any] = [] + for k, v in self.__dict__.items(): + if k == "random_seed": + continue + factors.append((k, v)) + factors.sort() + import hashlib + return hashlib.md5(str(factors).encode()).hexdigest() + def __post_init__(self): assert self.mlp_size >= self.hidden_size @@ -263,7 +273,8 @@ def run_model(llama_config, compilation_config = CompilationConfig( level=CompilationLevel.NO_COMPILATION, ) - vllm_config = VllmConfig(compilation_config=compilation_config) + vllm_config = VllmConfig(compilation_config=compilation_config, + additional_config=llama_config) with set_current_vllm_config(vllm_config): model = LlamaModel(config=llama_config, vllm_config=vllm_config, diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 4f960b441f21d..a8dd628b9cd6f 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -619,8 +619,10 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, # the entries for different shapes that we need to either # compile or capture cudagraph self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} - self.to_be_compiled_sizes: Set[int] = self.compile_sizes.union( - self.capture_sizes) + + # to_be_compiled_sizes tracks the remaining sizes to compile, + # and updates during the compilation process, so we need to copy it + self.to_be_compiled_sizes: Set[int] = self.compile_sizes.copy() for shape in self.compile_sizes.union(self.capture_sizes): self.concrete_size_entries[shape] = ConcreteSizeEntry( runtime_shape=shape, @@ -628,12 +630,17 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig, use_cudagraph=shape in self.capture_sizes, ) + def check_for_ending_compilation(self): + if self.is_last_graph and not self.to_be_compiled_sizes: + # no specific sizes to compile + # save the hash of the inductor graph for the next run + self.compilation_config.inductor_hash_cache.save_to_file() + end_monitoring_torch_compile(self.vllm_config) + def __call__(self, *args) -> Any: if not self.first_run_finished: self.first_run_finished = True - # no specific sizes to compile - if self.is_last_graph and not self.to_be_compiled_sizes: - end_monitoring_torch_compile(self.vllm_config) + self.check_for_ending_compilation() return self.compiled_graph_for_general_shape(*args) runtime_shape = args[self.sym_shape_indices[0]] @@ -662,10 +669,7 @@ def __call__(self, *args) -> Any: # finished compilations for all required shapes if self.is_last_graph and not self.to_be_compiled_sizes: - - # save the hash of the inductor graph for the next run - self.compilation_config.inductor_hash_cache.save_to_file() - end_monitoring_torch_compile(self.vllm_config) + self.check_for_ending_compilation() if not entry.use_cudagraph: return entry.runnable(*args) diff --git a/vllm/config.py b/vllm/config.py index 8e556743c8528..765a46e6aeee3 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -9,8 +9,8 @@ from dataclasses import dataclass, field, replace from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Counter, Dict, - Final, List, Literal, Mapping, Optional, Set, Tuple, Type, - Union) + Final, List, Literal, Mapping, Optional, Protocol, Set, + Tuple, Type, Union) import torch from pydantic import BaseModel, Field, PrivateAttr @@ -75,6 +75,12 @@ PretrainedConfig]] +class SupportsHash(Protocol): + + def compute_hash(self) -> str: + ... + + class ModelConfig: """Configuration for the model. @@ -2969,6 +2975,10 @@ class VllmConfig: init=True) # type: ignore kv_transfer_config: KVTransferConfig = field(default=None, init=True) # type: ignore + # some opaque config, only used to provide additional information + # for the hash computation, mainly used for testing and debugging. + additional_config: SupportsHash = field(default=None, + init=True) # type: ignore instance_id: str = "" def compute_hash(self) -> str: @@ -3000,33 +3010,62 @@ def compute_hash(self) -> str: vllm_factors.append(__version__) if self.model_config: vllm_factors.append(self.model_config.compute_hash()) + else: + vllm_factors.append("None") if self.cache_config: vllm_factors.append(self.cache_config.compute_hash()) + else: + vllm_factors.append("None") if self.parallel_config: vllm_factors.append(self.parallel_config.compute_hash()) + else: + vllm_factors.append("None") if self.scheduler_config: vllm_factors.append(self.scheduler_config.compute_hash()) + else: + vllm_factors.append("None") if self.device_config: vllm_factors.append(self.device_config.compute_hash()) + else: + vllm_factors.append("None") if self.load_config: vllm_factors.append(self.load_config.compute_hash()) + else: + vllm_factors.append("None") if self.lora_config: vllm_factors.append(self.lora_config.compute_hash()) + else: + vllm_factors.append("None") if self.speculative_config: vllm_factors.append(self.speculative_config.compute_hash()) + else: + vllm_factors.append("None") if self.decoding_config: vllm_factors.append(self.decoding_config.compute_hash()) + else: + vllm_factors.append("None") if self.observability_config: vllm_factors.append(self.observability_config.compute_hash()) + else: + vllm_factors.append("None") if self.prompt_adapter_config: vllm_factors.append(self.prompt_adapter_config.compute_hash()) + else: + vllm_factors.append("None") if self.quant_config: pass # should be captured by model_config.quantization if self.compilation_config: vllm_factors.append(self.compilation_config.compute_hash()) + else: + vllm_factors.append("None") if self.kv_transfer_config: vllm_factors.append(self.kv_transfer_config.compute_hash()) - + else: + vllm_factors.append("None") + if self.additional_config: + vllm_factors.append(self.additional_config.compute_hash()) + else: + vllm_factors.append("None") factors.append(vllm_factors) hash_str = hashlib.md5(str(factors).encode()).hexdigest()[:10] diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 0000b09bfaa36..af438f7d5820c 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -48,6 +48,7 @@ def __init__( self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config + self.parallel_config.rank = rank self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method From 628ec6c17b8121517e8f303b64567573036cdb38 Mon Sep 17 00:00:00 2001 From: Liangfu Chen Date: Sun, 29 Dec 2024 21:46:14 -0800 Subject: [PATCH 026/462] [Docker] bump up neuron sdk v2.21 (#11593) Signed-off-by: Liangfu Chen --- Dockerfile.neuron | 6 +++--- requirements-neuron.txt | 4 ++-- vllm/_custom_ops.py | 3 +-- vllm/triton_utils/importing.py | 1 - 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 77162bc82de62..269139fe90f0b 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -1,6 +1,6 @@ # default base image # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx -ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.1.2-neuronx-py310-sdk2.20.2-ubuntu20.04" +ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.21.0-ubuntu22.04" FROM $BASE_IMAGE @@ -22,9 +22,9 @@ WORKDIR ${APP_MOUNT}/vllm RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas -RUN python3 -m pip install sentencepiece transformers==4.36.2 -U +RUN python3 -m pip install sentencepiece transformers==4.45.2 -U RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U -RUN python3 -m pip install --pre neuronx-cc==2.15.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U +RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U COPY . . ARG GIT_REPO_CHECK=0 diff --git a/requirements-neuron.txt b/requirements-neuron.txt index 148fdbe0d6310..5e08d101fcd61 100644 --- a/requirements-neuron.txt +++ b/requirements-neuron.txt @@ -2,6 +2,6 @@ -r requirements-common.txt # Dependencies for Neuron devices -transformers-neuronx >= 0.12.0 -torch-neuronx >= 2.1.2 +transformers-neuronx >= 0.13.0 +torch-neuronx >= 2.5.0 neuronx-cc diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index aeacf5dda5761..eb2f69df42624 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -23,8 +23,7 @@ import vllm._moe_C # noqa: F401 supports_moe_ops = True -# neuron has torch version that doesn't even have impl_abstract -if TYPE_CHECKING or current_platform.is_neuron(): +if TYPE_CHECKING: def register_fake(fn): return lambda name: fn diff --git a/vllm/triton_utils/importing.py b/vllm/triton_utils/importing.py index 36315abcdfcda..0c96e0632f646 100644 --- a/vllm/triton_utils/importing.py +++ b/vllm/triton_utils/importing.py @@ -8,7 +8,6 @@ HAS_TRITON = ( find_spec("triton") is not None and not current_platform.is_xpu() # Not compatible - and not current_platform.is_neuron() # neuron has too old torch ) if not HAS_TRITON: From 970d6d0776076f17604077ba4d484cdadd604ceb Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Mon, 30 Dec 2024 04:22:13 -0500 Subject: [PATCH 027/462] [Build][Kernel] Update CUTLASS to v3.6.0 (#11607) Signed-off-by: Tyler Michael Smith --- CMakeLists.txt | 4 ++-- .../vllm_cutlass_library_extension.py | 18 +++++++++--------- csrc/quantization/machete/generate.py | 8 ++++---- .../machete/machete_collective_builder.cuh | 10 ++++------ csrc/quantization/machete/machete_mainloop.cuh | 11 ++++------- .../machete/machete_prepacked_layout.cuh | 5 ++--- 6 files changed, 25 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 83c8033434f3b..3206d76125545 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -223,13 +223,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") FetchContent_Declare( cutlass GIT_REPOSITORY https://github.com/nvidia/cutlass.git - GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227 + GIT_TAG v3.6.0 GIT_PROGRESS TRUE # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history. # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags. # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE - GIT_SHALLOW FALSE + GIT_SHALLOW TRUE ) endif() FetchContent_MakeAvailable(cutlass) diff --git a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py index a5beea1a35e49..b401736c9824b 100644 --- a/csrc/cutlass_extensions/vllm_cutlass_library_extension.py +++ b/csrc/cutlass_extensions/vllm_cutlass_library_extension.py @@ -14,9 +14,9 @@ class VLLMDataType(enum.Enum): class MixedInputKernelScheduleType(enum.Enum): - TmaWarpSpecializedMixedInput = enum_auto() - TmaWarpSpecializedPingpongMixedInput = enum_auto() - TmaWarpSpecializedCooperativeMixedInput = enum_auto() + TmaWarpSpecialized = enum_auto() + TmaWarpSpecializedPingpong = enum_auto() + TmaWarpSpecializedCooperative = enum_auto() VLLMDataTypeNames: Dict[Union[VLLMDataType, DataType], str] = { @@ -68,11 +68,11 @@ class MixedInputKernelScheduleType(enum.Enum): MixedInputKernelScheduleType, KernelScheduleType], str] = { **KernelScheduleTag, # type: ignore **{ - MixedInputKernelScheduleType.TmaWarpSpecializedMixedInput: - "cutlass::gemm::KernelTmaWarpSpecializedMixedInput", - MixedInputKernelScheduleType.TmaWarpSpecializedPingpongMixedInput: - "cutlass::gemm::KernelTmaWarpSpecializedPingpongMixedInput", - MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput: - "cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput", + MixedInputKernelScheduleType.TmaWarpSpecialized: + "cutlass::gemm::KernelTmaWarpSpecialized", + MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: + "cutlass::gemm::KernelTmaWarpSpecializedPingpong", + MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: + "cutlass::gemm::KernelTmaWarpSpecializedCooperative", } } diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index ac63afe79a255..2df4d181902f8 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -189,7 +189,7 @@ {{DataTypeTag[t.b_group_zeropoint]}}, // GroupZeroT {{DataTypeTag[t.b_channel_scale]}}, // ChannelScaleT {{DataTypeTag[t.a_token_scale]}}, // TokenScaleT - cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput, + cutlass::gemm::KernelTmaWarpSpecializedCooperative, Sch>; {% for sch in schs %} @@ -223,7 +223,7 @@ {{DataTypeTag[t.convert]}}, // ElementConvert {{DataTypeTag[t.accumulator]}}, // Accumulator cutlass::layout::ColumnMajor, - cutlass::gemm::KernelTmaWarpSpecializedCooperativeMixedInput> + cutlass::gemm::KernelTmaWarpSpecializedCooperative> >(args.B); } {%- endfor %} @@ -239,7 +239,7 @@ }; // namespace machete """ -TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperativeMixedInput +TmaMI = MixedInputKernelScheduleType.TmaWarpSpecializedCooperative TmaCoop = EpilogueScheduleType.TmaWarpSpecializedCooperative @@ -300,7 +300,7 @@ def generate_sch_sig(schedule_config: ScheduleConfig) -> str: # mostly unique shorter sch_sig def generate_terse_sch_sig(schedule_config: ScheduleConfig) -> str: kernel_terse_names_replace = { - "KernelTmaWarpSpecializedCooperativeMixedInput_": "TmaMI_", + "KernelTmaWarpSpecializedCooperative": "TmaMI_", "TmaWarpSpecializedCooperative_": "TmaCoop_", "StreamKScheduler": "streamK", } diff --git a/csrc/quantization/machete/machete_collective_builder.cuh b/csrc/quantization/machete/machete_collective_builder.cuh index a74cf8b2dd455..ee825583dee1a 100644 --- a/csrc/quantization/machete/machete_collective_builder.cuh +++ b/csrc/quantization/machete/machete_collective_builder.cuh @@ -18,16 +18,14 @@ struct VLLMCollectiveBuilder< ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType, KernelScheduleType, cute::enable_if_t<( + cute::is_same_v || + cute::is_same_v || cute::is_same_v || - cute::is_same_v || - cute::is_same_v)>> { + KernelTmaWarpSpecializedCooperative>)>> { using CollectiveOp = machete::MacheteCollectiveMma< ElementPairA_, GmemLayoutA_, AlignmentA, ElementPairB_, GmemLayoutB_, AlignmentB, ElementAccumulator, TileShape_MNK, ClusterShape_MNK, StageCountType, KernelScheduleType>; }; -}; // namespace cutlass::gemm::collective \ No newline at end of file +}; // namespace cutlass::gemm::collective diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh index 816f33a1078e5..4071b19a3564d 100644 --- a/csrc/quantization/machete/machete_mainloop.cuh +++ b/csrc/quantization/machete/machete_mainloop.cuh @@ -66,13 +66,11 @@ struct MacheteCollectiveMma { using Schedule = KernelScheduleType; static_assert( cute::is_same_v || - cute::is_same_v || + cute::is_same_v || + cute::is_same_v || cute::is_same_v || - cute::is_same_v || cute::is_same_v || - cute::is_same_v, + cute::is_same_v, "KernelSchedule must be one of the warp specialized policies"); public: @@ -113,8 +111,7 @@ struct MacheteCollectiveMma { // For coop schedules we have two warp groups cooperatively issuing wgmma // instructions so we use 2 atoms along the M dim (one for each warpgroup) using AtomLayoutMNK = cute::conditional_t< - cute::is_same_v, + cute::is_same_v, Layout>, Layout>>; using TiledMma = decltype(cute::make_tiled_mma( diff --git a/csrc/quantization/machete/machete_prepacked_layout.cuh b/csrc/quantization/machete/machete_prepacked_layout.cuh index 680a858a893c1..81aaa6c4f3a28 100644 --- a/csrc/quantization/machete/machete_prepacked_layout.cuh +++ b/csrc/quantization/machete/machete_prepacked_layout.cuh @@ -98,8 +98,7 @@ struct PrepackedLayoutBTemplate { // For coop schedules we have two warp groups cooperatively issuing wgmma // instructions so we use 2 atoms along the M dim (one for each warpgroup) using AtomLayoutMNK = cute::conditional_t< - cute::is_same_v, + cute::is_same_v, Layout>, Layout>>; using TiledMma = decltype(cute::make_tiled_mma( @@ -247,4 +246,4 @@ struct PrepackedLayoutBTemplate { } }; -}; // namespace machete \ No newline at end of file +}; // namespace machete From 5dbf854553cb6ac97f0c633ed36ba64e0fc9bb29 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Mon, 30 Dec 2024 18:17:04 +0800 Subject: [PATCH 028/462] [CI/Build][CPU] Fix CPU CI by lazy importing triton FP8 kernels (#11618) Signed-off-by: jiang1.li --- vllm/model_executor/layers/quantization/fp8.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 7f779ac8d3b3e..2fe22903a385b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,8 +15,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_w8a8_block_fp8_linear) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -337,6 +335,9 @@ def apply(self, size_k=layer.input_size_per_partition, bias=bias) + # Note: lazy import to avoid triton import error. + from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_w8a8_block_fp8_linear) if self.block_quant: assert self.quant_config.weight_block_size is not None return apply_w8a8_block_fp8_linear( From b12e87f942eb7740c17ab546b964bc327afdda37 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 30 Dec 2024 20:24:45 +0800 Subject: [PATCH 029/462] [platforms] enable platform plugins (#11602) Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 25 +- docs/source/design/plugin_system.md | 6 +- tests/conftest.py | 2 +- tests/kernels/test_attention_selector.py | 16 +- .../plugins/vllm_add_dummy_platform/setup.py | 11 + .../vllm_add_dummy_platform/__init__.py | 5 + .../vllm_add_dummy_platform/dummy_platform.py | 5 + tests/plugins_tests/test_platform_plugins.py | 16 + vllm/config.py | 15 +- vllm/distributed/parallel_state.py | 3 +- vllm/engine/arg_utils.py | 2 +- vllm/executor/ray_utils.py | 2 +- .../guided_decoding/__init__.py | 3 +- vllm/model_executor/models/registry.py | 2 +- vllm/model_executor/utils.py | 4 +- vllm/platforms/__init__.py | 320 ++++++++++++------ vllm/plugins/__init__.py | 72 ++-- vllm/spec_decode/metrics.py | 2 +- vllm/usage/usage_lib.py | 2 +- vllm/utils.py | 8 +- vllm/worker/model_runner_base.py | 5 +- vllm/worker/multi_step_model_runner.py | 1 + vllm/worker/worker_base.py | 14 +- 23 files changed, 360 insertions(+), 181 deletions(-) create mode 100644 tests/plugins/vllm_add_dummy_platform/setup.py create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py create mode 100644 tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py create mode 100644 tests/plugins_tests/test_platform_plugins.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b563c96343f92..bee968b4d2e43 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -106,14 +106,12 @@ steps: source_file_dependencies: - vllm/ commands: - - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py - - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -333,8 +331,6 @@ steps: - vllm/ - tests/models commands: - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_registry.py - pytest -v -s models/test_initialization.py @@ -469,11 +465,28 @@ steps: - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py +- label: Plugin Tests (2 GPUs) # 40min + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + fast_check: true + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # other tests continue here: + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - label: Multi-step Tests (4 GPUs) # 36min working_dir: "/vllm-workspace/tests" num_gpus: 4 diff --git a/docs/source/design/plugin_system.md b/docs/source/design/plugin_system.md index 79aff757518f2..225030885f629 100644 --- a/docs/source/design/plugin_system.md +++ b/docs/source/design/plugin_system.md @@ -41,9 +41,11 @@ Every plugin has three parts: 2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the `entry_points` dictionary. In the example above, the plugin name is `register_dummy_model`. Plugins can be filtered by their names using the `VLLM_PLUGINS` environment variable. To load only a specific plugin, set `VLLM_PLUGINS` to the plugin name. 3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is `vllm_add_dummy_model:register`, which refers to a function named `register` in the `vllm_add_dummy_model` module. -## What Can Plugins Do? +## Types of supported plugins -Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. +- **General plugins** (with group name `vllm.general_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling `ModelRegistry.register_model` to register the model inside the plugin function. + +- **Platform plugins** (with group name `vllm.platform_plugins`): The primary use case for these plugins is to register custom, out-of-the-tree platforms into vLLM. The plugin function should return `None` when the platform is not supported in the current environment, or the platform class's fully qualified name when the platform is supported. ## Guidelines for Writing Plugins diff --git a/tests/conftest.py b/tests/conftest.py index 4e939221329cd..6e2f75e33654f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -31,7 +31,6 @@ to_enc_dec_tuple_list, zip_enc_dec_prompts) from vllm.logger import init_logger from vllm.outputs import RequestOutput -from vllm.platforms import current_platform from vllm.sampling_params import BeamSearchParams from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, identity) @@ -242,6 +241,7 @@ def video_assets() -> _VideoAssets: class HfRunner: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + from vllm.platforms import current_platform if x is None or isinstance(x, (bool, )): return x diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index d37f95d48d5b2..916cc2efa3895 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -5,7 +5,10 @@ from tests.kernels.utils import override_backend_env_variable from vllm.attention.selector import which_attn_to_use -from vllm.platforms import cpu, cuda, openvino, rocm +from vllm.platforms.cpu import CpuPlatform +from vllm.platforms.cuda import CudaPlatform +from vllm.platforms.openvino import OpenVinoPlatform +from vllm.platforms.rocm import RocmPlatform from vllm.utils import STR_FLASH_ATTN_VAL, STR_INVALID_VAL @@ -20,26 +23,23 @@ def test_env(name: str, device: str, monkeypatch): override_backend_env_variable(monkeypatch, name) if device == "cpu": - with patch("vllm.attention.selector.current_platform", - cpu.CpuPlatform()): + with patch("vllm.attention.selector.current_platform", CpuPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "TORCH_SDPA" elif device == "hip": - with patch("vllm.attention.selector.current_platform", - rocm.RocmPlatform()): + with patch("vllm.attention.selector.current_platform", RocmPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "ROCM_FLASH" elif device == "openvino": with patch("vllm.attention.selector.current_platform", - openvino.OpenVinoPlatform()): + OpenVinoPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == "OPENVINO" else: - with patch("vllm.attention.selector.current_platform", - cuda.CudaPlatform()): + with patch("vllm.attention.selector.current_platform", CudaPlatform()): backend = which_attn_to_use(16, torch.float16, torch.float16, 16, False) assert backend.name == name diff --git a/tests/plugins/vllm_add_dummy_platform/setup.py b/tests/plugins/vllm_add_dummy_platform/setup.py new file mode 100644 index 0000000000000..31639906898db --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/setup.py @@ -0,0 +1,11 @@ +from setuptools import setup + +setup( + name='vllm_add_dummy_platform', + version='0.1', + packages=['vllm_add_dummy_platform'], + entry_points={ + 'vllm.platform_plugins': [ + "dummy_platform_plugin = vllm_add_dummy_platform:dummy_platform_plugin" # noqa + ] + }) diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py new file mode 100644 index 0000000000000..594cef520a7de --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/__init__.py @@ -0,0 +1,5 @@ +from typing import Optional + + +def dummy_platform_plugin() -> Optional[str]: + return "vllm_add_dummy_platform.dummy_platform.DummyPlatform" diff --git a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py new file mode 100644 index 0000000000000..fde93142f1103 --- /dev/null +++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py @@ -0,0 +1,5 @@ +from vllm.platforms.cuda import CudaPlatform + + +class DummyPlatform(CudaPlatform): + device_name = "DummyDevice" diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py new file mode 100644 index 0000000000000..0d27cf9f152e0 --- /dev/null +++ b/tests/plugins_tests/test_platform_plugins.py @@ -0,0 +1,16 @@ +def test_platform_plugins(): + # simulate workload by running an example + import runpy + current_file = __file__ + import os + example_file = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(current_file))), + "examples", "offline_inference.py") + runpy.run_path(example_file) + + # check if the plugin is loaded correctly + from vllm.platforms import _init_trace, current_platform + assert current_platform.device_name == "DummyDevice", ( + f"Expected DummyDevice, got {current_platform.device_name}, " + "possibly because current_platform is imported before the plugin" + f" is loaded. The first import:\n{_init_trace}") diff --git a/vllm/config.py b/vllm/config.py index 765a46e6aeee3..e72c53b6130d0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -22,7 +22,7 @@ from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS, get_quantization_config) from vllm.model_executor.models import ModelRegistry -from vllm.platforms import current_platform, interface +from vllm.platforms import CpuArchEnum from vllm.tracing import is_otel_available, otel_import_error_traceback from vllm.transformers_utils.config import ( ConfigFormat, get_config, get_hf_image_processor_config, @@ -349,6 +349,7 @@ def __init__(self, self.is_hybrid = self._init_is_hybrid() self.has_inner_state = self._init_has_inner_state() + from vllm.platforms import current_platform if current_platform.is_neuron(): self.override_neuron_config = override_neuron_config else: @@ -589,6 +590,7 @@ def _verify_quantization(self) -> None: raise ValueError( f"Unknown quantization method: {self.quantization}. Must " f"be one of {supported_quantization}.") + from vllm.platforms import current_platform current_platform.verify_quantization(self.quantization) if self.quantization not in optimized_quantization_methods: logger.warning( @@ -644,6 +646,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, # Reminder: Please update docs/source/usage/compatibility_matrix.md # If the feature combo become valid + from vllm.platforms import current_platform if not current_platform.is_async_output_supported(self.enforce_eager): logger.warning( "Async output processing is not supported on the " @@ -1012,6 +1015,7 @@ def _verify_args(self) -> None: raise ValueError( "GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") + from vllm.platforms import current_platform if (current_platform.is_cuda() and self.block_size is not None and self.block_size > 32): raise ValueError("CUDA Paged Attention kernel only supports " @@ -1279,6 +1283,7 @@ def __post_init__(self) -> None: f"distributed executor backend " f"'{self.distributed_executor_backend}'.") ray_only_devices = ["tpu", "hpu"] + from vllm.platforms import current_platform if (current_platform.device_type in ray_only_devices and self.world_size > 1): if self.distributed_executor_backend is None: @@ -1327,7 +1332,7 @@ def use_ray(self) -> bool: def _verify_args(self) -> None: # Lazy import to avoid circular import from vllm.executor.executor_base import ExecutorBase - + from vllm.platforms import current_platform if self.distributed_executor_backend not in ( "ray", "mp", None) and not (isinstance( self.distributed_executor_backend, type) and issubclass( @@ -1528,6 +1533,7 @@ def compute_hash(self) -> str: def __init__(self, device: str = "auto") -> None: if device == "auto": # Automated device type detection + from vllm.platforms import current_platform self.device_type = current_platform.device_type if not self.device_type: raise RuntimeError("Failed to infer device type") @@ -2241,9 +2247,10 @@ def _get_and_verify_dtype( else: torch_dtype = config_dtype + from vllm.platforms import current_platform if (current_platform.is_cpu() and current_platform.get_cpu_architecture() - == interface.CpuArchEnum.POWERPC + == CpuArchEnum.POWERPC and (config_dtype == torch.float16 or config_dtype == torch.float32)): logger.info( @@ -3083,6 +3090,7 @@ def _get_quantization_config( model_config: ModelConfig, load_config: LoadConfig) -> Optional[QuantizationConfig]: """Get the quantization config.""" + from vllm.platforms import current_platform if model_config.quantization is not None: from vllm.model_executor.model_loader.weight_utils import ( get_quant_config) @@ -3145,6 +3153,7 @@ def __post_init__(self): self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) + from vllm.platforms import current_platform if self.scheduler_config is not None and \ self.model_config is not None and \ self.scheduler_config.chunked_prefill_enabled and \ diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5b9236f8c56b6..e6768467f4c27 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -39,7 +39,6 @@ import vllm.envs as envs from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils import direct_register_custom_op, supports_custom_op if TYPE_CHECKING: @@ -194,6 +193,7 @@ def __init__( assert self.cpu_group is not None assert self.device_group is not None + from vllm.platforms import current_platform if current_platform.is_cuda_alike(): self.device = torch.device(f"cuda:{local_rank}") else: @@ -1188,6 +1188,7 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False): import ray # Lazy import Ray ray.shutdown() gc.collect() + from vllm.platforms import current_platform if not current_platform.is_cpu(): torch.cuda.empty_cache() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 21966d003c7ef..69c7c5077fe32 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -18,7 +18,6 @@ from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, StoreBoolean @@ -1094,6 +1093,7 @@ def create_engine_config(self, use_sliding_window = (model_config.get_sliding_window() is not None) use_spec_decode = self.speculative_model is not None + from vllm.platforms import current_platform if (is_gpu and not use_sliding_window and not use_spec_decode and not self.enable_lora and not self.enable_prompt_adapter diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 426aa1b5c728f..8d766bad1a072 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -8,7 +8,6 @@ from vllm.config import ParallelConfig from vllm.executor.msgspec_utils import decode_hook, encode_hook from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import get_ip from vllm.worker.worker_base import WorkerWrapperBase @@ -229,6 +228,7 @@ def initialize_ray_cluster( the default Ray cluster address. """ assert_ray_available() + from vllm.platforms import current_platform # Connect to a ray cluster. if current_platform.is_rocm() or current_platform.is_xpu(): diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index 694c5b68b1cbd..18b435a42544a 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -6,7 +6,7 @@ from vllm.model_executor.guided_decoding.utils import ( convert_lark_to_gbnf, grammar_is_likely_lark, has_lmf_unsupported_json_features, has_xgrammar_unsupported_json_features) -from vllm.platforms import CpuArchEnum, current_platform +from vllm.platforms import CpuArchEnum if TYPE_CHECKING: from transformers import PreTrainedTokenizer @@ -39,6 +39,7 @@ def maybe_backend_fallback( if guided_params.backend == "xgrammar": # xgrammar only has x86 wheels for linux, fallback to outlines + from vllm.platforms import current_platform if current_platform.get_cpu_architecture() is not CpuArchEnum.X86: logger.warning("xgrammar is only supported on x86 CPUs. " "Falling back to use outlines instead.") diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 67268eb4bb85f..07f4b5a3b3bc8 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -18,7 +18,6 @@ import torch.nn as nn from vllm.logger import init_logger -from vllm.platforms import current_platform from .interfaces import (has_inner_state, is_attention_free, is_hybrid, supports_cross_encoding, supports_multimodal, @@ -273,6 +272,7 @@ def _try_load_model_cls( model_arch: str, model: _BaseRegisteredModel, ) -> Optional[Type[nn.Module]]: + from vllm.platforms import current_platform current_platform.verify_model_arch(model_arch) try: return model.load_model_cls() diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 39ead08c238ce..6f1cc9d5e0c30 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -3,10 +3,9 @@ import torch -from vllm.platforms import current_platform - def set_random_seed(seed: int) -> None: + from vllm.platforms import current_platform current_platform.seed_everything(seed) @@ -38,6 +37,7 @@ def set_weight_attrs( # This sometimes causes OOM errors during model loading. To avoid this, # we sync the param tensor after its weight loader is called. # TODO(woosuk): Remove this hack once we have a better solution. + from vllm.platforms import current_platform if current_platform.is_tpu() and key == "weight_loader": value = _make_synced_weight_loader(value) setattr(weight, key, value) diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index 419237c252ffd..f6ac14446c021 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -1,123 +1,223 @@ +import logging +import traceback +from itertools import chain +from typing import TYPE_CHECKING, Optional + +from vllm.plugins import load_plugins_by_group +from vllm.utils import resolve_obj_by_qualname + from .interface import _Backend # noqa: F401 -from .interface import CpuArchEnum, Platform, PlatformEnum, UnspecifiedPlatform +from .interface import CpuArchEnum, Platform, PlatformEnum -current_platform: Platform +logger = logging.getLogger(__name__) -# NOTE: we don't use `torch.version.cuda` / `torch.version.hip` because -# they only indicate the build configuration, not the runtime environment. -# For example, people can install a cuda build of pytorch but run on tpu. -is_tpu = False -try: - # While it's technically possible to install libtpu on a non-TPU machine, - # this is a very uncommon scenario. Therefore, we assume that libtpu is - # installed if and only if the machine has TPUs. - import libtpu # noqa: F401 - is_tpu = True -except Exception: - pass +def tpu_platform_plugin() -> Optional[str]: + is_tpu = False + try: + # While it's technically possible to install libtpu on a + # non-TPU machine, this is a very uncommon scenario. Therefore, + # we assume that libtpu is installed if and only if the machine + # has TPUs. + import libtpu # noqa: F401 + is_tpu = True + except Exception: + pass + + return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None -is_cuda = False -try: - import pynvml - pynvml.nvmlInit() +def cuda_platform_plugin() -> Optional[str]: + is_cuda = False + try: - if pynvml.nvmlDeviceGetCount() > 0: + import pynvml + pynvml.nvmlInit() + try: + if pynvml.nvmlDeviceGetCount() > 0: + is_cuda = True + finally: + pynvml.nvmlShutdown() + except Exception: + # CUDA is supported on Jetson, but NVML may not be. + import os + + def cuda_is_jetson() -> bool: + return os.path.isfile("/etc/nv_tegra_release") \ + or os.path.exists("/sys/class/tegra-firmware") + + if cuda_is_jetson(): is_cuda = True - finally: - pynvml.nvmlShutdown() -except Exception: - # CUDA is supported on Jetson, but NVML may not be. - import os - def cuda_is_jetson() -> bool: - return os.path.isfile("/etc/nv_tegra_release") \ - or os.path.exists("/sys/class/tegra-firmware") + return "vllm.platforms.cuda.CudaPlatform" if is_cuda else None + + +def rocm_platform_plugin() -> Optional[str]: + is_rocm = False + + try: + import amdsmi + amdsmi.amdsmi_init() + try: + if len(amdsmi.amdsmi_get_processor_handles()) > 0: + is_rocm = True + finally: + amdsmi.amdsmi_shut_down() + except Exception: + pass + + return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None + + +def hpu_platform_plugin() -> Optional[str]: + is_hpu = False + try: + from importlib import util + is_hpu = util.find_spec('habana_frameworks') is not None + except Exception: + pass + + return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None + + +def xpu_platform_plugin() -> Optional[str]: + is_xpu = False + + try: + # installed IPEX if the machine has XPUs. + import intel_extension_for_pytorch # noqa: F401 + import oneccl_bindings_for_pytorch # noqa: F401 + import torch + if hasattr(torch, 'xpu') and torch.xpu.is_available(): + is_xpu = True + except Exception: + pass + + return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None + + +def cpu_platform_plugin() -> Optional[str]: + is_cpu = False + try: + from importlib.metadata import version + is_cpu = "cpu" in version("vllm") + except Exception: + pass + + return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None + + +def neuron_platform_plugin() -> Optional[str]: + is_neuron = False + try: + import transformers_neuronx # noqa: F401 + is_neuron = True + except ImportError: + pass - if cuda_is_jetson(): - is_cuda = True + return "vllm.platforms.neuron.NeuronPlatform" if is_neuron else None -is_rocm = False -try: - import amdsmi - amdsmi.amdsmi_init() +def openvino_platform_plugin() -> Optional[str]: + is_openvino = False try: - if len(amdsmi.amdsmi_get_processor_handles()) > 0: - is_rocm = True - finally: - amdsmi.amdsmi_shut_down() -except Exception: - pass - -is_hpu = False -try: - from importlib import util - is_hpu = util.find_spec('habana_frameworks') is not None -except Exception: - pass - -is_xpu = False - -try: - # installed IPEX if the machine has XPUs. - import intel_extension_for_pytorch # noqa: F401 - import oneccl_bindings_for_pytorch # noqa: F401 - import torch - if hasattr(torch, 'xpu') and torch.xpu.is_available(): - is_xpu = True -except Exception: - pass - -is_cpu = False -try: - from importlib.metadata import version - is_cpu = "cpu" in version("vllm") -except Exception: - pass - -is_neuron = False -try: - import transformers_neuronx # noqa: F401 - is_neuron = True -except ImportError: - pass - -is_openvino = False -try: - from importlib.metadata import version - is_openvino = "openvino" in version("vllm") -except Exception: - pass - -if is_tpu: - # people might install pytorch built with cuda but run on tpu - # so we need to check tpu first - from .tpu import TpuPlatform - current_platform = TpuPlatform() -elif is_cuda: - from .cuda import CudaPlatform - current_platform = CudaPlatform() -elif is_rocm: - from .rocm import RocmPlatform - current_platform = RocmPlatform() -elif is_hpu: - from .hpu import HpuPlatform - current_platform = HpuPlatform() -elif is_xpu: - from .xpu import XPUPlatform - current_platform = XPUPlatform() -elif is_cpu: - from .cpu import CpuPlatform - current_platform = CpuPlatform() -elif is_neuron: - from .neuron import NeuronPlatform - current_platform = NeuronPlatform() -elif is_openvino: - from .openvino import OpenVinoPlatform - current_platform = OpenVinoPlatform() -else: - current_platform = UnspecifiedPlatform() - -__all__ = ['Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum'] + from importlib.metadata import version + is_openvino = "openvino" in version("vllm") + except Exception: + pass + + return "vllm.platforms.openvino.OpenVinoPlatform" if is_openvino else None + + +builtin_platform_plugins = { + 'tpu': tpu_platform_plugin, + 'cuda': cuda_platform_plugin, + 'rocm': rocm_platform_plugin, + 'hpu': hpu_platform_plugin, + 'xpu': xpu_platform_plugin, + 'cpu': cpu_platform_plugin, + 'neuron': neuron_platform_plugin, + 'openvino': openvino_platform_plugin, +} + + +def resolve_current_platform_cls_qualname() -> str: + platform_plugins = load_plugins_by_group('vllm.platform_plugins') + + activated_plugins = [] + + for name, func in chain(builtin_platform_plugins.items(), + platform_plugins.items()): + try: + assert callable(func) + platform_cls_qualname = func() + if platform_cls_qualname is not None: + activated_plugins.append(name) + except Exception: + pass + + activated_builtin_plugins = list( + set(activated_plugins) & set(builtin_platform_plugins.keys())) + activated_oot_plugins = list( + set(activated_plugins) & set(platform_plugins.keys())) + + if len(activated_oot_plugins) >= 2: + raise RuntimeError( + "Only one platform plugin can be activated, but got: " + f"{activated_oot_plugins}") + elif len(activated_oot_plugins) == 1: + platform_cls_qualname = platform_plugins[activated_oot_plugins[0]]() + logger.info("Platform plugin %s is activated", + activated_oot_plugins[0]) + elif len(activated_builtin_plugins) >= 2: + raise RuntimeError( + "Only one platform plugin can be activated, but got: " + f"{activated_builtin_plugins}") + elif len(activated_builtin_plugins) == 1: + platform_cls_qualname = builtin_platform_plugins[ + activated_builtin_plugins[0]]() + logger.info("Automatically detected platform %s.", + activated_builtin_plugins[0]) + else: + platform_cls_qualname = "vllm.interface.UnspecifiedPlatform" + logger.info( + "No platform detected, vLLM is running on UnspecifiedPlatform") + return platform_cls_qualname + + +_current_platform = None +_init_trace: str = '' + +if TYPE_CHECKING: + current_platform: Platform + + +def __getattr__(name: str): + if name == 'current_platform': + # lazy init current_platform. + # 1. out-of-tree platform plugins need `from vllm.platforms import + # Platform` so that they can inherit `Platform` class. Therefore, + # we cannot resolve `current_platform` during the import of + # `vllm.platforms`. + # 2. when users use out-of-tree platform plugins, they might run + # `import vllm`, some vllm internal code might access + # `current_platform` during the import, and we need to make sure + # `current_platform` is only resolved after the plugins are loaded + # (we have tests for this, if any developer violate this, they will + # see the test failures). + global _current_platform + if _current_platform is None: + platform_cls_qualname = resolve_current_platform_cls_qualname() + _current_platform = resolve_obj_by_qualname( + platform_cls_qualname)() + global _init_trace + _init_trace = "".join(traceback.format_stack()) + return _current_platform + else: + return globals()[name] + + +__all__ = [ + 'Platform', 'PlatformEnum', 'current_platform', 'CpuArchEnum', + "_init_trace" +] diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 17f604ea0e202..c50eb2cef4cd5 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,10 +1,10 @@ import logging import os +from typing import Callable, Dict import torch import vllm.envs as envs -from vllm.platforms import current_platform logger = logging.getLogger(__name__) @@ -12,6 +12,39 @@ plugins_loaded = False +def load_plugins_by_group(group: str) -> Dict[str, Callable]: + import sys + if sys.version_info < (3, 10): + from importlib_metadata import entry_points + else: + from importlib.metadata import entry_points + + allowed_plugins = envs.VLLM_PLUGINS + + discovered_plugins = entry_points(group=group) + if len(discovered_plugins) == 0: + logger.debug("No plugins for group %s found.", group) + return {} + logger.info("Available plugins for group %s:", group) + for plugin in discovered_plugins: + logger.info("name=%s, value=%s", plugin.name, plugin.value) + if allowed_plugins is None: + logger.info("all available plugins for group %s will be loaded.", + group) + logger.info("set environment variable VLLM_PLUGINS to control" + " which plugins to load.") + plugins = {} + for plugin in discovered_plugins: + if allowed_plugins is None or plugin.name in allowed_plugins: + try: + func = plugin.load() + plugins[plugin.name] = func + logger.info("plugin %s loaded.", plugin.name) + except Exception: + logger.exception("Failed to load plugin %s", plugin.name) + return plugins + + def load_general_plugins(): """WARNING: plugins can be loaded for multiple times in different processes. They should be designed in a way that they can be loaded @@ -26,6 +59,9 @@ def load_general_plugins(): os.environ['TORCHINDUCTOR_COMPILE_THREADS'] = '1' # see https://github.com/vllm-project/vllm/issues/10619 torch._inductor.config.compile_threads = 1 + + from vllm.platforms import current_platform + if current_platform.is_xpu(): # see https://github.com/pytorch/pytorch/blob/8cada5cbe5450e17c26fb8b358116785324537b2/torch/_dynamo/config.py#L158 # noqa os.environ['TORCH_COMPILE_DISABLE'] = 'True' @@ -47,33 +83,7 @@ def load_general_plugins(): if plugins_loaded: return plugins_loaded = True - import sys - if sys.version_info < (3, 10): - from importlib_metadata import entry_points - else: - from importlib.metadata import entry_points - - allowed_plugins = envs.VLLM_PLUGINS - - discovered_plugins = entry_points(group='vllm.general_plugins') - if len(discovered_plugins) == 0: - logger.debug("No plugins found.") - return - logger.info("Available plugins:") - for plugin in discovered_plugins: - logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value, - plugin.group) - if allowed_plugins is None: - logger.info("all available plugins will be loaded.") - logger.info("set environment variable VLLM_PLUGINS to control" - " which plugins to load.") - else: - logger.info("plugins to load: %s", allowed_plugins) - for plugin in discovered_plugins: - if allowed_plugins is None or plugin.name in allowed_plugins: - try: - func = plugin.load() - func() - logger.info("plugin %s loaded.", plugin.name) - except Exception: - logger.exception("Failed to load plugin %s", plugin.name) + plugins = load_plugins_by_group(group='vllm.general_plugins') + # general plugins, we only need to execute the loaded functions + for func in plugins.values(): + func() diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py index 03dc46600d8a9..d678f4578499b 100644 --- a/vllm/spec_decode/metrics.py +++ b/vllm/spec_decode/metrics.py @@ -6,7 +6,6 @@ from vllm.model_executor.layers.spec_decode_base_sampler import ( SpecDecodeBaseSampler) -from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available @@ -94,6 +93,7 @@ def init_tensors(self, def maybe_collect_rejsample_metrics( self, k: int) -> Optional[SpecDecodeWorkerMetrics]: # currently using cuda.Event, skip for any non_cuda_alike platform + from vllm.platforms import current_platform if not current_platform.is_cuda_alike(): return None diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py index 9ae46ff43a916..a9deee881f41a 100644 --- a/vllm/usage/usage_lib.py +++ b/vllm/usage/usage_lib.py @@ -17,7 +17,6 @@ import vllm.envs as envs from vllm.connections import global_http_connection -from vllm.platforms import current_platform from vllm.version import __version__ as VLLM_VERSION _config_home = envs.VLLM_CONFIG_ROOT @@ -152,6 +151,7 @@ def _report_usage_once(self, model_architecture: str, usage_context: UsageContext, extra_kvs: Dict[str, Any]) -> None: # Platform information + from vllm.platforms import current_platform if current_platform.is_cuda_alike(): device_property = torch.cuda.get_device_properties(0) self.gpu_count = torch.cuda.device_count() diff --git a/vllm/utils.py b/vllm/utils.py index 2b46c1fef0d09..8ef07d2c326a3 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -50,7 +50,6 @@ import vllm.envs as envs from vllm.logger import enable_trace_function_call, init_logger -from vllm.platforms import current_platform if TYPE_CHECKING: from vllm.config import VllmConfig @@ -609,6 +608,7 @@ def create_kv_caches_with_random_flash( seed: int = 0, device: Optional[str] = "cuda", ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + from vllm.platforms import current_platform current_platform.seed_everything(seed) torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) @@ -650,7 +650,7 @@ def create_kv_caches_with_random( raise ValueError( f"Does not support key cache of type fp8 with head_size {head_size}" ) - + from vllm.platforms import current_platform current_platform.seed_everything(seed) torch_dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) @@ -703,6 +703,7 @@ def print_warning_once(msg: str) -> None: @lru_cache(maxsize=None) def is_pin_memory_available() -> bool: + from vllm.platforms import current_platform return current_platform.is_pin_memory_available() @@ -713,6 +714,7 @@ def __init__(self, device: Optional[torch.types.Device] = None): def current_memory_usage(self) -> float: # Return the memory usage in bytes. + from vllm.platforms import current_platform if current_platform.is_cuda_alike(): torch.cuda.reset_peak_memory_stats(self.device) mem = torch.cuda.max_memory_allocated(self.device) @@ -1066,6 +1068,7 @@ def _cuda_device_count_stateless( import torch.cuda import torch.version + from vllm.platforms import current_platform if not torch.cuda._is_compiled(): return 0 if current_platform.is_rocm(): @@ -1673,6 +1676,7 @@ def direct_register_custom_op( return if not supports_custom_op(): + from vllm.platforms import current_platform assert not current_platform.is_cuda_alike(), ( "cuda platform needs torch>=2.4 to support custom op, " "chances are you are using an old version of pytorch " diff --git a/vllm/worker/model_runner_base.py b/vllm/worker/model_runner_base.py index cd4770202a186..c7abad7e0258d 100644 --- a/vllm/worker/model_runner_base.py +++ b/vllm/worker/model_runner_base.py @@ -12,7 +12,6 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors, SequenceGroupMetadata if TYPE_CHECKING: @@ -265,13 +264,13 @@ def prepare_model_input( """ raise NotImplementedError - @current_platform.inference_mode() def execute_model( self, model_input: T, kv_caches: Optional[List[torch.Tensor]], - intermediate_tensors: Optional[IntermediateTensors], + intermediate_tensors: Optional[IntermediateTensors] = None, num_steps: int = 1, + **kwargs, ) -> Optional[List[SamplerOutput]]: """ Execute the model on the given input. diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 65d9bab0e2822..dee63a75c0605 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -544,6 +544,7 @@ def execute_model( model_input.record_step_event(current_stream) if get_pp_group().is_last_rank and self.is_driver_worker: + assert isinstance(output, list) assert len( output ) == 1, "MultiStepModelRunner requires single-step base_models" diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 3ac7fb8dfb766..249b3ed2dfd37 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -11,7 +11,6 @@ from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, resolve_obj_by_qualname, update_environment_variables) @@ -44,6 +43,8 @@ def __init__( self.prompt_adapter_config = vllm_config.prompt_adapter_config self.observability_config = vllm_config.observability_config self.kv_transfer_config = vllm_config.kv_transfer_config + from vllm.platforms import current_platform + self.current_platform = current_platform @abstractmethod def init_device(self) -> None: @@ -74,17 +75,17 @@ def initialize_cache(self, num_gpu_blocks: int, """ raise NotImplementedError - @current_platform.inference_mode() def start_worker_execution_loop(self) -> None: """Execute model loop in parallel worker. You can stop the loop by executing a driver worker with an empty output. See `stop_remote_worker_execution_loop` for more details. """ - while True: - output = self.execute_model(execute_model_req=None) - if output is None: - return None + with self.current_platform.inference_mode(): + while True: + output = self.execute_model(execute_model_req=None) + if output is None: + return None @abstractmethod def execute_model( @@ -352,6 +353,7 @@ def execute_model( model_execute_time = time.perf_counter() - start_time if not get_pp_group().is_last_rank: # output is IntermediateTensors + assert isinstance(output, IntermediateTensors) if (self.observability_config is not None and self.observability_config.collect_model_execute_time): output.tensors["model_execute_time"] = torch.tensor( From 8d9b6721e7f5b7d191951c6f1cd12710ffd08093 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 30 Dec 2024 23:01:35 +0800 Subject: [PATCH 030/462] [VLM] Abstract out multi-modal data parsing in merged processor (#11620) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 4 +- vllm/model_executor/models/chatglm.py | 4 +- vllm/model_executor/models/llava.py | 18 +- vllm/model_executor/models/phi3v.py | 19 +- vllm/model_executor/models/qwen2_audio.py | 22 +- vllm/model_executor/models/qwen2_vl.py | 153 +++++----- vllm/model_executor/models/ultravox.py | 22 +- vllm/multimodal/__init__.py | 9 +- vllm/multimodal/audio.py | 4 +- vllm/multimodal/base.py | 8 +- vllm/multimodal/image.py | 4 +- vllm/multimodal/inputs.py | 195 ++++-------- vllm/multimodal/parse.py | 344 ++++++++++++++++++++++ vllm/multimodal/processing.py | 62 ++-- vllm/multimodal/video.py | 4 +- 15 files changed, 560 insertions(+), 312 deletions(-) create mode 100644 vllm/multimodal/parse.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index bee968b4d2e43..c6f8316412e2f 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -356,7 +356,7 @@ steps: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - pytest -v -s models/embedding/language -m 'not core_model' -- label: Multi-Modal Models Test (Standard) # 28min +- label: Multi-Modal Models Test (Standard) # 40min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -372,7 +372,7 @@ steps: - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Multi-Modal Models Test (Extended) 1 # 1h16m +- label: Multi-Modal Models Test (Extended) 1 # 48m optional: true source_file_dependencies: - vllm/ diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 6c50882d83c3b..ffd6891b25965 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -33,7 +33,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalData, MultiModalKwargs, +from vllm.multimodal.inputs import (ModalityData, MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -54,7 +54,7 @@ def calculate_image_placeholder(vision_config): def mm_input_mapper_for_glmv( ctx: InputContext, - data: MultiModalData[object], + data: ModalityData[object], ) -> Dict: model_config = ctx.model_config tokenizer = cached_get_tokenizer( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 0ecba5a1cae0f..1d6ee2a0be72e 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -20,11 +20,13 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems, - MultiModalFieldConfig, MultiModalInputsV2, - MultiModalKwargs, NestedTensors) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - ProcessorInputs, PromptReplacement, + MultiModalDataItems, ProcessorInputs, + PromptReplacement, full_groupby_modality) from vllm.sequence import IntermediateTensors @@ -179,7 +181,9 @@ def _get_prompt_replacements( assert isinstance(vision_config, PixtralVisionConfig) def get_replacement_pixtral(item_idx: int): - image_size = mm_items.get_image_size(item_idx) + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + ( num_width_tokens, num_height_tokens, @@ -591,8 +595,8 @@ def apply( result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) - mm_items = self._get_mm_items(mm_data) - mm_item_counts = mm_items.get_item_counts() + mm_items = self._to_mm_items(mm_data) + mm_item_counts = mm_items.get_all_counts() mm_kwargs = result["mm_kwargs"] # We reimplement the functionality of MLlavaProcessor from diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index fefa9fd62d1d0..15362db6cdfbf 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -32,12 +32,13 @@ from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems, - MultiModalFieldConfig, MultiModalInputsV2, - MultiModalKwargs, NestedTensors, - PlaceholderRange) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, - ProcessorInputs, PromptReplacement, + MultiModalDataItems, ProcessorInputs, + PromptReplacement, _BoundPromptReplacement, _PlaceholderInfo) from vllm.sequence import IntermediateTensors @@ -381,7 +382,9 @@ def _get_prompt_replacements( assert isinstance(bos_token_id, int) def get_replacement_phi3v(item_idx: int): - image_size = mm_items.get_image_size(item_idx) + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + num_tokens = image_processor.calc_num_image_tokens_from_image_size( width=image_size.width, height=image_size.height, @@ -389,12 +392,14 @@ def get_replacement_phi3v(item_idx: int): return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id] + num_images = mm_items.get_count("image", strict=False) + return [ PromptReplacement( modality="image", target=image_token, replacement=get_replacement_phi3v, - ) for image_token in image_tokens[:len(mm_items.images)] + ) for image_token in image_tokens[:num_images] ] def _apply_prompt_replacements( diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 25a351bd9c656..e3d43b017f894 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -20,8 +20,8 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from functools import cached_property -from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import numpy as np import torch @@ -38,10 +38,12 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - ProcessorInputs, PromptReplacement) + MultiModalDataItems, ProcessorInputs, + PromptReplacement) from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -99,15 +101,9 @@ def _get_hf_processor( def _get_feature_extractor(self) -> WhisperFeatureExtractor: return self._get_hf_processor().feature_extractor # type: ignore - def _get_hf_mm_data( - self, - mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - # resample audio to the model's sampling rate + def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self._get_feature_extractor() - mm_items.resample_audios(feature_extractor.sampling_rate) - - return super()._get_hf_mm_data(mm_items) + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( self, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 574845ef5a525..6181fe3dd13d8 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -25,7 +25,6 @@ from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, Set, Tuple, Type, TypedDict, Union) -import numpy as np import torch import torch.nn as nn import torch.nn.functional as F @@ -55,15 +54,16 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalDataItems, +from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalFieldConfig, MultiModalKwargs, - NestedTensors) + NestedTensors, VideoItem) +from vllm.multimodal.parse import ModalityDataItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - ProcessorInputs, PromptReplacement) + MultiModalDataItems, ProcessorInputs, + PromptReplacement) from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope -from vllm.utils import is_list_of from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, get_vit_attn_backend, @@ -719,61 +719,81 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key="video") -class Qwen2VLMultiModalDataItems(MultiModalDataItems): +class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], + dict[str, torch.Tensor]]): - @staticmethod - def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems": - """ - Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. - """ - multi_data = Qwen2VLMultiModalDataItems() - - for k, v in data.items(): - # TODO: Make a separate modality for embedding inputs - # to avoid confusion - # yapf: disable - if k == "video": - # Special case since even a single item can be a list - multi_data[k] = ( # type: ignore[index] - v if ( - isinstance(v, (dict, torch.Tensor)) # type: ignore[assignment] - or is_list_of(v, list) - or isinstance(v[0], (np.ndarray, torch.Tensor)) - and v[0].ndim == 4 - ) else [v] - ) - elif k in ("image", "audio"): - multi_data[k] = ( # type: ignore[index] - v if isinstance(v, (dict, torch.Tensor, list)) else [v] - ) - else: - multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] - # yapf: enable + def __init__(self, data: dict, modality: str) -> None: + super().__init__(data) - return multi_data + self.modality = modality - def get_item_counts(self) -> Mapping[str, int]: - return { - m: ( - len(items[f"{m}_grid_thw"]) # type: ignore - if isinstance(items, dict) else len(items)) - for m, items in self.items() - } + grid_thw = data[f"{modality}_grid_thw"] + slice_idxs = [0] + grid_thw.prod(-1).cumsum_(0).tolist() + self._slices = [ + slice(slice_idxs[i], slice_idxs[i + 1]) + for i in range(len(grid_thw)) + ] - def has_embedding_inputs(self) -> bool: - return any( - isinstance(items, dict) or any( - isinstance(item, torch.Tensor) for item in items) - for items in self.values()) + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r})") + def get_count(self) -> int: + return len(self.data[f"{self.modality}_grid_thw"]) -class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): + def get(self, index: int) -> dict[str, torch.Tensor]: + out = {} + for k, v in self.data.items(): + if v != f"{self.modality}_grid_thw": + v = v[self._slices[index]] + + out[k] = v + + return out + + def get_processor_data(self) -> Mapping[str, object]: + return {} + + def get_passthrough_data(self) -> Mapping[str, object]: + return self.data + + +class Qwen2ImageEmbeddingItems(Qwen2EmbeddingItems): + + def __init__(self, data: dict) -> None: + super().__init__(data, "image") + + +class Qwen2VideoEmbeddingItems(Qwen2EmbeddingItems): - def _get_mm_items( + def __init__(self, data: dict) -> None: + super().__init__(data, "video") + + +class Qwen2MultiModalDataParser(MultiModalDataParser): + + def _parse_image_data( + self, + data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return Qwen2EmbeddingItems(data, modality="image") + + return super()._parse_image_data(data) + + def _parse_video_data( self, - mm_data: MultiModalDataDict, - ) -> MultiModalDataItems: - return Qwen2VLMultiModalDataItems.from_dict(mm_data) + data: Union[dict[str, torch.Tensor], ModalityData[VideoItem]], + ) -> ModalityDataItems[Any, Any]: + if isinstance(data, dict): + return Qwen2EmbeddingItems(data, modality="video") + + return super()._parse_video_data(data) + + +class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): + + def _get_data_parser(self) -> MultiModalDataParser: + return Qwen2MultiModalDataParser() def _get_hf_processor( self, @@ -796,35 +816,6 @@ def _get_hf_processor( return hf_processor - def _get_hf_mm_data( - self, - mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - processor_data = dict[str, Any]() - passthrough_data = dict[str, Any]() - - for k, v in mm_items.items(): - # TODO: Make a separate modality for embedding inputs - # to avoid confusion - if k in ("image", "video", "audio"): - if isinstance(v, dict): - # Pass through embedding inputs (dict) - passthrough_data.update(v) - elif isinstance(v, torch.Tensor) and v.ndim == 3: - # Pass through embedding inputs (single) - passthrough_data[f"{k}_embeds"] = [v] - elif (is_list_of(v, torch.Tensor) and len(v) > 0 - and v[0].ndim == 2): - # Pass through embedding inputs (multi) - passthrough_data[f"{k}_embeds"] = v - elif len(v) > 0: - # Map keys to plural form, e.g.: image -> images - processor_data[f"{k}s"] = v - else: - processor_data[k] = v - - return processor_data, passthrough_data - def _get_prompt_replacements( self, mm_items: MultiModalDataItems, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 7b4aeeec5f403..7e853e5b90096 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,8 +3,8 @@ import math from functools import cached_property, lru_cache -from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict, Union) import numpy as np import torch @@ -24,10 +24,12 @@ from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import (MultiModalDataItems, MultiModalFieldConfig, - MultiModalKwargs, NestedTensors) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - ProcessorInputs, PromptReplacement) + MultiModalDataItems, ProcessorInputs, + PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig from vllm.utils import is_list_of @@ -85,15 +87,9 @@ def _get_feature_extractor(self) -> WhisperFeatureExtractor: hf_processor = self._get_hf_processor() return hf_processor.audio_processor.feature_extractor # type: ignore - def _get_hf_mm_data( - self, - mm_items: MultiModalDataItems, - ) -> tuple[dict[str, Any], dict[str, Any]]: - # resample audio to the model's sampling rate + def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self._get_feature_extractor() - mm_items.resample_audios(feature_extractor.sampling_rate) - - return super()._get_hf_mm_data(mm_items) + return MultiModalDataParser(target_sr=feature_extractor.sampling_rate) def _call_hf_processor( self, diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 9255e062e4870..e58bbe81717a0 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,8 +1,7 @@ from .base import MultiModalPlaceholderMap, MultiModalPlugin -from .inputs import (BatchedTensorInputs, MultiModalData, - MultiModalDataBuiltins, MultiModalDataDict, - MultiModalKwargs, MultiModalPlaceholderDict, - NestedTensors) +from .inputs import (BatchedTensorInputs, ModalityData, MultiModalDataBuiltins, + MultiModalDataDict, MultiModalKwargs, + MultiModalPlaceholderDict, NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -16,7 +15,7 @@ __all__ = [ "BatchedTensorInputs", - "MultiModalData", + "ModalityData", "MultiModalDataBuiltins", "MultiModalDataDict", "MultiModalKwargs", diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 3e09ef1fcbb56..de80f22bac2a3 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -9,7 +9,7 @@ from vllm.utils import PlaceholderModule from .base import MediaIO, MultiModalPlugin -from .inputs import AudioItem, MultiModalData, MultiModalKwargs +from .inputs import AudioItem, ModalityData, MultiModalKwargs try: import librosa @@ -31,7 +31,7 @@ def get_data_key(self) -> str: def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[AudioItem], + data: ModalityData[AudioItem], **mm_processor_kwargs, ) -> MultiModalKwargs: raise NotImplementedError("There is no default audio input mapper") diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index cdda6f8052794..7f4029e726332 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -15,12 +15,12 @@ from vllm.config import ModelConfig from vllm.sequence import SequenceGroupMetadata -from .inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs, +from .inputs import (ModalityData, MultiModalDataDict, MultiModalKwargs, PlaceholderRange) logger = init_logger(__name__) -MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], +MultiModalInputMapper = Callable[[InputContext, ModalityData[object]], MultiModalKwargs] """ Return a dictionary to be passed as keyword arguments to @@ -69,7 +69,7 @@ def get_data_key(self) -> str: def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[Any], + data: ModalityData[Any], **mm_processor_kwargs, ) -> MultiModalKwargs: """ @@ -118,7 +118,7 @@ def wrapper(model_cls: N) -> N: def map_input( self, model_config: "ModelConfig", - data: MultiModalData[Any], + data: ModalityData[Any], mm_processor_kwargs: Optional[dict[str, Any]], ) -> MultiModalKwargs: """ diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 14c79dfadec0c..da13a381c4530 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -13,7 +13,7 @@ from vllm.utils import is_list_of from .base import MediaIO, MultiModalPlugin -from .inputs import ImageItem, MultiModalData, MultiModalKwargs +from .inputs import ImageItem, ModalityData, MultiModalKwargs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -44,7 +44,7 @@ def _get_hf_image_processor( def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[ImageItem], + data: ModalityData[ImageItem], **mm_processor_kwargs, ) -> MultiModalKwargs: model_config = ctx.model_config diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 1fbda6e0b8750..db489af7ac475 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -2,53 +2,74 @@ from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import (Any, Literal, NamedTuple, TypedDict, TypeVar, Union, cast, - final) +from typing import Any, Literal, TypedDict, TypeVar, Union, cast, final import numpy as np import torch import torch.types from PIL.Image import Image from transformers import BatchFeature -from typing_extensions import NotRequired, TypeAlias, assert_never +from typing_extensions import NotRequired, TypeAlias from vllm.utils import JSONTree, is_list_of, json_map_leaves _T = TypeVar("_T") -# yapf: disable -ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] +HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] """ A :class:`transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace :code:`ImageProcessor`. """ -VideoItem: TypeAlias = Union[ - list[Image], - np.ndarray, - torch.Tensor, - list[np.ndarray], - list[torch.Tensor], -] +HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor, + list[np.ndarray], list[torch.Tensor]] """ A :class:`transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace :code:`VideoProcessor`. """ -AudioItem: TypeAlias = Union[ - np.ndarray, - list[float], - # `(audio, sampling_rate)`: If the audio's sampling rate is different - # from that expected by the model, we need to resample it. - tuple[np.ndarray, float], -] +HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor] """ Represents a single audio item, which can be passed to a HuggingFace :code:`AudioProcessor`. """ -# yapf: enable -MultiModalData: TypeAlias = Union[_T, list[_T]] +ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor] +""" +A :class:`transformers.image_utils.ImageInput` representing a single image +item, which can be passed to a HuggingFace :code:`ImageProcessor`. + +Alternatively, a 3-D tensor or batch of 2-D tensors, +which are treated as image embeddings; +these are directly passed to the model without HF processing. +""" + +VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor] +""" +A :class:`transformers.image_utils.VideoInput` representing a single video +item, which can be passed to a HuggingFace :code:`VideoProcessor`. + +Alternatively, a 3-D tensor or batch of 2-D tensors, +which are treated as video embeddings; +these are directly passed to the model without HF processing. +""" + +AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float], + torch.Tensor] +""" +Represents a single audio +item, which can be passed to a HuggingFace :code:`AudioProcessor`. + +Alternatively, a tuple `(audio, sampling_rate)`, where the sampling rate +is different from that expected by the model; +these are resampled to the model's sampling rate before being processed by HF. + +Alternatively, a 3-D tensor or batch of 2-D tensors, +which are treated as audio embeddings; +these are directly passed to the model without HF processing. +""" + +ModalityData: TypeAlias = Union[_T, list[_T]] """ Either a single data item, or a list of data items. @@ -61,17 +82,17 @@ class MultiModalDataBuiltins(TypedDict, total=False): """Type annotations for modality types predefined by vLLM.""" - image: MultiModalData[ImageItem] + image: ModalityData[ImageItem] """The input image(s).""" - video: MultiModalData[VideoItem] + video: ModalityData[VideoItem] """The input video(s).""" - audio: MultiModalData[AudioItem] + audio: ModalityData[AudioItem] """The input audio(s).""" -MultiModalDataDict: TypeAlias = Mapping[str, MultiModalData[Any]] +MultiModalDataDict: TypeAlias = Mapping[str, ModalityData[Any]] """ A dictionary containing an entry for each modality type to input. @@ -83,123 +104,6 @@ class MultiModalDataBuiltins(TypedDict, total=False): """ -class ImageSize(NamedTuple): - width: int - height: int - - -class MultiModalDataItems(UserDict[str, list[Any]]): - """ - As :class:`MultiModalDataDict`, but normalized such that each entry - corresponds to a list. - """ - - @staticmethod - def from_dict(data: MultiModalDataDict) -> "MultiModalDataItems": - """ - Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems`. - """ - multi_data = MultiModalDataItems() - - for k, v in data.items(): - # TODO: Make a separate modality for embedding inputs - # to avoid confusion - # yapf: disable - if k == "video": - # Special case since even a single item can be a list - multi_data[k] = ( # type: ignore[index] - v if ( - isinstance(v, torch.Tensor) - or is_list_of(v, list) - or isinstance(v[0], (np.ndarray, torch.Tensor)) - and v[0].ndim == 4 - ) else [v] - ) - elif k in ("image", "audio"): - multi_data[k] = ( # type: ignore[index] - v if isinstance(v, (torch.Tensor, list)) else [v] - ) - else: - multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] - # yapf: enable - - return multi_data - - # NOTE: When a field (e.g. `images`) doesn't exist, directly appending to - # `self.images` doesn't update this dictionary, which may be confusing - # We annotate the getter methods as `Sequence` to prevent others from - # trying to update the list in this way - @property - def images(self) -> Sequence[ImageItem]: - return self.get("image", []) - - @property - def videos(self) -> Sequence[VideoItem]: - return self.get("video", []) - - @property - def audios(self) -> Sequence[AudioItem]: - return self.get("audio", []) - - def get_item_counts(self) -> Mapping[str, int]: - return {m: len(items) for m, items in self.items()} - - def has_embedding_inputs(self) -> bool: - return any( - any(isinstance(item, torch.Tensor) for item in items) - for items in self.values()) - - def get_image_size(self, item_idx: int) -> ImageSize: - image = self.images[item_idx] - - if isinstance(image, Image): - return ImageSize(*image.size) - if isinstance(image, (np.ndarray, torch.Tensor)): - _, h, w = image.shape - return ImageSize(w, h) - - assert_never(image) - - def get_audio_with_sr( - self, - item_idx: int, - *, - default_sr: float, - ) -> tuple[np.ndarray, float]: - audio = self.audios[item_idx] - - if isinstance(audio, tuple): - return audio - if isinstance(audio, list): - return np.array(audio), default_sr - if isinstance(audio, np.ndarray): - return audio, default_sr - - assert_never(audio) - - def resample_audios(self, new_sr: float, *, drop_sr: bool = True) -> None: - """ - If :code:`drop_sr=True`, the audio items in this dictionary are updated - to be NumPy arrays which implicitly means that their sampling rate is - the same as the model's expected sampling rate; otherwise, they remain - as :code:`(audio, new_sr)` tuples. - """ - # Avoid circular import - from .audio import resample_audio - - if not self.audios: - return - - new_audios = [] - for item_idx in range(len(self.audios)): - audio, sr = self.get_audio_with_sr(item_idx, default_sr=new_sr) - audio = resample_audio(audio, orig_sr=sr, target_sr=new_sr) - - new_audios.append(audio if drop_sr else (audio, new_sr)) - - self["audio"] = new_audios - - class PlaceholderRange(TypedDict): """ Placeholder location information for multi-modal data. @@ -436,7 +340,7 @@ def from_items_by_key( ) -> "MultiModalKwargs": data = { key: items[0].field.reduce(items).data - for key, items in items_by_key.items() + for key, items in items_by_key.items() if len(items) > 0 } return MultiModalKwargs(data, @@ -567,6 +471,11 @@ def get_items_by_modality( Get the keyword arguments corresponding to an item identified by its modality and index. """ + if modality not in self._keys_by_modality: + available_modalities = set(self._keys_by_modality.keys()) + raise KeyError(f"Modality {modality!r} not found. " + f"Available modalities: {available_modalities}") + keys_to_gather = self._keys_by_modality[modality] return { diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py new file mode 100644 index 0000000000000..17a795247372e --- /dev/null +++ b/vllm/multimodal/parse.py @@ -0,0 +1,344 @@ +from abc import ABC, abstractmethod +from collections import UserDict +from collections.abc import Callable, Iterator, Mapping, Sequence +from typing import TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar + +import numpy as np +import torch +from PIL.Image import Image +from typing_extensions import TypeAlias, TypeGuard, assert_never + +from vllm.utils import is_list_of + +from .audio import resample_audio +from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, + ImageItem, ModalityData, MultiModalDataDict, + NestedTensors, VideoItem) + +_T = TypeVar("_T") +_I = TypeVar("_I") + + +class ModalityDataItems(ABC, Generic[_T, _I]): + + def __init__(self, data: _T) -> None: + super().__init__() + + self.data = data + + def __len__(self) -> int: + return self.get_count() + + def __getitem__(self, index: int) -> _I: + return self.get(index) + + if TYPE_CHECKING: + # Auto-generated + def __iter__(self) -> Iterator[_I]: + ... + + @abstractmethod + def get_count(self) -> int: + """Get the number of data items.""" + raise NotImplementedError + + @abstractmethod + def get(self, index: int) -> _I: + """Get a data item by its index.""" + raise NotImplementedError + + def get_all(self) -> list[_I]: + """Get all data items.""" + return [self.get(idx) for idx in range(self.get_count())] + + @abstractmethod + def get_processor_data(self) -> Mapping[str, object]: + """Get the data to pass to the HF processor.""" + raise NotImplementedError + + @abstractmethod + def get_passthrough_data(self) -> Mapping[str, object]: + """Get the data to pass directly to the model.""" + raise NotImplementedError + + +class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): + + def __init__(self, data: Sequence[_T], modality: str) -> None: + super().__init__(data) + + self.modality = modality + + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r})") + + def get_count(self) -> int: + return len(self.data) + + def get(self, index: int) -> _T: + return self.data[index] + + def get_processor_data(self) -> Mapping[str, object]: + return {f"{self.modality}s": self.data} + + def get_passthrough_data(self) -> Mapping[str, object]: + return {} + + +class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): + + def __init__(self, data: NestedTensors, modality: str) -> None: + super().__init__(data) + + self.modality = modality + + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r})") + + def get_count(self) -> int: + return len(self.data) + + def get(self, index: int) -> object: + return self.data[index] + + def get_processor_data(self) -> Mapping[str, object]: + return {} + + def get_passthrough_data(self) -> Mapping[str, object]: + return {f"{self.modality}_embeds": self.data} + + +class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): + + def __init__(self, data: Sequence[HfAudioItem]) -> None: + super().__init__(data, "audio") + + +class AudioEmbeddingItems(EmbeddingItems): + + def __init__(self, data: NestedTensors) -> None: + super().__init__(data, "audio") + + +class ImageSize(NamedTuple): + width: int + height: int + + +class ImageProcessorItems(ProcessorBatchItems[HfImageItem]): + + def __init__(self, data: Sequence[HfImageItem]) -> None: + super().__init__(data, "image") + + def get_image_size(self, item_idx: int) -> ImageSize: + image = self.get(item_idx) + + if isinstance(image, Image): + return ImageSize(*image.size) + if isinstance(image, (np.ndarray, torch.Tensor)): + _, h, w = image.shape + return ImageSize(w, h) + + assert_never(image) + + +class ImageEmbeddingItems(EmbeddingItems): + + def __init__(self, data: NestedTensors) -> None: + super().__init__(data, "image") + + +class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): + + def __init__(self, data: Sequence[HfVideoItem]) -> None: + super().__init__(data, "video") + + +class VideoEmbeddingItems(EmbeddingItems): + + def __init__(self, data: NestedTensors) -> None: + super().__init__(data, "video") + + +_D = TypeVar("_D", bound=ModalityDataItems[Any, Any]) + + +class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): + """ + As :class:`MultiModalDataDict`, but normalized such that each entry + corresponds to a list. + """ + + def get_count(self, modality: str, *, strict: bool = True) -> int: + """ + Get the number of data items belonging to a modality. + + If `strict=False`, return `0` instead of raising :exc:`KeyError` + even if the modality is not found. + """ + if modality not in self: + if strict: + available_modalities = set(self.keys()) + raise KeyError(f"Modality {modality!r} not found. " + f"Available modalities: {available_modalities}") + + return 0 + + return self[modality].get_count() + + def get_all_counts(self) -> Mapping[str, int]: + """Get the number of items belonging to each modality.""" + return {m: items.get_count() for m, items in self.items()} + + def get_items( + self, + modality: str, + typ: type[_D], + ) -> _D: + """ + Get the data items belonging to a modality, + requiring that they belong to a certain type. + """ + if modality not in self: + available_modalities = set(self.keys()) + raise KeyError(f"Modality {modality!r} not found. " + f"Available modalities: {available_modalities}") + + items = self[modality] + if not isinstance(items, typ): + raise TypeError(f"Invalid type of data items for {modality=}. " + f"Expected type: {typ}, but " + f"found type: {type(items)}") + + return items + + +ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], + ModalityDataItems[Any, Any]] + + +class MultiModalDataParser: + """ + Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. + """ + + def __init__(self, *, target_sr: Optional[float] = None) -> None: + super().__init__() + + self.target_sr = target_sr + + def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]: + if isinstance(data, torch.Tensor): + return data.ndim == 3 + if is_list_of(data, torch.Tensor): + return len(data) == 0 or data[0].ndim == 2 + + return False + + def _get_audio_with_sr( + self, + audio: AudioItem, + ) -> tuple[np.ndarray, Optional[float]]: + if isinstance(audio, tuple): + return audio + if isinstance(audio, list): + return np.array(audio), None + if isinstance(audio, np.ndarray): + return audio, None + if isinstance(audio, torch.Tensor): + return audio.numpy(), None + + assert_never(audio) + + def _parse_audio_data( + self, + data: ModalityData[AudioItem], + ) -> ModalityDataItems[Any, Any]: + if self._is_embeddings(data): + return AudioEmbeddingItems(data) + + if (is_list_of(data, float) + or isinstance(data, + (np.ndarray, torch.Tensor)) and data.ndim == 1 + or isinstance(data, tuple)): + data_items = [data] + elif isinstance(data, (np.ndarray, torch.Tensor)): + data_items = [elem for elem in data] + else: + data_items = data + + new_audios = list[np.ndarray]() + for data_item in data_items: + audio, orig_sr = self._get_audio_with_sr(data_item) + if orig_sr is None: + new_audio = audio + else: + target_sr = self.target_sr + if target_sr is None: + raise RuntimeError( + "Audio resampling is not supported when " + "`target_sr` is not provided") + + new_audio = resample_audio(audio, + orig_sr=orig_sr, + target_sr=target_sr) + + new_audios.append(new_audio) + + return AudioProcessorItems(new_audios) + + def _parse_image_data( + self, + data: ModalityData[ImageItem], + ) -> ModalityDataItems[Any, Any]: + if self._is_embeddings(data): + return ImageEmbeddingItems(data) + + if (isinstance(data, Image) + or isinstance(data, + (np.ndarray, torch.Tensor)) and data.ndim == 3): + data_items = [data] + elif isinstance(data, (np.ndarray, torch.Tensor)): + data_items = [elem for elem in data] + else: + data_items = data + + return ImageProcessorItems(data_items) + + def _parse_video_data( + self, + data: ModalityData[VideoItem], + ) -> ModalityDataItems[Any, Any]: + if self._is_embeddings(data): + return VideoEmbeddingItems(data) + + if (is_list_of(data, Image) + or isinstance(data, + (np.ndarray, torch.Tensor)) and data.ndim == 4): + data_items = [data] + elif isinstance(data, (np.ndarray, torch.Tensor)): + data_items = [elem for elem in data] + else: + data_items = data + + return VideoProcessorItems(data_items) + + def _get_subparsers(self) -> Mapping[str, ModalityDataParser]: + return { + "audio": self._parse_audio_data, + "image": self._parse_image_data, + "video": self._parse_video_data, + } + + def parse_mm_data(self, + mm_data: MultiModalDataDict) -> MultiModalDataItems: + subparsers = self._get_subparsers() + + mm_items = MultiModalDataItems() + for k, v in mm_data.items(): + if k not in subparsers: + raise ValueError(f"Unsupported modality: {k}") + + mm_items[k] = subparsers[k](v) + + return mm_items diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 3ece0762e3228..180489166b407 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -15,11 +15,12 @@ from vllm.inputs import DummyData, InputProcessingContext from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import LRUCache, flatten_2d_lists, full_groupby, is_list_of +from vllm.utils import LRUCache, flatten_2d_lists, full_groupby -from .inputs import (MultiModalDataDict, MultiModalDataItems, - MultiModalFieldConfig, MultiModalFieldItem, - MultiModalInputsV2, MultiModalKwargs, PlaceholderRange) +from .inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalFieldItem, MultiModalInputsV2, MultiModalKwargs, + PlaceholderRange) +from .parse import MultiModalDataItems, MultiModalDataParser logger = init_logger(__name__) @@ -621,6 +622,16 @@ def __call__( ) -> MultiModalInputsV2: return self.apply(prompt, mm_data, hf_processor_mm_kwargs) + def _get_data_parser(self) -> MultiModalDataParser: + """ + Construct a data parser to preprocess multi-modal data items + before passing them to :meth:`_get_hf_mm_data`. + + You can support additional modalities by creating a subclass + of :class:`MultiModalDataParser` that has additional subparsers. + """ + return MultiModalDataParser() + def _get_hf_processor(self) -> ProcessorMixin: """ Subclasses can add keyword arguments to this method to accept @@ -631,11 +642,16 @@ def _get_hf_processor(self) -> ProcessorMixin: def _get_tokenizer(self) -> AnyTokenizer: return self.ctx.tokenizer - def _get_mm_items( + def _to_mm_items( self, mm_data: MultiModalDataDict, ) -> MultiModalDataItems: - return MultiModalDataItems.from_dict(mm_data) + """ + Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems` + before passing them to :meth:`_get_hf_mm_data`. + """ + parser = self._get_data_parser() + return parser.parse_mm_data(mm_data) @abstractmethod def _get_mm_fields_config( @@ -680,22 +696,9 @@ def _get_hf_mm_data( processor_data = dict[str, Any]() passthrough_data = dict[str, Any]() - for k, v in mm_items.items(): - # TODO: Make a separate modality for embedding inputs - # to avoid confusion - if k in ("image", "video", "audio"): - if isinstance(v, torch.Tensor) and v.ndim == 3: - # Pass through embedding inputs (single) - passthrough_data[f"{k}_embeds"] = [v] - elif (is_list_of(v, torch.Tensor) and len(v) > 0 - and v[0].ndim == 2): - # Pass through embedding inputs (multi) - passthrough_data[f"{k}_embeds"] = v - elif len(v) > 0: - # Map keys to plural form, e.g.: image -> images - processor_data[f"{k}s"] = v - else: - processor_data[k] = v + for items in mm_items.values(): + processor_data.update(items.get_processor_data()) + passthrough_data.update(items.get_passthrough_data()) return processor_data, passthrough_data @@ -756,7 +759,7 @@ def _apply_hf_processor_missing( cached items; instead, we rely on our own prompt replacement logic for the full text. """ - mm_missing_counts = mm_missing_data_items.get_item_counts() + mm_missing_counts = mm_missing_data_items.get_all_counts() prompt_ids, _ = self._apply_hf_processor( prompt_text=prompt_text, @@ -789,7 +792,8 @@ def _cached_apply_hf_processor( cache = self.cache model_id = self.ctx.model_config.model - if cache is None or mm_data_items.has_embedding_inputs(): + _, passthrough_data = self._get_hf_mm_data(mm_data_items) + if cache is None or passthrough_data: return self._apply_hf_processor( prompt_text=prompt_text, mm_items=mm_data_items, @@ -812,7 +816,7 @@ def _cached_apply_hf_processor( modality: [mm_data_items[modality][idx] for idx in idxs] for modality, idxs in mm_missing_idxs.items() } - mm_missing_data_items = self._get_mm_items(mm_missing_data) + mm_missing_data_items = self._to_mm_items(mm_missing_data) prompt_ids, mm_missing_kwargs = self._apply_hf_processor_missing( prompt_text=prompt_text, @@ -852,7 +856,7 @@ def _cached_apply_hf_processor( mm_merged_field_items[modality] = merged_modal_items_lst if self.enable_sanity_checks: - mm_missing_counts = mm_missing_data_items.get_item_counts() + mm_missing_counts = mm_missing_data_items.get_all_counts() assert all( item_count == mm_missing_counts[modality] for modality, item_count in mm_missing_next_idx.items()), dict( @@ -865,7 +869,7 @@ def _cached_apply_hf_processor( ) if self.enable_sanity_checks: - mm_item_counts = mm_data_items.get_item_counts() + mm_item_counts = mm_data_items.get_all_counts() for modality, item_count in mm_item_counts.items(): for item_idx in range(item_count): @@ -958,7 +962,7 @@ def apply( 3. Extract information about the placeholder tokens from the processed token IDs. """ - mm_items = self._get_mm_items(mm_data) + mm_items = self._to_mm_items(mm_data) prompt_ids, mm_kwargs = self._cached_apply_hf_processor( prompt_text, @@ -975,7 +979,7 @@ def apply( # If HF processor already inserts placeholder tokens, # there is no need for us to insert them - mm_item_counts = mm_items.get_item_counts() + mm_item_counts = mm_items.get_all_counts() all_placeholders = self._find_placeholders(prompt_repls, prompt_ids, mm_item_counts) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index b7d43c830cc46..1ad1f5abc27a2 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -15,7 +15,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import PlaceholderModule, is_list_of -from .base import MediaIO, MultiModalData +from .base import MediaIO, ModalityData from .image import ImageMediaIO, ImagePlugin from .inputs import MultiModalKwargs, VideoItem @@ -54,7 +54,7 @@ def _get_hf_video_processor( def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[VideoItem], + data: ModalityData[VideoItem], **mm_processor_kwargs, ) -> MultiModalKwargs: model_config = ctx.model_config From 5886aa496e8fa31c9180bcfc8e89faaa8899907d Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Mon, 30 Dec 2024 10:51:02 -0500 Subject: [PATCH 031/462] [V1] [6/N] API Server: Better Shutdown (#11586) --- vllm/entrypoints/openai/api_server.py | 44 ++++++++------------------- vllm/v1/engine/async_llm.py | 25 +++++++++++++-- vllm/v1/engine/core_client.py | 16 ++++------ 3 files changed, 40 insertions(+), 45 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 094cc15a317e9..bac72d87376da 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -68,7 +68,7 @@ from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, - is_valid_ipv6_address, kill_process_tree, set_ulimit) + is_valid_ipv6_address, set_ulimit) from vllm.version import __version__ as VLLM_VERSION TIMEOUT_KEEP_ALIVE = 5 # seconds @@ -133,32 +133,21 @@ async def build_async_engine_client_from_engine_args( Returns the Client or None if the creation failed. """ - # Fall back - # TODO: fill out feature matrix. + # AsyncLLMEngine. if (MQLLMEngineClient.is_unsupported_config(engine_args) or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): - engine_config = engine_args.create_engine_config( - UsageContext.OPENAI_API_SERVER) - uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), - "uses_ray", False) - - build_engine = partial(AsyncLLMEngine.from_engine_args, - engine_args=engine_args, - engine_config=engine_config, - usage_context=UsageContext.OPENAI_API_SERVER) - if uses_ray: - # Must run in main thread with ray for its signal handlers to work - engine_client = build_engine() - else: - engine_client = await asyncio.get_running_loop().run_in_executor( - None, build_engine) - yield engine_client - if hasattr(engine_client, "shutdown"): - engine_client.shutdown() - return + engine_client: Optional[EngineClient] = None + try: + engine_client = AsyncLLMEngine.from_engine_args( + engine_args=engine_args, + usage_context=UsageContext.OPENAI_API_SERVER) + yield engine_client + finally: + if engine_client and hasattr(engine_client, "shutdown"): + engine_client.shutdown() - # Otherwise, use the multiprocessing AsyncLLMEngine. + # MQLLMEngine. else: if "PROMETHEUS_MULTIPROC_DIR" not in os.environ: # Make TemporaryDirectory for prometheus multiprocessing @@ -737,15 +726,6 @@ def signal_handler(*_) -> None: signal.signal(signal.SIGTERM, signal_handler) - # The child processes will send SIGQUIT to this process when - # any error happens. This process then clean up the whole tree. - # TODO(rob): move this into AsyncLLM.__init__ once we remove - # the context manager below. - def sigquit_handler(signum, frame): - kill_process_tree(os.getpid()) - - signal.signal(signal.SIGQUIT, sigquit_handler) - async with build_async_engine_client(args) as engine_client: app = build_app(args) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 213ddaa023dbc..3f097ca7f439c 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,4 +1,6 @@ import asyncio +import os +import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -16,6 +18,7 @@ from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext +from vllm.utils import kill_process_tree from vllm.v1.engine.core_client import EngineCoreClient from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor @@ -38,6 +41,22 @@ def __init__( log_requests: bool = True, start_engine_loop: bool = True, ) -> None: + + # The child processes will send SIGQUIT when unrecoverable + # errors happen. We kill the process tree here so that the + # stack trace is very evident. + # TODO: rather than killing the main process, we should + # figure out how to raise an AsyncEngineDeadError and + # handle at the API server level so we can return a better + # error code to the clients calling VLLM. + def sigquit_handler(signum, frame): + logger.fatal( + "AsyncLLM got SIGQUIT from worker processes, shutting " + "down. See stack trace above for root cause issue.") + kill_process_tree(os.getpid()) + + signal.signal(signal.SIGQUIT, sigquit_handler) + assert start_engine_loop self.log_requests = log_requests @@ -276,9 +295,9 @@ async def _run_output_handler(self): # 4) Abort any requests that finished due to stop strings. await self.engine_core.abort_requests_async(reqs_to_abort) - except BaseException as e: - logger.error(e) - raise e + except Exception as e: + logger.exception("EngineCore output handler hit an error: %s", e) + kill_process_tree(os.getpid()) async def abort(self, request_id: str) -> None: """Abort RequestId in self, detokenizer, and engine core.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index beb5d57c20c83..3293205e110af 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -6,7 +6,7 @@ from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import get_open_zmq_ipc_path +from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) @@ -144,17 +144,13 @@ def __init__( else: self.ctx = zmq.Context() # type: ignore[attr-defined] - # Path for IPC. + # Paths and sockets for IPC. output_path = get_open_zmq_ipc_path() input_path = get_open_zmq_ipc_path() - - # Get output (EngineCoreOutput) from EngineCore. - self.output_socket = self.ctx.socket(zmq.constants.PULL) - self.output_socket.connect(output_path) - - # Send input (EngineCoreRequest) to EngineCore. - self.input_socket = self.ctx.socket(zmq.constants.PUSH) - self.input_socket.bind(input_path) + self.output_socket = make_zmq_socket(self.ctx, output_path, + zmq.constants.PULL) + self.input_socket = make_zmq_socket(self.ctx, input_path, + zmq.constants.PUSH) # Start EngineCore in background process. self.proc_handle: Optional[BackgroundProcHandle] From 36e76700453924c8d421db99af70a88a1df835cd Mon Sep 17 00:00:00 2001 From: whyiug Date: Tue, 31 Dec 2024 02:51:04 +0800 Subject: [PATCH 032/462] [Bugfix] Validate and concatenate image embeddings in MiniCPMVBaseModel (#11631) --- vllm/model_executor/models/minicpmv.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 1e8f9bd4cf418..712022502539b 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -487,6 +487,12 @@ def _parse_and_validate_inputs( image_embeds = kwargs.pop("image_embeds", None) if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError(f"Incorrect type of image embeds. " + f"Got type: {type(image_embeds)}") + if isinstance(image_embeds, list): + image_embeds = torch.concat(image_embeds) + return MiniCPMVImageEmbeddingInputs( image_bounds=self._get_image_bounds(input_ids, im_start_id, im_end_id, slice_start_id, From ccb1aabccaa7aaf07b08fd8be30380e828efba0f Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Mon, 30 Dec 2024 12:27:07 -0800 Subject: [PATCH 033/462] [benchmark] Remove dependency for H100 benchmark step (#11572) --- .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 708e548727cf5..868b8e95db01d 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -73,7 +73,7 @@ steps: # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: H100 - depends_on: block-h100 + depends_on: ~ plugins: - docker#v5.12.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT From a2a40bcd0d8275e19c46e9cc06ee994d8839b98d Mon Sep 17 00:00:00 2001 From: Matthias Vogler <60004995+ayylemao@users.noreply.github.com> Date: Tue, 31 Dec 2024 02:33:06 +0100 Subject: [PATCH 034/462] [Model][LoRA]LoRA support added for MolmoForCausalLM (#11439) Signed-off-by: Matthias Vogler Signed-off-by: Jee Jee Li Co-authored-by: Matthias Vogler Co-authored-by: Jee Jee Li --- docs/source/models/supported_models.md | 2 +- vllm/model_executor/models/molmo.py | 45 ++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 518505abeb2a9..613343281464c 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -666,7 +666,7 @@ See [this page](#generative-models) for more information on how to use generativ - Molmo - T + I - `allenai/Molmo-7B-D-0924`, `allenai/Molmo-72B-0924`, etc. - - + - ✅︎ - ✅︎ - ✅︎ * - `NVLM_D_Model` diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 5d52d2c3e6b48..cc25be9f5b6a9 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -36,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer @@ -43,7 +44,7 @@ SequenceData) from vllm.transformers_utils.processor import get_processor -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) @@ -1161,8 +1162,8 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_molmo_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_molmo) @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo) -class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - +class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, + SupportsLoRA): hf_to_vllm_mapper = WeightsMapper( orig_to_new_substr={ # vision backbone mapping @@ -1191,6 +1192,32 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): }, ) + packed_modules_mapping = { + "qkv_proj": ["qkv_proj"], + "gate_up_proj": ["gate_up_proj"], # language model + "merged_linear": ["gate_proj", "up_proj"] # image_projector + } + + # LoRA specific attributes + supported_lora_modules = [ + # language model + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", # same name with image_projector + # vision tower + "wq", + "wk", + "wv", + "wo", + "w1", + "w2", + # image_projector + "merged_linear", + ] + embedding_modules = {} + embedding_padding_modules = [] + # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = { "gate_proj": ("merged_linear", 0), @@ -1202,8 +1229,10 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config + lora_config = vllm_config.lora_config self.config = config self.multimodal_config = multimodal_config + self.lora_config = lora_config vision_config = VisionBackboneConfig() self.vision_backbone = MolmoVisionBackbone(config, vision_config, @@ -1377,6 +1406,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weights = _get_weights_with_merged_embedding(weights) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="model", + connector="vision_backbone.image_projector", + tower_model="vision_backbone", + ) + def _get_weights_with_merged_embedding( weights: Iterable[Tuple[str, torch.Tensor]] From 74fa1d123c2818065d862d2ceb2338468914fa79 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Mon, 30 Dec 2024 22:43:54 -0500 Subject: [PATCH 035/462] [Bugfix] Fix OpenAI parallel sampling when using xgrammar (#11637) Signed-off-by: mgoin --- tests/entrypoints/openai/test_completion.py | 14 ++++++-------- .../guided_decoding/xgrammar_decoding.py | 5 +++++ vllm/sampling_params.py | 9 +++++---- vllm/sequence.py | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py index c81cfdbbe5cff..183d900c493e5 100644 --- a/tests/entrypoints/openai/test_completion.py +++ b/tests/entrypoints/openai/test_completion.py @@ -28,6 +28,8 @@ # need to change to match the prompt adapter PA_NUM_VIRTUAL_TOKENS = 8 +GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"] + @pytest.fixture(scope="module") def zephyr_lora_files(): @@ -635,8 +637,7 @@ async def test_allowed_token_ids(client: openai.AsyncOpenAI): @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_json_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_json_schema): @@ -658,8 +659,7 @@ async def test_guided_json_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_regex_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_regex): @@ -680,8 +680,7 @@ async def test_guided_regex_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_choice_completion(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_guided_choice): @@ -761,8 +760,7 @@ async def test_echo_logprob_completion(client: openai.AsyncOpenAI, @pytest.mark.asyncio -@pytest.mark.parametrize("guided_decoding_backend", - ["outlines", "lm-format-enforcer"]) +@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS) async def test_guided_decoding_type_error(client: openai.AsyncOpenAI, guided_decoding_backend: str, sample_json_schema, sample_regex): diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py index 5e1948977bff4..f10a8fb8e03cf 100644 --- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py +++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py @@ -1,6 +1,7 @@ # noqa: UP007 from __future__ import annotations +import copy import json from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any @@ -309,3 +310,7 @@ def __call__(self, input_ids: list[int], scores = scores.to(device_type).squeeze() return scores + + def clone(self) -> XGrammarLogitsProcessor: + """Deepcopy due to per-sequence state in the matchers""" + return copy.deepcopy(self) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index fc77f3ca529b2..605c09b8d7225 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -450,15 +450,16 @@ def all_stop_token_ids(self) -> Set[int]: return self._all_stop_token_ids def clone(self) -> "SamplingParams": - """Deep copy excluding LogitsProcessor objects. + """Deep copy, but maybe not the LogitsProcessor objects. - LogitsProcessor objects are excluded because they may contain an - arbitrary, nontrivial amount of data. + LogitsProcessor objects may contain an arbitrary, nontrivial amount of + data that is expensive to copy. However, if not copied, the processor + needs to support parallel decoding for multiple sequences See https://github.com/vllm-project/vllm/issues/3087 """ logit_processor_refs = None if self.logits_processors is None else { - id(lp): lp + id(lp): lp.clone() if hasattr(lp, 'clone') else lp for lp in self.logits_processors } return copy.deepcopy(self, memo=logit_processor_refs) diff --git a/vllm/sequence.py b/vllm/sequence.py index 34f910d47b7d9..034f89c0ddbe9 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -1372,7 +1372,7 @@ class ParallelSampleSequenceGroup(SequenceGroupBase): @staticmethod def add_request(request_id: str, engine, params, **kwargs): original_params = params - params = copy.deepcopy(original_params) + params = original_params.clone() params.n = 1 group = ParallelSampleSequenceGroup(request_id) seqs = [] From 82c49d3260f1fb9fcd686736e8439dc69cd2f1c4 Mon Sep 17 00:00:00 2001 From: John Giorgi Date: Tue, 31 Dec 2024 01:15:58 -0500 Subject: [PATCH 036/462] [Misc][LoRA] Support Rank Stabilized LoRA (RSLoRA) (#6909) Signed-off-by: Jee Jee Li Co-authored-by: Jee Jee Li --- tests/lora/test_lora_manager.py | 20 +++++++++++++------- vllm/lora/lora.py | 12 +++--------- vllm/lora/models.py | 2 +- vllm/lora/peft_helper.py | 18 +++++++++++++----- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 0b76f466702fc..a099f36b0a465 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -1,4 +1,5 @@ import json +import math import os from typing import Dict, List @@ -50,6 +51,18 @@ def test_peft_helper(sql_lora_files): "embed_tokens", "lm_head", ] + scaling = peft_helper.lora_alpha / peft_helper.r + assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 + + # test RSLoRA + config = dict(r=8, + lora_alpha=16, + target_modules=["gate_proj"], + use_rslora=True) + peft_helper = PEFTHelper.from_dict(config) + + scaling = peft_helper.lora_alpha / math.sqrt(peft_helper.r) + assert abs(peft_helper.vllm_lora_scaling_factor - scaling) < 1e-3 expected_error = "vLLM only supports modules_to_save being None." with pytest.raises(ValueError, match=expected_error): @@ -60,13 +73,6 @@ def test_peft_helper(sql_lora_files): modules_to_save=["lm_head"], ) PEFTHelper.from_dict(config) - expected_error = "vLLM does not yet support RSLoRA." - with pytest.raises(ValueError, match=expected_error): - config = dict(r=8, - lora_alpha=16, - target_modules=["gate_proj"], - use_rslora=True) - PEFTHelper.from_dict(config) expected_error = "vLLM does not yet support DoRA." with pytest.raises(ValueError, match=expected_error): diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index dde347b78bf81..93ad4651f4b77 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -67,15 +67,9 @@ def from_config( peft_helper: PEFTHelper, embeddings_tensor: Optional[torch.Tensor] = None, ) -> "LoRALayerWeights": - return cls( - module_name, - peft_helper.r, - peft_helper.lora_alpha, - None, - None, - None, - embeddings_tensor, - ) + return cls(module_name, peft_helper.r, peft_helper.lora_alpha, None, + None, None, embeddings_tensor, + peft_helper.vllm_lora_scaling_factor) @classmethod def create_dummy_lora_weights( diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 5c0e4e5cbc636..9cfcc6bba727f 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -173,7 +173,7 @@ def from_lora_tensors( return cls(lora_model_id, peft_helper.r, loras, - scaling_factor=peft_helper.vllm_scaling_factor) + scaling_factor=peft_helper.vllm_long_context_scaling_factor) @classmethod def from_local_checkpoint( diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py index edf4ba5659575..ddd42ae93d290 100644 --- a/vllm/lora/peft_helper.py +++ b/vllm/lora/peft_helper.py @@ -4,6 +4,8 @@ from dataclasses import MISSING, dataclass, field, fields from typing import Literal, Optional, Union +from vllm.utils import print_info_once + @dataclass class PEFTHelper: @@ -14,21 +16,22 @@ class PEFTHelper: bias: Literal["none", "all", "lora_only"] = field(default="none") modules_to_save: Optional[list[str]] = field(default=None) + # True to use Rank-Stabilized LoRA (rsLoRA, see: https://arxiv.org/abs/2312.03732) use_rslora: bool = field(default=False) + # True to use Weight-Decomposed Low-Rank Adaptation (DoRA, see: https://arxiv.org/abs/2402.09353) use_dora: bool = field(default=False) - # long lora field + # long context lora field context_length: int = field(default=0) # Extra vllm field, start with 'vllm_' to avoid conflict + vllm_lora_scaling_factor: float = field(default=1.0) vllm_max_position_embeddings: Optional[int] = field(default=False) - vllm_scaling_factor: Optional[float] = field(default=None) + vllm_long_context_scaling_factor: Optional[float] = field(default=None) def _validate_features(self): error_msg = [] if self.modules_to_save: error_msg.append("vLLM only supports modules_to_save being None.") - if self.use_rslora: - error_msg.append("vLLM does not yet support RSLoRA.") if self.use_dora: error_msg.append("vLLM does not yet support DoRA.") @@ -38,10 +41,15 @@ def _validate_features(self): def __post_init__(self): self._validate_features() + if self.use_rslora: + print_info_once("Loading LoRA weights trained with rsLoRA.") + self.vllm_lora_scaling_factor = self.lora_alpha / math.sqrt(self.r) + else: + self.vllm_lora_scaling_factor = self.lora_alpha / self.r if self.context_length: if self.vllm_max_position_embeddings is None: self.vllm_max_position_embeddings = self.context_length - self.vllm_scaling_factor = float( + self.vllm_long_context_scaling_factor = float( math.ceil(self.context_length / self.vllm_max_position_embeddings)) From 2c5718809bb5f4bce2ae8e05041d613215dac1aa Mon Sep 17 00:00:00 2001 From: sakunkun Date: Tue, 31 Dec 2024 14:29:04 +0800 Subject: [PATCH 037/462] [Bugfix] Move the _touch(computed_blocks) call in the allocate_slots method to after the check for allocating new blocks. (#11565) --- tests/v1/core/test_prefix_caching.py | 63 +++++++++++++++++++++++++++- vllm/v1/core/kv_cache_manager.py | 19 ++++++--- 2 files changed, 74 insertions(+), 8 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index ed04f0a373c51..dafaa6aee9995 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -98,9 +98,9 @@ def test_prefill(): # Incomplete 1 block (6 tokens) unique_token_ids = [3] * 6 req2 = make_request("2", common_token_ids + unique_token_ids) - computed_block = manager.get_computed_blocks(req2) + computed_blocks = manager.get_computed_blocks(req2) assert len(req2.kv_block_hashes) == 3 - assert [b.block_id for b in computed_block] == [0, 1, 2] + assert [b.block_id for b in computed_blocks] == [0, 1, 2] num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks) assert [b.block_id for b in blocks] == [7, 8] @@ -500,3 +500,62 @@ def test_mm_prefix_caching(): mm_hashes=mm_hashes) computed_blocks = manager.get_computed_blocks(req1) assert len(computed_blocks) == 3 + + +def test_prefill_not_enough_free_blocks_with_computed_blocks(): + """ + This is a unit test that tests the correctness of the allocate_slots + when there is not enough free blocks. Specifically, when a request + has computed blocks but cannot be allocated due to not enough free blocks, + the computed blocks should not be touched. + """ + block_size = 16 + manager = KVCacheManager( + block_size=block_size, + num_gpu_blocks=10, + max_model_len=8192, + sliding_window=None, + enable_caching=True, + num_preallocate_tokens=0, + ) + # Complete 3 blocks (48 tokens) + # | Common-0 | Common-1 | Common-2 | ... | + common_token_ids = [i for i in range(3) for _ in range(16)] + req0 = make_request("0", common_token_ids) + computed_blocks = manager.get_computed_blocks(req0) + assert not computed_blocks + manager.allocate_slots(req0, 48, computed_blocks) + block_part0 = manager.req_to_blocks[req0.request_id] + + # | Common-0 | Common-1 | Common-2 | Req1-3 | Req1-4 | Req1-5 | ... | + req1 = make_request("1", common_token_ids * 2) + computed_blocks = manager.get_computed_blocks(req1) + assert computed_blocks == block_part0 + manager.allocate_slots(req1, 48, computed_blocks) + block_part1 = manager.req_to_blocks[req1.request_id] + # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) | + # | Req1-5(F)| ... | + manager.free(req1) + assert {block.ref_cnt for block in block_part1[:3]} == {1} + assert {block.ref_cnt for block in block_part1[3:]} == {0} + + # | Common-0 | Common-1 | Common-2 | Req1-3 (F) | Req1-4 (F) | + # | Req1-5(F)| Req2-0 | Req2-1 | ... | + req2 = make_request("2", [7] * block_size * 2) + computed_blocks = manager.get_computed_blocks(req2) + assert not computed_blocks + manager.allocate_slots(req2, block_size * 2, computed_blocks) + + # Req3 is Req2 + 3 new blocks, so the first 6 blocks are computed, + # but it cannot be allocated due to insufficient free blocks (2). + # In this case, the ref_cnt of the computed blocks should not be changed. + assert manager.free_block_queue.num_free_blocks == 5 + req3 = make_request("3", common_token_ids * 3) + computed_blocks = manager.get_computed_blocks(req3) + assert computed_blocks == block_part1 + # Req3 cannot be allocated. + assert manager.allocate_slots(req3, 48, computed_blocks) is None + # Block 0-2 are used by Req 1. + assert {block.ref_cnt for block in block_part1[:3]} == {1} + # Block 3-5 are free. + assert {block.ref_cnt for block in block_part1[3:]} == {0} diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 78efacccfa078..00d0de51634ae 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -191,7 +191,7 @@ def allocate_slots( request: The request to allocate slots. num_tokens: The number of tokens to allocate. Note that this does not include the tokens that have already been computed. - computed_blocks: The blocks that have already been computed. + computed_blocks: A list of computed blocks. Returns: A list of new allocated blocks. @@ -200,6 +200,18 @@ def allocate_slots( raise ValueError( f"num_tokens must be greater than 0, got {num_tokens}") + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it cannot be counted as a free block + # when allocating this request. + num_evictable_computed_blocks = sum(1 for blk in computed_blocks + if blk.ref_cnt == 0) + + num_required_blocks = cdiv(num_tokens, self.block_size) + if (num_required_blocks > self.free_block_queue.num_free_blocks - + num_evictable_computed_blocks): + # Cannot allocate new blocks. + return None + # Touch the computed blocks to make sure they won't be evicted. if self.enable_caching: self._touch(computed_blocks) @@ -208,11 +220,6 @@ def allocate_slots( "Computed blocks should be empty when " "prefix caching is disabled") - num_required_blocks = cdiv(num_tokens, self.block_size) - if (num_required_blocks > self.free_block_queue.num_free_blocks): - # Cannot allocate new blocks. - return None - # Determine the number of new blocks to allocate considering # preallocated blocks. num_new_blocks = min( From 8c3230d8c1cf114618c2316c54bf06b7d0c198b6 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 31 Dec 2024 16:56:01 +0800 Subject: [PATCH 038/462] [V1] Simpify vision block hash for prefix caching by removing offset from hash (#11646) --- tests/v1/core/test_prefix_caching.py | 8 ++++---- vllm/v1/core/kv_cache_utils.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index dafaa6aee9995..35e3a2f972720 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -469,9 +469,9 @@ def test_mm_prefix_caching(): # Completed block should have hashes with extra keys. assert not computed_blocks assert len(req0.kv_block_hashes) == 3 - assert req0.kv_block_hashes[0].extra_keys == (("aaa", 0), ) - assert req0.kv_block_hashes[1].extra_keys == (("aaa", 5), ("bbb", 0)) - assert req0.kv_block_hashes[2].extra_keys == (("bbb", 2), ) + assert req0.kv_block_hashes[0].extra_keys == ("aaa", ) + assert req0.kv_block_hashes[1].extra_keys == ("aaa", "bbb") + assert req0.kv_block_hashes[2].extra_keys == ("bbb", ) blocks = manager.allocate_slots(req0, 59, computed_blocks) assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4] @@ -485,7 +485,7 @@ def test_mm_prefix_caching(): # The just completed block should have hashes with extra keys. assert len(req0.kv_block_hashes) == 4 - assert req0.kv_block_hashes[3].extra_keys == (("ccc", 0), ) + assert req0.kv_block_hashes[3].extra_keys == ("ccc", ) # Cache hit. unique_token_ids = [-1] * 7 + [200] * 5 diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 9ddbff7c9a604..84ff48bf428a0 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -218,8 +218,8 @@ def generate_block_hash_extra_keys( continue # The block contains the current mm input. - mm_start = max(0, start_token_idx - offset) - extra_keys.append((mm_hashes[curr_mm_idx], mm_start)) + extra_keys.append(mm_hashes[curr_mm_idx]) + if end_token_idx >= offset + length: # If this block contains the end of the current mm input, # move to the next mm input as this block may also contain From e7c7c5e822a886e3dba202ca1b756c3260efffcc Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Tue, 31 Dec 2024 13:17:22 -0800 Subject: [PATCH 039/462] [V1][VLM] V1 support for selected single-image models. (#11632) Signed-off-by: Roger Wang Signed-off-by: DarkLight1337 Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: DarkLight1337 Co-authored-by: Isotr0py <2037008807@qq.com> --- docs/source/models/supported_models.md | 10 +- examples/offline_inference_vision_language.py | 10 +- .../vision_language/test_models.py | 7 +- tests/multimodal/test_processing.py | 29 +- vllm/model_executor/models/aria.py | 169 ++++---- vllm/model_executor/models/blip.py | 92 ----- vllm/model_executor/models/blip2.py | 172 ++++---- vllm/model_executor/models/chameleon.py | 191 ++++----- vllm/model_executor/models/fuyu.py | 381 +++++++++--------- .../models/idefics2_vision_model.py | 6 +- vllm/model_executor/models/llava.py | 4 +- vllm/model_executor/models/llava_next.py | 6 +- vllm/model_executor/models/pixtral.py | 12 +- vllm/model_executor/models/qwen2_audio.py | 14 +- vllm/model_executor/models/qwen2_vl.py | 17 +- vllm/model_executor/models/ultravox.py | 13 +- vllm/multimodal/processing.py | 68 +++- vllm/multimodal/utils.py | 10 +- vllm/v1/worker/gpu_model_runner.py | 15 +- 19 files changed, 590 insertions(+), 636 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 613343281464c..f74c201bdff6b 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -570,28 +570,28 @@ See [this page](#generative-models) for more information on how to use generativ - `rhymes-ai/Aria` - - ✅︎ - - + - ✅︎ * - `Blip2ForConditionalGeneration` - BLIP-2 - T + IE - `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. - - ✅︎ - - + - ✅︎ * - `ChameleonForConditionalGeneration` - Chameleon - T + I - `facebook/chameleon-7b` etc. - - ✅︎ - - + - ✅︎ * - `FuyuForCausalLM` - Fuyu - T + I - `adept/fuyu-8b` etc. - - ✅︎ - - + - ✅︎ * - `ChatGLMModel` - GLM-4V - T + I @@ -633,7 +633,7 @@ See [this page](#generative-models) for more information on how to use generativ - `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - - ✅︎ - - + - ✅︎ * - `LlavaNextVideoForConditionalGeneration` - LLaVA-NeXT-Video - T + V diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 77af914a6ef02..b51bfae455267 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -24,10 +24,13 @@ def run_aria(question: str, modality: str): assert modality == "image" model_name = "rhymes-ai/Aria" + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM(model=model_name, tokenizer_mode="slow", - trust_remote_code=True, dtype="bfloat16", + max_model_len=4096, + max_num_seqs=2, + trust_remote_code=True, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) prompt = (f"<|im_start|>user\n<|img|>\n{question}" @@ -57,6 +60,7 @@ def run_chameleon(question: str, modality: str): prompt = f"{question}" llm = LLM(model="facebook/chameleon-7b", max_model_len=4096, + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) stop_token_ids = None return llm, prompt, stop_token_ids @@ -257,7 +261,7 @@ def run_minicpmv(question: str, modality: str): # 2.5 # model_name = "openbmb/MiniCPM-Llama3-V-2_5" - #2.6 + # 2.6 model_name = "openbmb/MiniCPM-V-2_6" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) @@ -430,9 +434,11 @@ def run_pixtral_hf(question: str, modality: str): model_name = "mistral-community/pixtral-12b" + # NOTE: Need L40 (or equivalent) to avoid OOM llm = LLM( model=model_name, max_model_len=8192, + max_num_seqs=2, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 1a9c1b4ef1be0..7db08166826eb 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -140,10 +140,7 @@ "aria": VLMTestInfo( models=["rhymes-ai/Aria"], tokenizer_mode="slow", - test_type=( - VLMTestType.IMAGE, - VLMTestType.MULTI_IMAGE, - ), + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), dtype="bfloat16", prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n ", # noqa: E501 img_idx_to_prompt=lambda idx: "<|img|>\n", @@ -179,6 +176,7 @@ test_type=VLMTestType.IMAGE, prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", max_model_len=4096, + max_num_seqs=2, auto_cls=AutoModelForVision2Seq, postprocess_inputs=model_utils.cast_dtype_post_processor( "pixel_values" @@ -201,7 +199,6 @@ vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], - marks=[large_gpu_mark(min_gb=48)], ), "glm4": VLMTestInfo( models=["THUDM/glm-4v-9b"], diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 1b2847ed0f534..81278cde264ff 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -528,7 +528,7 @@ def _rand_audio( def _test_processing_cache_correctness( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -583,9 +583,8 @@ def _test_processing_cache_correctness( partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000), } input_max_count = { - "image": 3, - "video": 3, - "audio": 3, + modality: 3 if supports_multi else 1 + for modality, supports_multi in modalities.items() } for batch_idx in range(num_batches): @@ -624,12 +623,16 @@ def _test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ - ("llava-hf/llava-1.5-7b-hf", {"image"}), - ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image"}), - ("mistral-community/pixtral-12b", {"image"}), - ("Qwen/Qwen2-VL-2B-Instruct", {"image", "video"}), - ("Qwen/Qwen2-Audio-7B-Instruct", {"audio"}), - ("fixie-ai/ultravox-v0_3", {"audio"}), + ("rhymes-ai/Aria", {"image": True}), + ("Salesforce/blip2-opt-2.7b", {"image": False}), + ("facebook/chameleon-7b", {"image": True}), + ("adept/fuyu-8b", {"image": False}), + ("llava-hf/llava-1.5-7b-hf", {"image": True}), + ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), + ("mistral-community/pixtral-12b", {"image": True}), + ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), + ("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}), + ("fixie-ai/ultravox-v0_3", {"audio": True}), ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -637,7 +640,7 @@ def _test_processing_cache_correctness( # yapf: enable def test_processing_cache_correctness( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, @@ -653,7 +656,7 @@ def test_processing_cache_correctness( # yapf: disable @pytest.mark.parametrize(("model_id", "modalities"), [ - ("microsoft/Phi-3-vision-128k-instruct", {"image"}), + ("microsoft/Phi-3-vision-128k-instruct", {"image": True}), ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) @pytest.mark.parametrize("num_batches", [32]) @@ -661,7 +664,7 @@ def test_processing_cache_correctness( # yapf: enable def test_processing_cache_correctness_phi3v( model_id: str, - modalities: set[str], + modalities: dict[str, bool], hit_rate: float, num_batches: int, simplify_rate: float, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 9437ad9688422..4ad6e859f4d93 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,15 +1,15 @@ -import math -from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import torch import torch.nn as nn from torch.nn.init import trunc_normal_ -from transformers import LlamaConfig +from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_rank -from vllm.inputs import INPUT_REGISTRY, token_inputs +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -17,30 +17,27 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) -from vllm.model_executor.layers.sampler import (Sampler, SamplerOutput, - SamplingMetadata) +from vllm.model_executor.layers.sampler import (SamplerOutput, + SamplingMetadata, get_sampler) from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) -from vllm.model_executor.models.idefics2_vision_model import ( - Idefics2VisionTransformer) -from vllm.model_executor.models.interfaces import SupportsMultiModal -from vllm.model_executor.models.llama import (LlamaDecoderLayer, LlamaMLP, - LlamaModel) -from vllm.model_executor.models.utils import (AutoWeightsLoader, WeightsMapper, - is_pp_missing_parameter, - maybe_prefix, - merge_multimodal_embeddings) from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) -from .utils import flatten_bn +from .idefics2_vision_model import Idefics2VisionTransformer +from .interfaces import SupportsMultiModal +from .llama import LlamaDecoderLayer, LlamaMLP, LlamaModel +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, + is_pp_missing_parameter, maybe_prefix, + merge_multimodal_embeddings) class AriaImagePixelInputs(TypedDict): @@ -251,7 +248,7 @@ def forward(self, x, attn_mask=None): class AriaFusedMoE(FusedMoE): def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor, - shard_id: str) -> Set[str]: + shard_id: str) -> None: # Override the weight_loader to handle the expert weights in the Aria # model, which are already packed with experts, and merge the gate and # up weights for each expert. @@ -346,7 +343,7 @@ class MoEDecoderLayer(LlamaDecoderLayer): def __init__( self, - config: LlamaConfig, + config: AriaMoELMConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -434,7 +431,7 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -def build_mm_projector(config): +def build_mm_projector(config: PretrainedConfig): return AriaProjector( patch_to_query_dict=config.projector_patch_to_query_dict, embed_dim=config.vision_config.hidden_size, @@ -445,75 +442,70 @@ def build_mm_projector(config): ) -def get_max_multimodal_tokens(ctx): - return max(ctx.model_config.hf_config.image_size2tokens.values()) - - -def input_mapper_for_aria(ctx, data): - return MultiModalKwargs(data) +def get_max_aria_image_tokens(ctx: InputContext): + hf_config = ctx.get_hf_config() + return max(hf_config.projector_patch_to_query_dict.values()) -def input_processor(ctx, llm_inputs): - multi_modal_data = llm_inputs.get("multi_modal_data") - # if it is pure text input, use it as is - if multi_modal_data is None or "image" not in multi_modal_data: - return llm_inputs +class AriaMultiModalProcessor(BaseMultiModalProcessor): - model_config = ctx.model_config - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) - hf_config = model_config.hf_config + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + pixel_mask=MultiModalFieldConfig.batched("image"), + ) - # prepare image tokens, the max_image_size is used to determine the number - # of patch_size for every image - max_image_size = multi_modal_data.pop("max_image_size", 980) - _split_image = multi_modal_data.pop("split_image", False) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.ctx.get_hf_config() + image_token_id = hf_config.image_token_index + + max_image_tokens = get_max_aria_image_tokens(self.ctx) + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=[image_token_id] * max_image_tokens, + ) + ] - assert isinstance(max_image_size, - (int, float)), "max_image_size should be float or int" - images = (multi_modal_data["image"] if isinstance( - multi_modal_data["image"], list) else [multi_modal_data["image"]]) + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config() + vision_config: AriaVisionConfig = hf_config.vision_config + + max_image_size = vision_config.image_size + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } - image_inputs = image_processor.preprocess(images, - max_image_size=max_image_size, - split_image=_split_image, - return_tensors="pt").data - image_inputs['pixel_values'] = image_inputs['pixel_values'].to( - ctx.model_config.dtype) - num_crops = image_inputs.pop("num_crops") + hf_processor = self._get_hf_processor() + image_token: str = hf_processor.image_token # type: ignore - prompt_token_ids = llm_inputs["prompt_token_ids"] - if num_crops.sum().item() > 0: - _, prompt_token_ids, _ = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=num_crops, + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, ) - repeat_count = [hf_config.image_size2tokens[max_image_size] - ] * sum(num_crops).item() - new_prompt, new_token_ids, _ = repeat_and_pad_placeholder_tokens( - tokenizer, - None, - prompt_token_ids, - placeholder_token_id=hf_config.image_token_index, - repeat_count=repeat_count, - ) - - return token_inputs( - prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data={"image": image_inputs}, - ) - -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_multimodal_tokens) -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_aria) -@INPUT_REGISTRY.register_input_processor(input_processor) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_aria_image_tokens) +@MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): """ Aria model for conditional generation tasks. @@ -540,12 +532,6 @@ def __init__( config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - # prepare the image_size to tokens mapping for the image preprocess, see - # input_processor - config.image_size2tokens = { - int(math.sqrt(k) * config.vision_config.patch_size): v - for k, v in config.projector_patch_to_query_dict.items() - } self.config = config self.vision_tower = AriaVisionModel(config.vision_config) self.multi_modal_projector = build_mm_projector(config) @@ -566,7 +552,7 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, self.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() def _validate_image_sizes( self, images: List[torch.Tensor]) -> List[torch.Tensor]: @@ -588,7 +574,12 @@ def _parse_and_validate_image_input( pixel_values = self._validate_image_sizes(pixel_values) pixel_values = flatten_bn(pixel_values, concat=True) + if pixel_mask is not None: + if not isinstance(pixel_mask, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel mask. " + f"Got type: {type(pixel_mask)}") + pixel_mask = flatten_bn(pixel_mask, concat=True) return AriaImagePixelInputs( diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 42a239cadac46..987dfaf44f228 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -4,22 +4,16 @@ import torch import torch.nn as nn -from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig from vllm.attention.layer import MultiHeadAttention -from vllm.config import ModelConfig from vllm.distributed import divide, get_tensor_model_parallel_world_size -from vllm.inputs import DecoderOnlyInputs, token_inputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import SequenceData def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -33,92 +27,6 @@ def get_blip_num_patches(*, image_size: int, patch_size: int) -> int: return grid_length * grid_length -def get_blip_image_feature_size( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_num_patches(image_size=hf_config.image_size, - patch_size=hf_config.patch_size) - - -def get_max_blip_image_tokens( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig]) -> int: - return get_blip_image_feature_size(hf_config) - - -def dummy_seq_data_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ) - - -def dummy_image_for_blip( - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = height = hf_config.image_size - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def input_processor_for_blip( - model_config: ModelConfig, - hf_config: Union[BlipVisionConfig, Blip2VisionConfig], - inputs: DecoderOnlyInputs, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - - if image_feature_size_override is None: - image_feature_size = get_blip_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=image_token_id, - repeat_count=image_feature_size, - ) - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"image": ranges}) - - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 76b8505ee1c2a..bf70f5d904f5b 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -4,32 +4,33 @@ import torch import torch.nn as nn -from transformers import (Blip2Config, Blip2QFormerConfig, Blip2VisionConfig, - apply_chunking_to_forward) +from transformers import (BatchFeature, Blip2Config, Blip2Processor, + Blip2QFormerConfig, apply_chunking_to_forward) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import consecutive_placeholder_ranges -from vllm.sequence import IntermediateTensors, SequenceData - -from .blip import (BlipVisionModel, dummy_image_for_blip, - get_max_blip_image_tokens) +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors + +from .blip import BlipVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) # We use this internally as placeholders since there is no image token # defined on the HuggingFace repo -BLIP2_IMAGE_TOKEN = "" -BLIP2_IMAGE_TOKEN_ID = 50265 +_IMAGE_TOKEN_ID = 50265 class Blip2ImagePixelInputs(TypedDict): @@ -396,92 +397,87 @@ def forward( return sequence_output -def get_blip2_image_feature_size(hf_config: Blip2Config) -> int: - return hf_config.num_query_tokens - - def get_max_blip2_image_tokens(ctx: InputContext): hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - - if isinstance(vision_config, Blip2VisionConfig): - return get_max_blip_image_tokens(vision_config) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def dummy_seq_data_for_blip2( - hf_config: Blip2Config, - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = get_blip2_image_feature_size(hf_config) - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_data_for_blip2(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - num_images = mm_counts["image"] - - seq_data, ranges = dummy_seq_data_for_blip2( - hf_config, - seq_len, - num_images, - image_token_id=BLIP2_IMAGE_TOKEN_ID, - ) - - if isinstance(vision_config, Blip2VisionConfig): - mm_data = dummy_image_for_blip(vision_config, num_images) - - return DummyData(seq_data, mm_data, ranges) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + return hf_config.num_query_tokens -def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs +class Blip2MultiModalProcessor(BaseMultiModalProcessor): - hf_config = ctx.get_hf_config(Blip2Config) - image_feature_size = get_blip2_image_feature_size(hf_config) + def _get_hf_processor(self) -> Blip2Processor: + return self.ctx.get_hf_processor(Blip2Processor) - # The original model places image tokens at the front - # https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/blip_2/modeling_blip_2.py#L1514 - new_token_ids = [BLIP2_IMAGE_TOKEN_ID] * image_feature_size - new_token_ids += inputs["prompt_token_ids"] + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) - new_prompt = inputs.get("prompt") - if new_prompt is not None: - new_prompt = BLIP2_IMAGE_TOKEN * image_feature_size + new_prompt + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + max_image_tokens = get_max_blip2_image_tokens(self.ctx) + + return [ + PromptReplacement( + modality="image", + target="", + replacement="" * max_image_tokens + "", + ) + ] - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data) + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only tokens should be considered as placeholders, + # so we ignore the trailing bos_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self.ctx.get_hf_config(Blip2Config) + vision_config = hf_config.vision_config + + max_image_size = vision_config.image_size + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) -@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_blip2) -@INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) +@MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -627,7 +623,7 @@ def get_input_embeddings( if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, multimodal_embeddings, - BLIP2_IMAGE_TOKEN_ID) + _IMAGE_TOKEN_ID) return inputs_embeds def forward( diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index a40c321ce0a58..85fca23b05746 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -3,16 +3,15 @@ Tuple, TypedDict, Union) import torch +import torch.nn as nn import torch.nn.functional as F -from PIL import Image -from torch import nn -from transformers import ChameleonConfig, ChameleonVQVAEConfig +from transformers import (BatchFeature, ChameleonConfig, ChameleonProcessor, + ChameleonVQVAEConfig) from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -29,11 +28,13 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges, - repeat_and_pad_placeholder_tokens) -from vllm.sequence import IntermediateTensors, SequenceData +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once from .interfaces import SupportsMultiModal, SupportsPP @@ -45,10 +46,6 @@ # and processor files, so we hardcode them in the model file for now. CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512 CHAMELEON_IMAGE_SEQ_LENGTH = 1024 -CHAMELEON_IMAGE_TOKEN_ID = 8711 -CHAMELEON_IMAGE_START_TOKEN_ID = 8197 -CHAMELEON_IMAGE_END_TOKEN_ID = 8196 -CHAMELEON_SEP_TOKEN_ID = 8710 class ChameleonImagePixelInputs(TypedDict): @@ -61,99 +58,75 @@ def get_max_chameleon_image_tokens(ctx: InputContext): return CHAMELEON_IMAGE_SEQ_LENGTH -def dummy_seq_data_for_chameleon( - seq_len: int, - num_images: int, - *, - image_token_id: int, - image_feature_size_override: Optional[int] = None, -): - if image_feature_size_override is None: - image_feature_size = CHAMELEON_IMAGE_SEQ_LENGTH - else: - image_feature_size = image_feature_size_override - - return SequenceData.from_prompt_token_counts( - (image_token_id, image_feature_size * num_images), - (0, seq_len - image_feature_size * num_images), - ), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_image_for_chameleon( - num_images: int, - *, - image_width_override: Optional[int] = None, - image_height_override: Optional[int] = None, -): - width = CHAMELEON_CROP_SIZE_WIDTH - height = CHAMELEON_CROP_SIZE_HEIGHT - if image_width_override is not None: - width = image_width_override - if image_height_override is not None: - height = image_height_override - - image = Image.new("RGB", (width, height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def dummy_data_for_chameleon(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - - seq_data, ranges = dummy_seq_data_for_chameleon( - seq_len, - num_images, - image_token_id=CHAMELEON_IMAGE_TOKEN_ID, - ) - - mm_data = dummy_image_for_chameleon(num_images) - return DummyData(seq_data, mm_data, ranges) - - -def input_processor_for_chameleon(ctx: InputContext, - inputs: DecoderOnlyInputs): +class ChameleonMultiModalProcessor(BaseMultiModalProcessor): - """ - Processing input prompt to insert required tokens for image placeholder. - - See https://github.com/huggingface/transformers/blob/0fdea8607d7e01eb0e38a1ebeb7feee30a22f0cf/src/transformers/models/chameleon/processing_chameleon.py#L58 - """ # noqa - - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - if "multi_modal_placeholders" in inputs and "image" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs - - model_config = ctx.model_config - tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=CHAMELEON_IMAGE_TOKEN_ID, - repeat_count=CHAMELEON_IMAGE_SEQ_LENGTH, - pad_token_left=CHAMELEON_IMAGE_START_TOKEN_ID, - pad_token_right=CHAMELEON_IMAGE_END_TOKEN_ID, - ) - - # Appending sep token for chat mode to follow default processor - # behavior - if new_prompt is not None: - new_prompt += tokenizer.sep_token - new_token_ids += [CHAMELEON_SEP_TOKEN_ID] - - # NOTE: Create a defensive copy of the original inputs - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data) + def _get_hf_processor(self) -> ChameleonProcessor: + return self.ctx.get_hf_processor(ChameleonProcessor) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image")) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + processor = self._get_hf_processor() + + return [ + PromptReplacement( + modality="image", + target="", + replacement="".join([ + processor.image_start_token, + processor.image_token * CHAMELEON_IMAGE_SEQ_LENGTH, + processor.image_end_token, + ]), + ) + ] + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=CHAMELEON_CROP_SIZE_WIDTH, + height=CHAMELEON_CROP_SIZE_HEIGHT, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="" * num_images, + mm_data=mm_data, + ) + + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only tokens should be considered as placeholders, + # so we ignore the image_start_token and image_end_token + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"] + 1, + length=p["length"] - 2) for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result class ChameleonLayerNorm(nn.LayerNorm): @@ -736,7 +709,7 @@ def forward(self, pixel_values: torch.Tensor): for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): hidden_state = self.down[i_level].block[i_block]( - hidden_states[-1], ) + hidden_states[-1]) if len(self.down[i_level].attn) > 0: hidden_state = self.down[i_level].attn[i_block]( hidden_state) @@ -925,10 +898,8 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_chameleon) -@INPUT_REGISTRY.register_input_processor(input_processor_for_chameleon) +@MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 6e86900326c4b..8c14866f20b92 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -15,32 +15,30 @@ # limitations under the License. """ PyTorch Fuyu model.""" import math -from array import array from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict) import torch import torch.nn as nn -import torch.utils.checkpoint -from PIL import Image -from transformers import FuyuImageProcessor +from transformers import (BatchFeature, FuyuConfig, FuyuImageProcessor, + FuyuProcessor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) +from vllm.inputs import InputContext from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.image import cached_get_image_processor -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - consecutive_placeholder_ranges) -from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, - SequenceData) -from vllm.utils import is_list_of +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, + MultiModalInputsV2, MultiModalKwargs, + NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessorInputs, + PromptReplacement) +from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, @@ -54,178 +52,193 @@ MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920 -class FuyuImagePixelInputs(TypedDict): - type: Literal["pixel_values"] +class FuyuImagePatchInputs(TypedDict): + type: Literal["image_patches"] data: torch.Tensor """ Shape: - (batch_size, num_patches, patch_size_x * patch_size_y * num_channels) + `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` + """ + + patches_per_image: List[int] + """ + List of number of total patches for each image in the batch. + This is used to restore the first two dimensions of `data`. """ -def _calculate_num_image_tokens( - height: int, - width: int, +def _get_fuyu_num_image_tokens( + image_height: int, + image_width: int, ) -> Tuple[int, int]: """ - calculate number of image tokens needed for a given image size - The expected Fuyu image prompts is in format: - (image_token * ncols + newline_token) * nrows - args: - image_size: Tuple[int, int] - (width, height) of the image - returns: - ncols: int - number of image tokens in x direction - nrows: int - number of image tokens in y direction - """ - ncol = math.ceil(width / 30) - nrow = math.ceil(height / 30) - return ncol, nrow + Calculate the number of image tokens needed for a given image size. + The expected Fuyu image prompts can be expressed as: -def get_max_fuyu_image_feature_size(): + .. code-block:: + (image_token * ncols + newline_token) * nrows - return _calculate_num_image_tokens( - height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - ) + Args: + image_size: Tuple[int, int] - `(width, height)` of the image + + Returns: + ncols: int - number of image tokens in `x` direction + nrows: int - number of image tokens in `y` direction + """ + ncols = math.ceil(image_width / 30) + nrows = math.ceil(image_height / 30) + return ncols, nrows def get_max_fuyu_image_tokens(ctx: InputContext): - ncol, nrow = get_max_fuyu_image_feature_size() - return (ncol + 1) * nrow - - -def dummy_seq_data_for_fuyu(ctx: InputContext, seq_len: int, num_images: int): - ncol, nrow = get_max_fuyu_image_feature_size() - image_feature_size = get_max_fuyu_image_tokens(ctx) - - image_token_ids = ( - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_IMAGE_TOKEN_ID]) * ncol + - array(VLLM_TOKEN_ID_ARRAY_TYPE, [_NEWLINE_TOKEN_ID])) * nrow - token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, image_token_ids) * num_images - token_ids += array(VLLM_TOKEN_ID_ARRAY_TYPE, - [0]) * (seq_len - image_feature_size * num_images) - return SequenceData(token_ids), { - "image": - consecutive_placeholder_ranges(num_items=num_images, - item_size=image_feature_size) - } - - -def dummy_image_for_fuyu( - num_images: int, - *, - image_width: int, - image_height: int, -): - image = Image.new("RGB", (image_width, image_height), color=0) - return {"image": image if num_images == 1 else [image] * num_images} - - -def dummy_data_for_fuyu(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - num_images = mm_counts["image"] - seq_data, ranges = dummy_seq_data_for_fuyu(ctx, seq_len, num_images) - mm_data = dummy_image_for_fuyu(num_images, - image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT) - return DummyData(seq_data, mm_data, ranges) - - -def _fuyu_image_preprocess(image_processor: FuyuImageProcessor, - data: List[Image.Image]): - image_encoding = image_processor.preprocess(data, return_tensors="pt") - batch_images = torch.stack([img[0] for img in image_encoding["images"] - ]).unsqueeze(1) - image_unpadded_heights = torch.tensor( - image_encoding["image_unpadded_heights"]) - image_unpadded_widths = torch.tensor( - image_encoding["image_unpadded_widths"]) - - batch_size = len(image_encoding["images"]) - image_present = torch.ones(batch_size, 1, 1) - model_image_input = image_processor.preprocess_with_tokenizer_info( - image_input=batch_images, - image_present=image_present, - image_unpadded_h=image_unpadded_heights, - image_unpadded_w=image_unpadded_widths, - image_placeholder_id=_IMAGE_TOKEN_ID, - image_newline_id=_NEWLINE_TOKEN_ID, - variable_sized=True, + ncols, nrows = _get_fuyu_num_image_tokens( + image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, ) - return model_image_input - - -def input_processor_for_fuyu(ctx: InputContext, inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - image_data = multi_modal_data["image"] - new_multi_modal_data = {} - image_list = image_data if isinstance(image_data, list) else [image_data] - - # process image data - if is_list_of(image_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) - - model_image_input = _fuyu_image_preprocess(image_processor, image_data) - image_patches = torch.cat([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - new_multi_modal_data["image"] = image_patches - - elif is_list_of(image_list, torch.Tensor): - raise NotImplementedError("Embeddings input is not supported yet") - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - # process prompts - prompt = inputs.get("prompt") - prompt_token_ids = inputs["prompt_token_ids"] - tokenizer = cached_get_tokenizer(model_config.model) - # dim0 is batch_size, dim1 is subseq_size which will always be 1 - image_input_ids: List[List[ - torch.Tensor]] = model_image_input["image_input_ids"] - image_input_ids = image_input_ids[0][0].tolist() - bos_token = tokenizer.encode("", add_special_tokens=False)[1:] - boa_token = tokenizer.encode("\x04", add_special_tokens=False)[1:] - - new_prompt = prompt + "\x04" - new_prompt_token_ids = image_input_ids + bos_token + prompt_token_ids[ - 1:] + boa_token - - return token_inputs(prompt=new_prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=new_multi_modal_data) - - -def input_mapper_for_fuyu(ctx: InputContext, data: object): - model_config = ctx.model_config - data_list = data if isinstance(data, list) else [data] - if is_list_of(data_list, Image.Image): - # Fuyu's image_processor can also finish token padding - image_processor: FuyuImageProcessor = cached_get_image_processor( - model_config.model) - - model_image_input = _fuyu_image_preprocess(image_processor, data_list) - data = torch.stack([ - image_patch[0] - for image_patch in model_image_input["image_patches"] - ]) - - # image has been processed with prompt in input processor - return MultiModalKwargs({"pixel_values": data}) - - -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) + + return (ncols + 1) * nrows + + +class FuyuMultiModalProcessor(BaseMultiModalProcessor): + + def _get_hf_processor(self) -> FuyuProcessor: + return self.ctx.get_hf_processor(FuyuProcessor) + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + + if not mm_data: + # Avoid warning from HF logger for text-only input + # Input_ids format: bos_token_id + prompt_token_ids + boa_token_id + # Tokenizer won't add boa_token_id by default, we add it manually. + tokenizer = self._get_tokenizer() + boa_token_id: int = tokenizer.vocab["<0x04>"] # type: ignore + prompt_ids = tokenizer.encode(prompt) + [boa_token_id] + return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") + + processed_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + image_patches = processed_outputs.get("image_patches") + if image_patches is not None: + images = mm_data["images"] + assert isinstance(images, list) + + # Original output: (1, num_images, Pn, Px * Py * C) + # New output: (num_images, Pn, Px * Py * C) + assert (isinstance(image_patches, list) + and len(image_patches) == 1) + assert (isinstance(image_patches[0], torch.Tensor) + and len(image_patches[0]) == len(images)) + + processed_outputs["image_patches"] = image_patches[0] + + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(image_patches=MultiModalFieldConfig.batched("image")) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self.ctx.get_hf_config(FuyuConfig) + bos_token_id = hf_config.bos_token_id + + tokenizer = self._get_tokenizer() + eot_token_id = tokenizer.bos_token_id + assert isinstance(eot_token_id, int) + + hf_processor = self._get_hf_processor() + image_processor: FuyuImageProcessor = hf_processor.image_processor + target_size = image_processor.size + target_height, target_width = (target_size["height"], + target_size["width"]) + + def get_replacement_fuyu(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) + width, height = image_size.width, image_size.height + if not (width <= target_width and height <= target_height): + height_scale_factor = target_height / height + width_scale_factor = target_width / width + optimal_scale_factor = min(height_scale_factor, + width_scale_factor) + + height = int(height * optimal_scale_factor) + width = int(width * optimal_scale_factor) + + ncols, nrows = _get_fuyu_num_image_tokens( + image_width=width, + image_height=height, + ) + + return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + + [bos_token_id]) + + return [ + PromptReplacement( + modality="image", + target=[eot_token_id], + replacement=get_replacement_fuyu, + ) + ] + + def apply( + self, + prompt_text: str, + mm_data: MultiModalDataDict, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) + + # Only |SPEAKER| (image) tokens should be considered as placeholders, + # so we ignore the trailing bos_token_id + result["mm_placeholders"] = { + modality: [ + PlaceholderRange(offset=p["offset"], length=p["length"] - 1) + for p in ps + ] + for modality, ps in result["mm_placeholders"].items() + } + + return result + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=MAX_IMAGE_FEATURE_SIZE_WIDTH, + height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_fuyu) -@INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu) +@MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): @@ -280,28 +293,32 @@ def _validate_shape(d: torch.Tensor): return data.to(self.vision_embed_tokens.weight.dtype) def _parse_and_validate_image_input( - self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: - pixel_values = kwargs.pop("pixel_values", None) - - if pixel_values is not None: - if not isinstance(pixel_values, (torch.Tensor, list)): + self, **kwargs: object) -> Optional[FuyuImagePatchInputs]: + image_patches = kwargs.pop("image_patches", None) + if image_patches is not None: + if not isinstance(image_patches, (torch.Tensor, list)): raise ValueError("Incorrect type of image patches. " - f"Got type: {type(pixel_values)}") + f"Got type: {type(image_patches)}") - return FuyuImagePixelInputs( - type="pixel_values", + image_patches_flat = flatten_bn(image_patches) + + return FuyuImagePatchInputs( + type="image_patches", data=self._validate_pixel_values( - flatten_bn(pixel_values, concat=True)), + flatten_bn(image_patches_flat, concat=True)), + patches_per_image=[x.size(0) for x in image_patches_flat], ) return None def _process_image_input( - self, image_input: FuyuImagePixelInputs) -> torch.Tensor: + self, image_input: FuyuImagePatchInputs) -> NestedTensors: + image_patches = image_input["data"] + patches_per_image = image_input["patches_per_image"] assert self.vision_embed_tokens is not None - vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) - return vision_embeddings + vision_embeddings, _ = self.vision_embed_tokens(image_patches) + return vision_embeddings.split(patches_per_image, dim=0) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index e430a158d869a..4e42a4b6f9e64 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -69,7 +69,8 @@ def forward(self, patch_attention_mask: torch.BoolTensor, tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor: batch_size, _, max_im_h, max_im_w = pixel_values.shape - patch_embeds = self.patch_embedding(pixel_values) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(target_dtype)) embeddings = patch_embeds.flatten(2).transpose(1, 2) max_nb_patches_h, max_nb_patches_w = ( max_im_h // self.patch_size, @@ -309,7 +310,8 @@ def forward( hidden_states = self.embeddings( pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, - tgt_sizes=tgt_sizes) + tgt_sizes=tgt_sizes, + ) encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 1d6ee2a0be72e..34dc7fa31ce6f 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -144,8 +144,8 @@ def _call_hf_processor( # Original output: (1, num_images, C, H, W) # New output: (num_images, C, H, W) assert (isinstance(pixel_values, list) - and len(pixel_values) == 1 - and isinstance(pixel_values[0], list) + and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) and len(pixel_values[0]) == len(images)) processed_outputs["pixel_values"] = pixel_values[0] diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index a39f2f4124d05..5e70c11363c83 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -528,10 +528,8 @@ def _process_image_pixels( stacked_image_features = self._image_pixels_to_features( self.vision_tower, stacked_pixel_values) - return [ - self.multi_modal_projector(image_features) for image_features in - torch.split(stacked_image_features, num_patches_per_batch) - ] + return torch.split(self.multi_modal_projector(stacked_image_features), + num_patches_per_batch) def _process_image_input( self, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 22d29f5bbc50c..2bce13792a88d 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,8 +1,8 @@ +import math from dataclasses import dataclass, fields from functools import cached_property from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union -import numpy import torch import torch.nn as nn import torch.nn.functional as F @@ -306,7 +306,7 @@ def _parse_and_validate_image_input( images: Optional[Union[List[List[torch.Tensor]], List[torch.Tensor], torch.Tensor]] = None, image_tokens: Optional[torch.Tensor] = None, - ) -> Optional[List[torch.Tensor]]: + ) -> Tuple[Optional[List[torch.Tensor]], Optional[torch.Tensor]]: if images is None: return None, None @@ -604,11 +604,11 @@ def max_patches_per_side(self) -> int: return self.args.image_size // self.args.patch_size @property - def device(self) -> torch.device: + def device(self) -> torch.types.Device: return next(self.parameters()).device @property - def dtype(self) -> torch.device: + def dtype(self) -> torch.dtype: return next(self.parameters()).dtype @property @@ -741,8 +741,8 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, ratio = max(image_width / max_width, image_height / max_height) if ratio > 1: - image_width = int(numpy.ceil(image_width / ratio)) - image_height = int(numpy.ceil(image_height / ratio)) + image_width = int(math.ceil(image_width / ratio)) + image_height = int(math.ceil(image_height / ratio)) num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens( (image_height, image_width), diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index e3d43b017f894..de55bc6bcc123 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -23,7 +23,6 @@ from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn from transformers import BatchFeature @@ -177,16 +176,19 @@ def _get_dummy_mm_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts.get("audio", 0) - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|AUDIO|>" * audio_count, - mm_data=data, + prompt_text="<|AUDIO|>" * num_audios, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 6181fe3dd13d8..1e485f87bb7a4 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -29,7 +29,6 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange, repeat -from PIL import Image from transformers import BatchFeature from transformers.models.qwen2_vl import (Qwen2VLImageProcessor, Qwen2VLProcessor) @@ -882,12 +881,10 @@ def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) hf_processor = self._get_hf_processor() - image_token: str = hf_processor.image_token image_processor = _get_image_processor(hf_processor) - data = {} + image_token: str = hf_processor.image_token resized_height, resized_width = smart_resize( height=9999999, width=9999999, @@ -895,14 +892,18 @@ def _get_dummy_mm_inputs( min_pixels=image_processor.min_pixels, max_pixels=image_processor.max_pixels, ) + num_images = mm_counts.get("image", 0) - dummy_image = Image.new("RGB", (resized_width, resized_height), - color=0) - data["image"] = [dummy_image] * num_images + mm_data = { + "image": + self._get_dummy_images(width=resized_width, + height=resized_height, + num_images=num_images) + } return ProcessorInputs( prompt_text=image_token * num_images, - mm_data=data, + mm_data=mm_data, ) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 7e853e5b90096..54be7fed3f2be 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -188,16 +188,19 @@ def _get_dummy_mm_inputs( mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() + sampling_rate = feature_extractor.sampling_rate audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) - audio_count = mm_counts.get("audio", 0) - audio = np.zeros(audio_len) - data = {"audio": [audio] * audio_count} + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } return ProcessorInputs( - prompt_text="<|audio|>" * audio_count, - mm_data=data, + prompt_text="<|audio|>" * num_audios, + mm_data=mm_data, ) diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 180489166b407..7712c3bcebe20 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,15 +1,17 @@ import pickle import re from abc import ABC, abstractmethod +from collections import defaultdict from collections.abc import Callable, ItemsView, Iterable, Mapping, Sequence from dataclasses import dataclass, field from functools import lru_cache from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union import numpy as np +import numpy.typing as npt import torch from blake3 import blake3 -from PIL.Image import Image +from PIL import Image from transformers import BatchFeature, ProcessorMixin from vllm.inputs import DummyData, InputProcessingContext @@ -353,13 +355,13 @@ def _replace_matches( ) -> list[_S]: out_seqs = list[_S]() prev_end_idx = 0 - next_idx_by_modality = {modality: 0 for modality in mm_item_counts} + next_idx_by_modality = defaultdict[str, int](lambda: 0) for match in _resolve_matches(prompt, matches): modality = match.modality item_idx = next_idx_by_modality[modality] - if item_idx >= mm_item_counts[modality]: + if item_idx >= mm_item_counts.get(modality, 0): continue start_idx = match.start_idx @@ -513,7 +515,7 @@ def _serialize_item(self, obj: object) -> bytes: return obj.encode("utf-8") if isinstance(obj, bytes): return obj - if isinstance(obj, Image): + if isinstance(obj, Image.Image): return obj.tobytes() # Convertible to NumPy arrays @@ -673,10 +675,14 @@ def _get_prompt_replacements( Given the original multi-modal items for this modality and HF-processed data, output the replacements to perform. - Note: - Even when the HF processor already performs replacement for us, - we still use this replacement information to determine - the placeholder token positions for each multi-modal item. + Notes: + - You should not assume that HF processor always performs prompt + replacement: in :meth:`_apply_hf_processor_missing`, this method + is called on text-only and multimodal-only inputs separately, + instead of passing them in the same call. + - The replacement information returned by this method is also used + to determine the placeholder token positions for each multi-modal + item. """ raise NotImplementedError @@ -710,6 +716,10 @@ def _call_hf_processor( mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], ) -> BatchFeature: + """ + Call the HF processor on the prompt text and + associated multi-modal data. + """ return self.ctx.call_hf_processor( self._get_hf_processor(**mm_kwargs), dict(text=prompt, **mm_data), @@ -723,7 +733,8 @@ def _apply_hf_processor( hf_processor_mm_kwargs: Mapping[str, object], ) -> tuple[list[int], MultiModalKwargs]: """ - Apply the HF processor on the full prompt text and multi-modal data. + Wrapper of :meth:`_call_hf_processor` that applies + additional pre-processing and post-processing. """ processor_data, passthrough_data = self._get_hf_mm_data(mm_items) @@ -754,10 +765,11 @@ def _apply_hf_processor_missing( Apply the HF processor on the full prompt text, but only on the multi-modal data that are missing from the cache. - Note: We pass prompt text and multi-modal data into the HF processor - in separate calls to avoid HF prompt replacement being done for - cached items; instead, we rely on our own prompt replacement logic - for the full text. + Note: + We pass prompt text and multi-modal data into the HF processor + in separate calls to avoid HF prompt replacement being done for + cached items; instead, we rely on our own prompt replacement logic + (:meth:`_get_prompt_replacements`) for the full text. """ mm_missing_counts = mm_missing_data_items.get_all_counts() @@ -1010,6 +1022,36 @@ def apply( mm_placeholders=mm_placeholders, ) + def _get_dummy_audios( + self, + *, + length: int, + num_audios: int, + ) -> list[npt.NDArray]: + audio = np.zeros((length, )) + return [audio] * num_audios + + def _get_dummy_images( + self, + *, + width: int, + height: int, + num_images: int, + ) -> list[Image.Image]: + image = Image.new("RGB", (width, height), color=0) + return [image] * num_images + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[npt.NDArray]: + video = np.zeros((num_frames, width, height, 3)) + return [video] * num_videos + @abstractmethod def _get_dummy_mm_inputs( self, diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 87b12a6fb33c1..7b6ded6a27084 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -400,15 +400,19 @@ def repeat_and_pad_placeholder_tokens( placeholder_token_idx = 0 for i, token in enumerate(prompt_token_ids): if token == placeholder_token_id: + curr_repeat_count = repeat_count[placeholder_token_idx] replacement_ids = repeat_and_pad_token( placeholder_token_id, - repeat_count=repeat_count[placeholder_token_idx], + repeat_count=curr_repeat_count, pad_token_left=pad_token_left, pad_token_right=pad_token_right, ) + offset = len(new_token_ids) + if pad_token_left is not None: + offset += 1 placeholder_ranges.append({ - "offset": len(new_token_ids), - "length": len(replacement_ids) + "offset": offset, + "length": curr_repeat_count, }) new_token_ids.extend(replacement_ids) placeholder_token_idx += 1 diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 509771b7e2e5a..a08a86d4007dc 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -647,10 +647,23 @@ def profile_run(self) -> None: self.mm_registry.get_max_tokens_per_item_by_modality( self.model_config).values()) - max_num_mm_items = min( + max_num_mm_items_encoder_budget = min( self.max_num_encoder_input_tokens, self.encoder_cache_size) // max_tokens_per_mm_item + max_mm_items_per_req = max( + self.mm_registry.get_mm_limits_per_prompt( + self.model_config).values()) + + # NOTE: We do not consider max_num_batched_tokens on purpose + # because the multimodal embeddings can be generated in advance + # and chunked prefilled. + max_num_mm_items_decoder_budget = self.max_num_reqs * \ + max_mm_items_per_req + + max_num_mm_items = min(max_num_mm_items_encoder_budget, + max_num_mm_items_decoder_budget) + # Dummy data definition in V0 may contain multiple multimodal items # (e.g, multiple images) for a single request, therefore here we # always replicate first item by max_num_mm_items times since in V1 From 0c6f9985547d6b510d34c6c873db54abe03eb346 Mon Sep 17 00:00:00 2001 From: Yihua Cheng Date: Tue, 31 Dec 2024 18:10:55 -0600 Subject: [PATCH 040/462] [Benchmark] Add benchmark script for CPU offloading (#11533) Signed-off-by: ApostaC Co-authored-by: KuntaiDu --- .../benchmark_long_document_qa_throughput.py | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 benchmarks/benchmark_long_document_qa_throughput.py diff --git a/benchmarks/benchmark_long_document_qa_throughput.py b/benchmarks/benchmark_long_document_qa_throughput.py new file mode 100644 index 0000000000000..13477ef535e86 --- /dev/null +++ b/benchmarks/benchmark_long_document_qa_throughput.py @@ -0,0 +1,184 @@ +""" +Offline benchmark to test the long document QA throughput. + +Example usage: + # This command run the vllm with 50GB CPU memory for offloading + # The workload samples 8 different prompts with a default input + # length of 20000 tokens, then replicates each prompt 2 times + # in random order. + python benchmark_long_document_qa_throughput.py \ + --model meta-llama/Llama-2-7b-chat-hf \ + --enable-prefix-caching \ + --num-documents 8 \ + --repeat-count 2 + +Commandline arguments: + --num-documents: The number of documents to sample prompts from. + + --document-length: The length of each document in tokens. + (Optional, default: 20000) + + --output-len: The number of tokens to generate for each prompt. + (Optional, default: 10) + + --repeat-count: The number of times to repeat each prompt. + (Optional, default: 2) + + --repeat-mode: The mode to repeat prompts. The supported modes are: + - 'random': shuffle the prompts randomly. (Default) + - 'tile': the entire prompt list is repeated in sequence. (Potentially + lowest cache hit) + - 'interleave': each prompt is repeated consecutively before + moving to the next element. (Highest cache hit) + + --shuffle-seed: Random seed when the repeat mode is "random". + (Optional, default: 0) + +In the meantime, it also supports all the vLLM engine args to initialize the +LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more +details. +""" + +import dataclasses +import random +import time + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser + + +def test_long_document_qa(llm=None, sampling_params=None, prompts=None): + """ + Test long document QA with the given prompts and sampling parameters. + Print the time spent in processing all the prompts. + + Args: + llm: The language model used for generating responses. + sampling_params: Sampling parameter used to generate the response. + prompts: A list of prompt strings to be processed by the LLM. + """ + start_time = time.time() + llm.generate(prompts, sampling_params=sampling_params) + end_time = time.time() + print(f"Time to execute all requests: {end_time - start_time:.4f} secs") + + +def repeat_prompts(prompts, repeat_count, mode: str): + """ + Repeat each prompt in the list for a specified number of times. + The order of prompts in the output list depends on the mode. + + Args: + prompts: A list of prompts to be repeated. + repeat_count: The number of times each prompt is repeated. + mode: The mode of repetition. Supported modes are: + - 'random': Shuffle the prompts randomly after repetition. + - 'tile': Repeat the entire prompt list in sequence. + Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3]. + - 'interleave': Repeat each prompt consecutively before moving to + the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3]. + + Returns: + A list of repeated prompts in the specified order. + + Raises: + ValueError: If an invalid mode is provided. + """ + print("Repeat mode: ", mode) + if mode == 'random': + repeated_prompts = prompts * repeat_count + random.shuffle(repeated_prompts) + return repeated_prompts + elif mode == 'tile': + return prompts * repeat_count + elif mode == 'interleave': + repeated_prompts = [] + for prompt in prompts: + repeated_prompts.extend([prompt] * repeat_count) + return repeated_prompts + else: + raise ValueError(f"Invalid mode: {mode}, only support " + "'random', 'tile', 'interleave'") + + +def main(args): + random.seed(args.shuffle_seed) + + # Prepare the prompts: + # we append the document id at the beginning to avoid any of the document + # being the prefix of other documents + prompts = [ + str(i) + ' '.join(['hi'] * args.document_length) + for i in range(args.num_documents) + ] + + prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode) + + warmup_prompts = [ + "This is warm up request " + str(i) + \ + ' '.join(['hi'] * args.document_length) + for i in range(args.num_documents)] + + # Create the LLM engine + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) + + print("------warm up------") + test_long_document_qa( + llm=llm, + prompts=warmup_prompts, + sampling_params=sampling_params, + ) + + print("------start generating------") + test_long_document_qa( + llm=llm, + prompts=prompts, + sampling_params=sampling_params, + ) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description= + 'Benchmark the performance with or without automatic prefix caching.') + + parser.add_argument( + '--document-length', + type=int, + # Roughly the number of tokens for a system paper, + # excluding images + default=20000, + help='Range of input lengths for sampling prompts,' + 'specified as "min:max" (e.g., "128:256").') + + parser.add_argument('--num-documents', + type=int, + default=8, + help='Range of input lengths for sampling prompts,' + 'specified as "min:max" (e.g., "128:256").') + + parser.add_argument('--output-len', type=int, default=10) + + parser.add_argument('--repeat-count', + type=int, + default=2, + help='Number of times to repeat each prompt') + + parser.add_argument("--repeat-mode", + type=str, + default='random', + help='The mode to repeat prompts. The supported ' + 'modes are "random", "tile", and "interleave". ' + 'See repeat_prompts() in the source code for details.') + + parser.add_argument("--shuffle-seed", + type=int, + default=0, + help='Random seed when the repeat mode is "random"') + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) From 4db72e57f6e8da5e78285e9868e9327167bea973 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Tue, 31 Dec 2024 18:21:51 -0800 Subject: [PATCH 041/462] [Bugfix][Refactor] Unify model management in frontend (#11660) Signed-off-by: Joe Runde --- tests/entrypoints/openai/test_cli_args.py | 2 +- tests/entrypoints/openai/test_lora_lineage.py | 32 ++- tests/entrypoints/openai/test_serving_chat.py | 20 +- ...rving_engine.py => test_serving_models.py} | 66 +++--- vllm/entrypoints/openai/api_server.py | 62 +++--- vllm/entrypoints/openai/cli_args.py | 2 +- vllm/entrypoints/openai/run_batch.py | 15 +- vllm/entrypoints/openai/serving_chat.py | 16 +- vllm/entrypoints/openai/serving_completion.py | 16 +- vllm/entrypoints/openai/serving_embedding.py | 9 +- vllm/entrypoints/openai/serving_engine.py | 192 ++-------------- vllm/entrypoints/openai/serving_models.py | 210 ++++++++++++++++++ vllm/entrypoints/openai/serving_pooling.py | 9 +- vllm/entrypoints/openai/serving_score.py | 9 +- .../openai/serving_tokenization.py | 12 +- 15 files changed, 365 insertions(+), 307 deletions(-) rename tests/entrypoints/openai/{test_serving_engine.py => test_serving_models.py} (61%) create mode 100644 vllm/entrypoints/openai/serving_models.py diff --git a/tests/entrypoints/openai/test_cli_args.py b/tests/entrypoints/openai/test_cli_args.py index 45e6980a94630..e49562ad6a21f 100644 --- a/tests/entrypoints/openai/test_cli_args.py +++ b/tests/entrypoints/openai/test_cli_args.py @@ -4,7 +4,7 @@ from vllm.entrypoints.openai.cli_args import (make_arg_parser, validate_parsed_serve_args) -from vllm.entrypoints.openai.serving_engine import LoRAModulePath +from vllm.entrypoints.openai.serving_models import LoRAModulePath from vllm.utils import FlexibleArgumentParser from ...utils import VLLM_PATH diff --git a/tests/entrypoints/openai/test_lora_lineage.py b/tests/entrypoints/openai/test_lora_lineage.py index ab39684c2f31a..ce4f85c13fff9 100644 --- a/tests/entrypoints/openai/test_lora_lineage.py +++ b/tests/entrypoints/openai/test_lora_lineage.py @@ -55,7 +55,10 @@ def server_with_lora_modules_json(zephyr_lora_files): "64", ] - with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + # Enable the /v1/load_lora_adapter endpoint + envs = {"VLLM_ALLOW_RUNTIME_LORA_UPDATING": "True"} + + with RemoteOpenAIServer(MODEL_NAME, args, env_dict=envs) as remote_server: yield remote_server @@ -67,8 +70,8 @@ async def client_for_lora_lineage(server_with_lora_modules_json): @pytest.mark.asyncio -async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, - zephyr_lora_files): +async def test_static_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, + zephyr_lora_files): models = await client_for_lora_lineage.models.list() models = models.data served_model = models[0] @@ -81,3 +84,26 @@ async def test_check_lora_lineage(client_for_lora_lineage: openai.AsyncOpenAI, assert all(lora_model.parent == MODEL_NAME for lora_model in lora_models) assert lora_models[0].id == "zephyr-lora" assert lora_models[1].id == "zephyr-lora2" + + +@pytest.mark.asyncio +async def test_dynamic_lora_lineage( + client_for_lora_lineage: openai.AsyncOpenAI, zephyr_lora_files): + + response = await client_for_lora_lineage.post("load_lora_adapter", + cast_to=str, + body={ + "lora_name": + "zephyr-lora-3", + "lora_path": + zephyr_lora_files + }) + # Ensure adapter loads before querying /models + assert "success" in response + + models = await client_for_lora_lineage.models.list() + models = models.data + dynamic_lora_model = models[-1] + assert dynamic_lora_model.root == zephyr_lora_files + assert dynamic_lora_model.parent == MODEL_NAME + assert dynamic_lora_model.id == "zephyr-lora-3" diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 61677b65af342..97248f1150979 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -8,7 +8,8 @@ from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.entrypoints.openai.serving_chat import OpenAIServingChat -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.transformers_utils.tokenizer import get_tokenizer MODEL_NAME = "openai-community/gpt2" @@ -50,14 +51,13 @@ async def _async_serving_chat_init(): engine = MockEngine() model_config = await engine.get_model_config() + models = OpenAIServingModels(model_config, BASE_MODEL_PATHS) serving_completion = OpenAIServingChat(engine, model_config, - BASE_MODEL_PATHS, + models, response_role="assistant", chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", - lora_modules=None, - prompt_adapters=None, request_logger=None) return serving_completion @@ -72,14 +72,14 @@ def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False + models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + model_config=MockModelConfig()) serving_chat = OpenAIServingChat(mock_engine, MockModelConfig(), - BASE_MODEL_PATHS, + models, response_role="assistant", chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", - lora_modules=None, - prompt_adapters=None, request_logger=None) req = ChatCompletionRequest( model=MODEL_NAME, @@ -115,14 +115,14 @@ def test_serving_chat_could_load_correct_generation_config(): mock_engine.errored = False # Initialize the serving chat + models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config) serving_chat = OpenAIServingChat(mock_engine, mock_model_config, - BASE_MODEL_PATHS, + models, response_role="assistant", chat_template=CHAT_TEMPLATE, chat_template_content_format="auto", - lora_modules=None, - prompt_adapters=None, request_logger=None) req = ChatCompletionRequest( model=MODEL_NAME, diff --git a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_models.py similarity index 61% rename from tests/entrypoints/openai/test_serving_engine.py rename to tests/entrypoints/openai/test_serving_models.py index 096ab6fa0ac09..96897dc730da2 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -4,11 +4,11 @@ import pytest from vllm.config import ModelConfig -from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import (ErrorResponse, LoadLoraAdapterRequest, UnloadLoraAdapterRequest) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.lora.request import LoRARequest MODEL_NAME = "meta-llama/Llama-2-7b" @@ -19,47 +19,45 @@ "Success: LoRA adapter '{lora_name}' removed successfully.") -async def _async_serving_engine_init(): - mock_engine_client = MagicMock(spec=EngineClient) +async def _async_serving_models_init() -> OpenAIServingModels: mock_model_config = MagicMock(spec=ModelConfig) # Set the max_model_len attribute to avoid missing attribute mock_model_config.max_model_len = 2048 - serving_engine = OpenAIServing(mock_engine_client, - mock_model_config, - BASE_MODEL_PATHS, - lora_modules=None, - prompt_adapters=None, - request_logger=None) - return serving_engine + serving_models = OpenAIServingModels(base_model_paths=BASE_MODEL_PATHS, + model_config=mock_model_config, + lora_modules=None, + prompt_adapters=None) + + return serving_models @pytest.mark.asyncio async def test_serving_model_name(): - serving_engine = await _async_serving_engine_init() - assert serving_engine._get_model_name(None) == MODEL_NAME + serving_models = await _async_serving_models_init() + assert serving_models.model_name(None) == MODEL_NAME request = LoRARequest(lora_name="adapter", lora_path="/path/to/adapter2", lora_int_id=1) - assert serving_engine._get_model_name(request) == request.lora_name + assert serving_models.model_name(request) == request.lora_name @pytest.mark.asyncio async def test_load_lora_adapter_success(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="adapter", lora_path="/path/to/adapter2") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert response == LORA_LOADING_SUCCESS_MESSAGE.format(lora_name='adapter') - assert len(serving_engine.lora_requests) == 1 - assert serving_engine.lora_requests[0].lora_name == "adapter" + assert len(serving_models.lora_requests) == 1 + assert serving_models.lora_requests[0].lora_name == "adapter" @pytest.mark.asyncio async def test_load_lora_adapter_missing_fields(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="", lora_path="") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST @@ -67,43 +65,43 @@ async def test_load_lora_adapter_missing_fields(): @pytest.mark.asyncio async def test_load_lora_adapter_duplicate(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert response == LORA_LOADING_SUCCESS_MESSAGE.format( lora_name='adapter1') - assert len(serving_engine.lora_requests) == 1 + assert len(serving_models.lora_requests) == 1 request = LoadLoraAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1") - response = await serving_engine.load_lora_adapter(request) + response = await serving_models.load_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST - assert len(serving_engine.lora_requests) == 1 + assert len(serving_models.lora_requests) == 1 @pytest.mark.asyncio async def test_unload_lora_adapter_success(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = LoadLoraAdapterRequest(lora_name="adapter1", lora_path="/path/to/adapter1") - response = await serving_engine.load_lora_adapter(request) - assert len(serving_engine.lora_requests) == 1 + response = await serving_models.load_lora_adapter(request) + assert len(serving_models.lora_requests) == 1 request = UnloadLoraAdapterRequest(lora_name="adapter1") - response = await serving_engine.unload_lora_adapter(request) + response = await serving_models.unload_lora_adapter(request) assert response == LORA_UNLOADING_SUCCESS_MESSAGE.format( lora_name='adapter1') - assert len(serving_engine.lora_requests) == 0 + assert len(serving_models.lora_requests) == 0 @pytest.mark.asyncio async def test_unload_lora_adapter_missing_fields(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = UnloadLoraAdapterRequest(lora_name="", lora_int_id=None) - response = await serving_engine.unload_lora_adapter(request) + response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST @@ -111,9 +109,9 @@ async def test_unload_lora_adapter_missing_fields(): @pytest.mark.asyncio async def test_unload_lora_adapter_not_found(): - serving_engine = await _async_serving_engine_init() + serving_models = await _async_serving_models_init() request = UnloadLoraAdapterRequest(lora_name="nonexistent_adapter") - response = await serving_engine.unload_lora_adapter(request) + response = await serving_models.unload_lora_adapter(request) assert isinstance(response, ErrorResponse) assert response.type == "InvalidUserInput" assert response.code == HTTPStatus.BAD_REQUEST diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index bac72d87376da..74fe378fdae42 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -58,7 +58,9 @@ from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.entrypoints.openai.serving_pooling import OpenAIServingPooling from vllm.entrypoints.openai.serving_score import OpenAIServingScores from vllm.entrypoints.openai.serving_tokenization import ( @@ -269,6 +271,10 @@ def base(request: Request) -> OpenAIServing: return tokenization(request) +def models(request: Request) -> OpenAIServingModels: + return request.app.state.openai_serving_models + + def chat(request: Request) -> Optional[OpenAIServingChat]: return request.app.state.openai_serving_chat @@ -336,10 +342,10 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request): @router.get("/v1/models") async def show_available_models(raw_request: Request): - handler = base(raw_request) + handler = models(raw_request) - models = await handler.show_available_models() - return JSONResponse(content=models.model_dump()) + models_ = await handler.show_available_models() + return JSONResponse(content=models_.model_dump()) @router.get("/version") @@ -505,26 +511,22 @@ async def stop_profile(raw_request: Request): @router.post("/v1/load_lora_adapter") async def load_lora_adapter(request: LoadLoraAdapterRequest, raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.load_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + handler = models(raw_request) + response = await handler.load_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @router.post("/v1/unload_lora_adapter") async def unload_lora_adapter(request: UnloadLoraAdapterRequest, raw_request: Request): - for route in [chat, completion, embedding]: - handler = route(raw_request) - if handler is not None: - response = await handler.unload_lora_adapter(request) - if isinstance(response, ErrorResponse): - return JSONResponse(content=response.model_dump(), - status_code=response.code) + handler = models(raw_request) + response = await handler.unload_lora_adapter(request) + if isinstance(response, ErrorResponse): + return JSONResponse(content=response.model_dump(), + status_code=response.code) return Response(status_code=200, content=response) @@ -628,13 +630,18 @@ def init_app_state( resolved_chat_template = load_chat_template(args.chat_template) logger.info("Using supplied chat template:\n%s", resolved_chat_template) + state.openai_serving_models = OpenAIServingModels( + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=args.lora_modules, + prompt_adapters=args.prompt_adapters, + ) + # TODO: The chat template is now broken for lora adapters :( state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, - base_model_paths, + state.openai_serving_models, args.response_role, - lora_modules=args.lora_modules, - prompt_adapters=args.prompt_adapters, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -646,16 +653,14 @@ def init_app_state( state.openai_serving_completion = OpenAIServingCompletion( engine_client, model_config, - base_model_paths, - lora_modules=args.lora_modules, - prompt_adapters=args.prompt_adapters, + state.openai_serving_models, request_logger=request_logger, return_tokens_as_token_ids=args.return_tokens_as_token_ids, ) if model_config.runner_type == "generate" else None state.openai_serving_pooling = OpenAIServingPooling( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -663,7 +668,7 @@ def init_app_state( state.openai_serving_embedding = OpenAIServingEmbedding( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, @@ -671,14 +676,13 @@ def init_app_state( state.openai_serving_scores = OpenAIServingScores( engine_client, model_config, - base_model_paths, + state.openai_serving_models, request_logger=request_logger ) if model_config.task == "score" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, model_config, - base_model_paths, - lora_modules=args.lora_modules, + state.openai_serving_models, request_logger=request_logger, chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 908f8c3532c9e..22206ef8dbfe6 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -12,7 +12,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, validate_chat_template) -from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, +from vllm.entrypoints.openai.serving_models import (LoRAModulePath, PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.utils import FlexibleArgumentParser diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 572ed27b39083..822c0f5f7c211 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -20,7 +20,8 @@ # yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding -from vllm.entrypoints.openai.serving_engine import BaseModelPath +from vllm.entrypoints.openai.serving_models import (BaseModelPath, + OpenAIServingModels) from vllm.usage.usage_lib import UsageContext from vllm.utils import FlexibleArgumentParser, random_uuid from vllm.version import __version__ as VLLM_VERSION @@ -213,13 +214,17 @@ async def main(args): request_logger = RequestLogger(max_log_len=args.max_log_len) # Create the openai serving objects. + openai_serving_models = OpenAIServingModels( + model_config=model_config, + base_model_paths=base_model_paths, + lora_modules=None, + prompt_adapters=None, + ) openai_serving_chat = OpenAIServingChat( engine, model_config, - base_model_paths, + openai_serving_models, args.response_role, - lora_modules=None, - prompt_adapters=None, request_logger=request_logger, chat_template=None, chat_template_content_format="auto", @@ -228,7 +233,7 @@ async def main(args): openai_serving_embedding = OpenAIServingEmbedding( engine, model_config, - base_model_paths, + openai_serving_models, request_logger=request_logger, chat_template=None, chat_template_content_format="auto", diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index d085333563d19..9ba5eeb7709c9 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -21,10 +21,8 @@ ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, RequestResponseMetadata, ToolCall, UsageInfo) -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - LoRAModulePath, - OpenAIServing, - PromptAdapterPath) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager from vllm.logger import init_logger from vllm.outputs import CompletionOutput, RequestOutput @@ -42,11 +40,9 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, response_role: str, *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, @@ -57,9 +53,7 @@ def __init__( ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - prompt_adapters=prompt_adapters, + models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids) @@ -126,7 +120,7 @@ async def create_chat_completion( prompt_adapter_request, ) = self._maybe_get_adapters(request) - model_name = self._get_model_name(lora_request) + model_name = self.models.model_name(lora_request) tokenizer = await self.engine_client.get_tokenizer(lora_request) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index aaad7b8c7f44c..17197dce8da23 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -21,10 +21,8 @@ RequestResponseMetadata, UsageInfo) # yapf: enable -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - LoRAModulePath, - OpenAIServing, - PromptAdapterPath) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.sampling_params import BeamSearchParams, SamplingParams @@ -41,18 +39,14 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, ): super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - prompt_adapters=prompt_adapters, + models=models, request_logger=request_logger, return_tokens_as_token_ids=return_tokens_as_token_ids) diff_sampling_param = self.model_config.get_diff_sampling_param() @@ -170,7 +164,7 @@ async def create_completion( result_generator = merge_async_iterators(*generators) - model_name = self._get_model_name(lora_request) + model_name = self.models.model_name(lora_request) num_prompts = len(engine_prompts) # Similar to the OpenAI API, when n != best_of, we do not stream the diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index b8fb9d6bd77f2..e7116a3d95d10 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -16,7 +16,8 @@ EmbeddingResponse, EmbeddingResponseData, ErrorResponse, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import (EmbeddingOutput, EmbeddingRequestOutput, PoolingRequestOutput) @@ -46,7 +47,7 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], chat_template: Optional[str], @@ -54,9 +55,7 @@ def __init__( ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, + models=models, request_logger=request_logger) self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 5b6a089e4c319..319f869240036 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -1,7 +1,5 @@ import json -import pathlib from concurrent.futures.thread import ThreadPoolExecutor -from dataclasses import dataclass from http import HTTPStatus from typing import (Any, Callable, Dict, Iterable, Iterator, List, Mapping, Optional, Sequence, Tuple, TypedDict, Union) @@ -28,13 +26,10 @@ DetokenizeRequest, EmbeddingChatRequest, EmbeddingCompletionRequest, - ErrorResponse, - LoadLoraAdapterRequest, - ModelCard, ModelList, - ModelPermission, ScoreRequest, + ErrorResponse, ScoreRequest, TokenizeChatRequest, - TokenizeCompletionRequest, - UnloadLoraAdapterRequest) + TokenizeCompletionRequest) +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser # yapf: enable from vllm.inputs import TokensPrompt @@ -48,30 +43,10 @@ from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer -from vllm.utils import AtomicCounter, is_list_of, make_async, random_uuid +from vllm.utils import is_list_of, make_async, random_uuid logger = init_logger(__name__) - -@dataclass -class BaseModelPath: - name: str - model_path: str - - -@dataclass -class PromptAdapterPath: - name: str - local_path: str - - -@dataclass -class LoRAModulePath: - name: str - path: str - base_model_name: Optional[str] = None - - CompletionLikeRequest = Union[CompletionRequest, DetokenizeRequest, EmbeddingCompletionRequest, ScoreRequest, TokenizeCompletionRequest] @@ -96,10 +71,8 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], request_logger: Optional[RequestLogger], return_tokens_as_token_ids: bool = False, ): @@ -109,35 +82,7 @@ def __init__( self.model_config = model_config self.max_model_len = model_config.max_model_len - self.base_model_paths = base_model_paths - - self.lora_id_counter = AtomicCounter(0) - self.lora_requests = [] - if lora_modules is not None: - self.lora_requests = [ - LoRARequest(lora_name=lora.name, - lora_int_id=i, - lora_path=lora.path, - base_model_name=lora.base_model_name - if lora.base_model_name - and self._is_model_supported(lora.base_model_name) - else self.base_model_paths[0].name) - for i, lora in enumerate(lora_modules, start=1) - ] - - self.prompt_adapter_requests = [] - if prompt_adapters is not None: - for i, prompt_adapter in enumerate(prompt_adapters, start=1): - with pathlib.Path(prompt_adapter.local_path, - "adapter_config.json").open() as f: - adapter_config = json.load(f) - num_virtual_tokens = adapter_config["num_virtual_tokens"] - self.prompt_adapter_requests.append( - PromptAdapterRequest( - prompt_adapter_name=prompt_adapter.name, - prompt_adapter_id=i, - prompt_adapter_local_path=prompt_adapter.local_path, - prompt_adapter_num_virtual_tokens=num_virtual_tokens)) + self.models = models self.request_logger = request_logger self.return_tokens_as_token_ids = return_tokens_as_token_ids @@ -150,33 +95,6 @@ def __init__( self._tokenize_prompt_input_or_inputs, executor=self._tokenizer_executor) - async def show_available_models(self) -> ModelList: - """Show available models. Right now we only have one model.""" - model_cards = [ - ModelCard(id=base_model.name, - max_model_len=self.max_model_len, - root=base_model.model_path, - permission=[ModelPermission()]) - for base_model in self.base_model_paths - ] - lora_cards = [ - ModelCard(id=lora.lora_name, - root=lora.local_path, - parent=lora.base_model_name if lora.base_model_name else - self.base_model_paths[0].name, - permission=[ModelPermission()]) - for lora in self.lora_requests - ] - prompt_adapter_cards = [ - ModelCard(id=prompt_adapter.prompt_adapter_name, - root=self.base_model_paths[0].name, - permission=[ModelPermission()]) - for prompt_adapter in self.prompt_adapter_requests - ] - model_cards.extend(lora_cards) - model_cards.extend(prompt_adapter_cards) - return ModelList(data=model_cards) - def create_error_response( self, message: str, @@ -205,11 +123,13 @@ async def _check_model( ) -> Optional[ErrorResponse]: if self._is_model_supported(request.model): return None - if request.model in [lora.lora_name for lora in self.lora_requests]: + if request.model in [ + lora.lora_name for lora in self.models.lora_requests + ]: return None if request.model in [ prompt_adapter.prompt_adapter_name - for prompt_adapter in self.prompt_adapter_requests + for prompt_adapter in self.models.prompt_adapter_requests ]: return None return self.create_error_response( @@ -223,10 +143,10 @@ def _maybe_get_adapters( None, PromptAdapterRequest]]: if self._is_model_supported(request.model): return None, None - for lora in self.lora_requests: + for lora in self.models.lora_requests: if request.model == lora.lora_name: return lora, None - for prompt_adapter in self.prompt_adapter_requests: + for prompt_adapter in self.models.prompt_adapter_requests: if request.model == prompt_adapter.prompt_adapter_name: return None, prompt_adapter # if _check_model has been called earlier, this will be unreachable @@ -588,91 +508,5 @@ def _get_decoded_token(logprob: Logprob, return logprob.decoded_token return tokenizer.decode(token_id) - async def _check_load_lora_adapter_request( - self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]: - # Check if both 'lora_name' and 'lora_path' are provided - if not request.lora_name or not request.lora_path: - return self.create_error_response( - message="Both 'lora_name' and 'lora_path' must be provided.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - # Check if the lora adapter with the given name already exists - if any(lora_request.lora_name == request.lora_name - for lora_request in self.lora_requests): - return self.create_error_response( - message= - f"The lora adapter '{request.lora_name}' has already been" - "loaded.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - return None - - async def _check_unload_lora_adapter_request( - self, - request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]: - # Check if either 'lora_name' or 'lora_int_id' is provided - if not request.lora_name and not request.lora_int_id: - return self.create_error_response( - message= - "either 'lora_name' and 'lora_int_id' needs to be provided.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - # Check if the lora adapter with the given name exists - if not any(lora_request.lora_name == request.lora_name - for lora_request in self.lora_requests): - return self.create_error_response( - message= - f"The lora adapter '{request.lora_name}' cannot be found.", - err_type="InvalidUserInput", - status_code=HTTPStatus.BAD_REQUEST) - - return None - - async def load_lora_adapter( - self, - request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: - error_check_ret = await self._check_load_lora_adapter_request(request) - if error_check_ret is not None: - return error_check_ret - - lora_name, lora_path = request.lora_name, request.lora_path - unique_id = self.lora_id_counter.inc(1) - self.lora_requests.append( - LoRARequest(lora_name=lora_name, - lora_int_id=unique_id, - lora_path=lora_path)) - return f"Success: LoRA adapter '{lora_name}' added successfully." - - async def unload_lora_adapter( - self, - request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]: - error_check_ret = await self._check_unload_lora_adapter_request(request - ) - if error_check_ret is not None: - return error_check_ret - - lora_name = request.lora_name - self.lora_requests = [ - lora_request for lora_request in self.lora_requests - if lora_request.lora_name != lora_name - ] - return f"Success: LoRA adapter '{lora_name}' removed successfully." - def _is_model_supported(self, model_name): - return any(model.name == model_name for model in self.base_model_paths) - - def _get_model_name(self, lora: Optional[LoRARequest]): - """ - Returns the appropriate model name depending on the availability - and support of the LoRA or base model. - Parameters: - - lora: LoRARequest that contain a base_model_name. - Returns: - - str: The name of the base model or the first available model path. - """ - if lora is not None: - return lora.lora_name - return self.base_model_paths[0].name + return self.models.is_base_model(model_name) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py new file mode 100644 index 0000000000000..26966896bc272 --- /dev/null +++ b/vllm/entrypoints/openai/serving_models.py @@ -0,0 +1,210 @@ +import json +import pathlib +from dataclasses import dataclass +from http import HTTPStatus +from typing import List, Optional, Union + +from vllm.config import ModelConfig +from vllm.entrypoints.openai.protocol import (ErrorResponse, + LoadLoraAdapterRequest, + ModelCard, ModelList, + ModelPermission, + UnloadLoraAdapterRequest) +from vllm.lora.request import LoRARequest +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.utils import AtomicCounter + + +@dataclass +class BaseModelPath: + name: str + model_path: str + + +@dataclass +class PromptAdapterPath: + name: str + local_path: str + + +@dataclass +class LoRAModulePath: + name: str + path: str + base_model_name: Optional[str] = None + + +class OpenAIServingModels: + """Shared instance to hold data about the loaded base model(s) and adapters. + + Handles the routes: + - /v1/models + - /v1/load_lora_adapter + - /v1/unload_lora_adapter + """ + + def __init__( + self, + model_config: ModelConfig, + base_model_paths: List[BaseModelPath], + *, + lora_modules: Optional[List[LoRAModulePath]] = None, + prompt_adapters: Optional[List[PromptAdapterPath]] = None, + ): + super().__init__() + + self.base_model_paths = base_model_paths + self.max_model_len = model_config.max_model_len + + self.lora_id_counter = AtomicCounter(0) + self.lora_requests = [] + if lora_modules is not None: + self.lora_requests = [ + LoRARequest(lora_name=lora.name, + lora_int_id=i, + lora_path=lora.path, + base_model_name=lora.base_model_name + if lora.base_model_name + and self.is_base_model(lora.base_model_name) else + self.base_model_paths[0].name) + for i, lora in enumerate(lora_modules, start=1) + ] + + self.prompt_adapter_requests = [] + if prompt_adapters is not None: + for i, prompt_adapter in enumerate(prompt_adapters, start=1): + with pathlib.Path(prompt_adapter.local_path, + "adapter_config.json").open() as f: + adapter_config = json.load(f) + num_virtual_tokens = adapter_config["num_virtual_tokens"] + self.prompt_adapter_requests.append( + PromptAdapterRequest( + prompt_adapter_name=prompt_adapter.name, + prompt_adapter_id=i, + prompt_adapter_local_path=prompt_adapter.local_path, + prompt_adapter_num_virtual_tokens=num_virtual_tokens)) + + def is_base_model(self, model_name): + return any(model.name == model_name for model in self.base_model_paths) + + def model_name(self, lora_request: Optional[LoRARequest] = None) -> str: + """Returns the appropriate model name depending on the availability + and support of the LoRA or base model. + Parameters: + - lora: LoRARequest that contain a base_model_name. + Returns: + - str: The name of the base model or the first available model path. + """ + if lora_request is not None: + return lora_request.lora_name + return self.base_model_paths[0].name + + async def show_available_models(self) -> ModelList: + """Show available models. This includes the base model and all + adapters""" + model_cards = [ + ModelCard(id=base_model.name, + max_model_len=self.max_model_len, + root=base_model.model_path, + permission=[ModelPermission()]) + for base_model in self.base_model_paths + ] + lora_cards = [ + ModelCard(id=lora.lora_name, + root=lora.local_path, + parent=lora.base_model_name if lora.base_model_name else + self.base_model_paths[0].name, + permission=[ModelPermission()]) + for lora in self.lora_requests + ] + prompt_adapter_cards = [ + ModelCard(id=prompt_adapter.prompt_adapter_name, + root=self.base_model_paths[0].name, + permission=[ModelPermission()]) + for prompt_adapter in self.prompt_adapter_requests + ] + model_cards.extend(lora_cards) + model_cards.extend(prompt_adapter_cards) + return ModelList(data=model_cards) + + async def load_lora_adapter( + self, + request: LoadLoraAdapterRequest) -> Union[ErrorResponse, str]: + error_check_ret = await self._check_load_lora_adapter_request(request) + if error_check_ret is not None: + return error_check_ret + + lora_name, lora_path = request.lora_name, request.lora_path + unique_id = self.lora_id_counter.inc(1) + self.lora_requests.append( + LoRARequest(lora_name=lora_name, + lora_int_id=unique_id, + lora_path=lora_path)) + return f"Success: LoRA adapter '{lora_name}' added successfully." + + async def unload_lora_adapter( + self, + request: UnloadLoraAdapterRequest) -> Union[ErrorResponse, str]: + error_check_ret = await self._check_unload_lora_adapter_request(request + ) + if error_check_ret is not None: + return error_check_ret + + lora_name = request.lora_name + self.lora_requests = [ + lora_request for lora_request in self.lora_requests + if lora_request.lora_name != lora_name + ] + return f"Success: LoRA adapter '{lora_name}' removed successfully." + + async def _check_load_lora_adapter_request( + self, request: LoadLoraAdapterRequest) -> Optional[ErrorResponse]: + # Check if both 'lora_name' and 'lora_path' are provided + if not request.lora_name or not request.lora_path: + return create_error_response( + message="Both 'lora_name' and 'lora_path' must be provided.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + # Check if the lora adapter with the given name already exists + if any(lora_request.lora_name == request.lora_name + for lora_request in self.lora_requests): + return create_error_response( + message= + f"The lora adapter '{request.lora_name}' has already been" + "loaded.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + return None + + async def _check_unload_lora_adapter_request( + self, + request: UnloadLoraAdapterRequest) -> Optional[ErrorResponse]: + # Check if either 'lora_name' or 'lora_int_id' is provided + if not request.lora_name and not request.lora_int_id: + return create_error_response( + message= + "either 'lora_name' and 'lora_int_id' needs to be provided.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + # Check if the lora adapter with the given name exists + if not any(lora_request.lora_name == request.lora_name + for lora_request in self.lora_requests): + return create_error_response( + message= + f"The lora adapter '{request.lora_name}' cannot be found.", + err_type="InvalidUserInput", + status_code=HTTPStatus.BAD_REQUEST) + + return None + + +def create_error_response( + message: str, + err_type: str = "BadRequestError", + status_code: HTTPStatus = HTTPStatus.BAD_REQUEST) -> ErrorResponse: + return ErrorResponse(message=message, + type=err_type, + code=status_code.value) diff --git a/vllm/entrypoints/openai/serving_pooling.py b/vllm/entrypoints/openai/serving_pooling.py index 01852f0df1eca..5830322071e58 100644 --- a/vllm/entrypoints/openai/serving_pooling.py +++ b/vllm/entrypoints/openai/serving_pooling.py @@ -15,7 +15,8 @@ PoolingChatRequest, PoolingRequest, PoolingResponse, PoolingResponseData, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger from vllm.outputs import PoolingOutput, PoolingRequestOutput from vllm.utils import merge_async_iterators @@ -44,7 +45,7 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], chat_template: Optional[str], @@ -52,9 +53,7 @@ def __init__( ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, + models=models, request_logger=request_logger) self.chat_template = chat_template diff --git a/vllm/entrypoints/openai/serving_score.py b/vllm/entrypoints/openai/serving_score.py index a8a126e697641..5d3e7139d7a17 100644 --- a/vllm/entrypoints/openai/serving_score.py +++ b/vllm/entrypoints/openai/serving_score.py @@ -10,7 +10,8 @@ from vllm.entrypoints.openai.protocol import (ErrorResponse, ScoreRequest, ScoreResponse, ScoreResponseData, UsageInfo) -from vllm.entrypoints.openai.serving_engine import BaseModelPath, OpenAIServing +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.inputs.data import TokensPrompt from vllm.logger import init_logger from vllm.outputs import PoolingRequestOutput, ScoringRequestOutput @@ -50,15 +51,13 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, request_logger: Optional[RequestLogger], ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=None, - prompt_adapters=None, + models=models, request_logger=request_logger) async def create_score( diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 2e849333680d4..b67ecfb01316f 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -15,9 +15,8 @@ TokenizeRequest, TokenizeResponse) # yapf: enable -from vllm.entrypoints.openai.serving_engine import (BaseModelPath, - LoRAModulePath, - OpenAIServing) +from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.logger import init_logger logger = init_logger(__name__) @@ -29,18 +28,15 @@ def __init__( self, engine_client: EngineClient, model_config: ModelConfig, - base_model_paths: List[BaseModelPath], + models: OpenAIServingModels, *, - lora_modules: Optional[List[LoRAModulePath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], chat_template_content_format: ChatTemplateContentFormatOption, ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, - base_model_paths=base_model_paths, - lora_modules=lora_modules, - prompt_adapters=None, + models=models, request_logger=request_logger) self.chat_template = chat_template From 365801feddaf5c4448704a1f55269dd992f5a4b1 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 1 Jan 2025 14:15:21 +0800 Subject: [PATCH 042/462] [VLM] Add max-count checking in data parser for single image models (#11661) Signed-off-by: DarkLight1337 Signed-off-by: Roger Wang Co-authored-by: Roger Wang --- docs/source/models/supported_models.md | 2 +- tests/multimodal/test_processing.py | 3 ++- vllm/model_executor/models/blip2.py | 4 ++++ vllm/model_executor/models/chameleon.py | 4 ++++ vllm/model_executor/models/fuyu.py | 18 +++++++++------- vllm/multimodal/parse.py | 28 +++++++++++++++++++++++-- 6 files changed, 48 insertions(+), 11 deletions(-) diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index f74c201bdff6b..7682ed104b8c5 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -566,7 +566,7 @@ See [this page](#generative-models) for more information on how to use generativ - [V1](gh-issue:8779) * - `AriaForConditionalGeneration` - Aria - - T + I + - T + I+ - `rhymes-ai/Aria` - - ✅︎ diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 81278cde264ff..1850ca46ccc8f 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -622,10 +622,11 @@ def _test_processing_cache_correctness( # yapf: disable +# True if the model supports multiple data items of the modality per request @pytest.mark.parametrize(("model_id", "modalities"), [ ("rhymes-ai/Aria", {"image": True}), ("Salesforce/blip2-opt-2.7b", {"image": False}), - ("facebook/chameleon-7b", {"image": True}), + ("facebook/chameleon-7b", {"image": False}), ("adept/fuyu-8b", {"image": False}), ("llava-hf/llava-1.5-7b-hf", {"image": True}), ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index bf70f5d904f5b..50680fadc4aa3 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -18,6 +18,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -404,6 +405,9 @@ def get_max_blip2_image_tokens(ctx: InputContext): class Blip2MultiModalProcessor(BaseMultiModalProcessor): + def _get_data_parser(self) -> MultiModalDataParser: + return MultiModalDataParser(max_mm_counts={"image": 1}) + def _get_hf_processor(self) -> Blip2Processor: return self.ctx.get_hf_processor(Blip2Processor) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 85fca23b05746..c731934e792fc 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -31,6 +31,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) +from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -60,6 +61,9 @@ def get_max_chameleon_image_tokens(ctx: InputContext): class ChameleonMultiModalProcessor(BaseMultiModalProcessor): + def _get_data_parser(self) -> MultiModalDataParser: + return MultiModalDataParser(max_mm_counts={"image": 1}) + def _get_hf_processor(self) -> ChameleonProcessor: return self.ctx.get_hf_processor(ChameleonProcessor) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 8c14866f20b92..0a48fa3fe11c0 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -34,7 +34,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -54,7 +54,7 @@ class FuyuImagePatchInputs(TypedDict): type: Literal["image_patches"] - data: torch.Tensor + flat_data: torch.Tensor """ Shape: `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)` @@ -63,7 +63,7 @@ class FuyuImagePatchInputs(TypedDict): patches_per_image: List[int] """ List of number of total patches for each image in the batch. - This is used to restore the first two dimensions of `data`. + This is used to restore the first two dimensions of `flat_data`. """ @@ -102,6 +102,9 @@ def get_max_fuyu_image_tokens(ctx: InputContext): class FuyuMultiModalProcessor(BaseMultiModalProcessor): + def _get_data_parser(self) -> MultiModalDataParser: + return MultiModalDataParser(max_mm_counts={"image": 1}) + def _get_hf_processor(self) -> FuyuProcessor: return self.ctx.get_hf_processor(FuyuProcessor) @@ -304,7 +307,7 @@ def _parse_and_validate_image_input( return FuyuImagePatchInputs( type="image_patches", - data=self._validate_pixel_values( + flat_data=self._validate_pixel_values( flatten_bn(image_patches_flat, concat=True)), patches_per_image=[x.size(0) for x in image_patches_flat], ) @@ -313,12 +316,13 @@ def _parse_and_validate_image_input( def _process_image_input( self, image_input: FuyuImagePatchInputs) -> NestedTensors: - image_patches = image_input["data"] + image_patches_flat = image_input["flat_data"] patches_per_image = image_input["patches_per_image"] assert self.vision_embed_tokens is not None - vision_embeddings, _ = self.vision_embed_tokens(image_patches) - return vision_embeddings.split(patches_per_image, dim=0) + vision_embeddings_flat, _ = self.vision_embed_tokens( + image_patches_flat) + return vision_embeddings_flat.split(patches_per_image, dim=0) def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]: image_input = self._parse_and_validate_image_input(**kwargs) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 17a795247372e..da111e999ebb8 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -220,11 +220,24 @@ def get_items( class MultiModalDataParser: """ Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. + + Args: + max_mm_counts (Mapping[str, int]): The maximum allowed number of items + belonging to each modality. This effectively sets a hard limit over + `--limit-mm-per-prompt`. + target_sr (float, optional): Enables automatic resampling of audio + items to the model's expected sampling rate. """ - def __init__(self, *, target_sr: Optional[float] = None) -> None: + def __init__( + self, + *, + max_mm_counts: Mapping[str, int] = {}, + target_sr: Optional[float] = None, + ) -> None: super().__init__() + self.max_mm_counts = max_mm_counts self.target_sr = target_sr def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]: @@ -332,6 +345,7 @@ def _get_subparsers(self) -> Mapping[str, ModalityDataParser]: def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems: + max_mm_counts = self.max_mm_counts subparsers = self._get_subparsers() mm_items = MultiModalDataItems() @@ -339,6 +353,16 @@ def parse_mm_data(self, if k not in subparsers: raise ValueError(f"Unsupported modality: {k}") - mm_items[k] = subparsers[k](v) + modality_items = subparsers[k](v) + + if k in max_mm_counts: + max_count = max_mm_counts[k] + if len(modality_items) > max_count: + raise ValueError( + f"This model supports at most {max_count} {k} items " + f"per prompt, but {len(modality_items)} {k} items " + "were given or set as its limit_mm_per_prompt.") + + mm_items[k] = modality_items return mm_items From 11d8a091c6c775575a53d37408c94faa0b07730f Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 1 Jan 2025 14:42:23 +0800 Subject: [PATCH 043/462] [Misc] Optimize Qwen2-VL LoRA test (#11663) Signed-off-by: Jee Jee Li --- tests/lora/test_qwen2vl.py | 5 ++--- vllm/model_executor/models/qwen2_vl.py | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/tests/lora/test_qwen2vl.py b/tests/lora/test_qwen2vl.py index c9f48402b0268..ebdd129db5f6a 100644 --- a/tests/lora/test_qwen2vl.py +++ b/tests/lora/test_qwen2vl.py @@ -7,7 +7,7 @@ from vllm.lora.request import LoRARequest from vllm.platforms import current_platform -MODEL_PATH = "Qwen/Qwen2-VL-7B-Instruct" +MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct" PROMPT_TEMPLATE = ( "<|im_start|>system\nYou are a helpful assistant.<|im_end|>" @@ -49,10 +49,9 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: # Print the outputs. generated_texts: List[str] = [] for output in outputs: - prompt = output.prompt generated_text = output.outputs[0].text.strip() generated_texts.append(generated_text) - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print(f"Generated text: {generated_text!r}") return generated_texts diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 1e485f87bb7a4..0df101b3dcce4 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -52,6 +52,7 @@ GPTQMarlinConfig) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalFieldConfig, MultiModalKwargs, @@ -926,15 +927,23 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, } # LoRA specific attributes - # TODO Support LoRA for the visual encoder in the future. supported_lora_modules = [ "qkv_proj", "o_proj", "gate_up_proj", "down_proj", + # vision tower + "qkv", + "attn.proj", # Distinguish patch_embed.proj + "fc1", + "fc2", + # projector + "mlp.0", + "mlp.2" ] embedding_modules = {} embedding_padding_modules = [] + # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ "lm_head.": "language_model.lm_head.", @@ -1231,3 +1240,12 @@ def load_weights(self, weights: Iterable[Tuple[str, loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="visual.", + tower_model="visual.merger.") From f962f426bc63b66301da61d2ac7078bf0ba941b0 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Tue, 31 Dec 2024 23:39:30 -0800 Subject: [PATCH 044/462] [Misc] Replace space with - in the file names (#11667) Signed-off-by: Lu Fang --- .github/ISSUE_TEMPLATE/{400-bug report.yml => 400-bug-report.yml} | 0 .../{500-feature request.yml => 500-feature-request.yml} | 0 .github/ISSUE_TEMPLATE/{600-new model.yml => 600-new-model.yml} | 0 ...-performance discussion.yml => 700-performance-discussion.yml} | 0 .../{800-misc discussion.yml => 800-misc-discussion.yml} | 0 5 files changed, 0 insertions(+), 0 deletions(-) rename .github/ISSUE_TEMPLATE/{400-bug report.yml => 400-bug-report.yml} (100%) rename .github/ISSUE_TEMPLATE/{500-feature request.yml => 500-feature-request.yml} (100%) rename .github/ISSUE_TEMPLATE/{600-new model.yml => 600-new-model.yml} (100%) rename .github/ISSUE_TEMPLATE/{700-performance discussion.yml => 700-performance-discussion.yml} (100%) rename .github/ISSUE_TEMPLATE/{800-misc discussion.yml => 800-misc-discussion.yml} (100%) diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/400-bug report.yml rename to .github/ISSUE_TEMPLATE/400-bug-report.yml diff --git a/.github/ISSUE_TEMPLATE/500-feature request.yml b/.github/ISSUE_TEMPLATE/500-feature-request.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/500-feature request.yml rename to .github/ISSUE_TEMPLATE/500-feature-request.yml diff --git a/.github/ISSUE_TEMPLATE/600-new model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/600-new model.yml rename to .github/ISSUE_TEMPLATE/600-new-model.yml diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance-discussion.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/700-performance discussion.yml rename to .github/ISSUE_TEMPLATE/700-performance-discussion.yml diff --git a/.github/ISSUE_TEMPLATE/800-misc discussion.yml b/.github/ISSUE_TEMPLATE/800-misc-discussion.yml similarity index 100% rename from .github/ISSUE_TEMPLATE/800-misc discussion.yml rename to .github/ISSUE_TEMPLATE/800-misc-discussion.yml From 6d70198b17b008f5b845582590b96a507b4d68b5 Mon Sep 17 00:00:00 2001 From: Kazuhiro Serizawa Date: Wed, 1 Jan 2025 17:10:10 +0900 Subject: [PATCH 045/462] [Doc] Fix typo (#11666) Signed-off-by: Kazuhiro Serizawa --- vllm/model_executor/layers/rejection_sampler.py | 2 +- vllm/v1/sample/ops/topk_topp_sampler.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 97a1b0c9603bd..165e8309fee64 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -39,7 +39,7 @@ def __init__(self, strict_mode: Whether or not to perform shape/device/dtype checks during sampling. This catches correctness issues but adds nontrivial latency. - use_falshinfer: We will use this parameter to determine whether + use_flashinfer: We will use this parameter to determine whether to use the FlashInfer rejection sampling kernel or not. If it's None, we will use the default value from the environment variable. This parameter is only used for testing purposes. diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py index c088c3c129ca5..f2007d85c61a5 100644 --- a/vllm/v1/sample/ops/topk_topp_sampler.py +++ b/vllm/v1/sample/ops/topk_topp_sampler.py @@ -44,7 +44,7 @@ def __init__(self): logger.warning( "FlashInfer is not available. Falling back to the PyTorch-" "native implementation of top-p & top-k sampling. For the " - "best performance, please install FalshInfer.") + "best performance, please install FlashInfer.") self.forward = self.forward_native else: self.forward = self.forward_native From 73001445fbfc42d386d68066519738dfffa62df3 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Wed, 1 Jan 2025 21:56:46 +0900 Subject: [PATCH 046/462] [V1] Implement Cascade Attention (#11635) Signed-off-by: Woosuk Kwon --- CMakeLists.txt | 2 +- tests/conftest.py | 7 + tests/kernels/test_cascade_flash_attn.py | 182 +++++++++++++ tests/system_messages/sonnet3.5_nov2024.txt | 71 ++++++ tests/v1/e2e/__init__.py | 0 tests/v1/e2e/test_cascade_attention.py | 22 ++ vllm/v1/attention/backends/flash_attn.py | 267 +++++++++++++++++++- vllm/v1/core/kv_cache_manager.py | 52 +++- vllm/v1/core/scheduler.py | 10 + vllm/v1/worker/gpu_model_runner.py | 96 ++++++- 10 files changed, 693 insertions(+), 16 deletions(-) create mode 100644 tests/kernels/test_cascade_flash_attn.py create mode 100644 tests/system_messages/sonnet3.5_nov2024.txt create mode 100644 tests/v1/e2e/__init__.py create mode 100644 tests/v1/e2e/test_cascade_attention.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 3206d76125545..f4b9c3ec9c14f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -550,7 +550,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb + GIT_TAG 96266b1111111f3d11aabefaf3bacbab6a89d03c GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/tests/conftest.py b/tests/conftest.py index 6e2f75e33654f..917151ddcb8d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -40,6 +40,7 @@ _TEST_DIR = os.path.dirname(__file__) _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] +_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt") _M = TypeVar("_M") _PromptMultiModalInput = Union[List[_M], List[List[_M]]] @@ -177,6 +178,12 @@ def example_prompts() -> List[str]: return prompts +@pytest.fixture +def example_system_message() -> str: + with open(_SYS_MSG) as f: + return f.read() + + class DecoderPromptType(Enum): """For encoder/decoder models only.""" CUSTOM = 1 diff --git a/tests/kernels/test_cascade_flash_attn.py b/tests/kernels/test_cascade_flash_attn.py new file mode 100644 index 0000000000000..45ec6df4e711e --- /dev/null +++ b/tests/kernels/test_cascade_flash_attn.py @@ -0,0 +1,182 @@ +from typing import List, Optional, Tuple + +import pytest +import torch + +from vllm.platforms import current_platform +from vllm.v1.attention.backends.flash_attn import (cascade_attention, + merge_attn_states) +from vllm.vllm_flash_attn import flash_attn_varlen_func + +NUM_HEADS = [(4, 4), (8, 2), (16, 2)] +HEAD_SIZES = [128, 192, 256] +BLOCK_SIZES = [16] +DTYPES = [torch.float16, torch.bfloat16] + + +@pytest.mark.parametrize("num_tokens", [1, 39, 16912]) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@torch.inference_mode() +def test_merge_kernel( + num_tokens: int, + num_heads: Tuple[int, int], + head_size: int, + dtype: torch.dtype, +): + torch.set_default_device("cuda") + current_platform.seed_everything(0) + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + + # Prepare inputs. + prefix_output = torch.randn(num_tokens, + num_query_heads, + head_size, + dtype=dtype) + suffix_output = torch.randn(num_tokens, + num_query_heads, + head_size, + dtype=dtype) + prefix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32) + suffix_lse = torch.randn(num_query_heads, num_tokens, dtype=torch.float32) + + # Run the kernel. + output = torch.empty(num_tokens, num_query_heads, head_size, dtype=dtype) + merge_attn_states(output, prefix_output, prefix_lse, suffix_output, + suffix_lse) + + # Reference implementation. + max_lse = torch.maximum(prefix_lse, suffix_lse) + p_lse = torch.exp(prefix_lse - max_lse) + s_lse = torch.exp(suffix_lse - max_lse) + p_scale = p_lse / (p_lse + s_lse) + s_scale = s_lse / (p_lse + s_lse) + p_scale = p_scale.transpose(0, 1).unsqueeze(2) + s_scale = s_scale.transpose(0, 1).unsqueeze(2) + ref_output = p_scale * prefix_output + s_scale * suffix_output + ref_output = ref_output.to(dtype) + + # Compare the results. + torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2) + + +CASES = [ + # Case 1. A general case. + ([(129, 871), (18, 280), (37, 988), (1023, 2304), (1, 257)], 256), + # Case 2. Flash-decoding case. + ([(1, 1023), (1, 879), (1, 778), (1, 1777)] * 100, 512), +] + + +@pytest.mark.parametrize("seq_lens_and_common_prefix", CASES) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("block_size", BLOCK_SIZES) +@pytest.mark.parametrize("soft_cap", [None, 50]) +@pytest.mark.parametrize("num_blocks", [2048]) +@torch.inference_mode() +def test_cascade( + seq_lens_and_common_prefix: Tuple[List[Tuple[int, int]], int], + num_heads: Tuple[int, int], + head_size: int, + dtype: torch.dtype, + block_size: int, + soft_cap: Optional[float], + num_blocks: int, +) -> None: + torch.set_default_device("cuda") + current_platform.seed_everything(0) + + window_size = (-1, -1) + scale = head_size**-0.5 + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + key_cache = torch.randn(num_blocks, + block_size, + num_kv_heads, + head_size, + dtype=dtype) + value_cache = torch.randn_like(key_cache) + + seq_lens, common_prefix_len = seq_lens_and_common_prefix + num_seqs = len(seq_lens) + query_lens = [x[0] for x in seq_lens] + kv_lens = [x[1] for x in seq_lens] + max_query_len = max(query_lens) + max_kv_len = max(kv_lens) + + total_num_query_tokens = sum(query_lens) + query = torch.randn(total_num_query_tokens, + num_query_heads, + head_size, + dtype=dtype) + cu_query_lens = torch.tensor([0] + query_lens, + dtype=torch.int32).cumsum(dim=0, + dtype=torch.int32) + cu_kv_lens = torch.tensor([0] + kv_lens, + dtype=torch.int32).cumsum(dim=0, + dtype=torch.int32) + max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint(0, + num_blocks, + (num_seqs, max_num_blocks_per_seq), + dtype=torch.int32) + + assert common_prefix_len > 0 + assert common_prefix_len % block_size == 0 + num_common_kv_blocks = common_prefix_len // block_size + # Make sure the first `num_common_kv_blocks` blocks are the same. + block_tables[:, :num_common_kv_blocks] = \ + block_tables[0, :num_common_kv_blocks] + + # Run the regular attention. + ref_output = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=cu_query_lens, + cu_seqlens_k=cu_kv_lens, + max_seqlen_q=max_query_len, + max_seqlen_k=max_kv_len, + softmax_scale=scale, + causal=True, + window_size=window_size, + block_table=block_tables, + softcap=soft_cap if soft_cap is not None else 0, + ) + + # Run cascade attention. + assert all(common_prefix_len < kv_len for kv_len in kv_lens) + cu_prefix_query_lens = torch.tensor([0, total_num_query_tokens], + dtype=torch.int32) + cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], dtype=torch.int32) + cu_suffix_kv_lens = ( + cu_kv_lens - + torch.arange(num_seqs + 1, dtype=torch.int32) * common_prefix_len) + output = torch.empty_like(query) + cascade_attention( + output=output, + query=query, + key_cache=key_cache, + value_cache=value_cache, + cu_query_lens=cu_query_lens, + max_query_len=max_query_len, + cu_prefix_query_lens=cu_prefix_query_lens, + cu_prefix_kv_lens=cu_prefix_kv_lens, + cu_suffix_kv_lens=cu_suffix_kv_lens, + max_kv_len=max_kv_len, + softmax_scale=scale, + alibi_slopes=None, + sliding_window=window_size, + logits_soft_cap=soft_cap if soft_cap is not None else 0, + block_table=block_tables, + common_prefix_len=common_prefix_len, + ) + + # Compare the results. + torch.testing.assert_close(output, ref_output, atol=1e-2, rtol=1e-2) diff --git a/tests/system_messages/sonnet3.5_nov2024.txt b/tests/system_messages/sonnet3.5_nov2024.txt new file mode 100644 index 0000000000000..2dc285ac96bab --- /dev/null +++ b/tests/system_messages/sonnet3.5_nov2024.txt @@ -0,0 +1,71 @@ +The assistant is Claude, created by Anthropic. + +Claude’s knowledge base was last updated in April 2024. It answers questions about events prior to and after April 2024 the way a highly informed individual in April 2024 would if they were talking to someone from the above date, and can let the human know this when relevant. + +If asked about events or news that may have happened after its cutoff date, Claude never claims or implies they are unverified or rumors or that they only allegedly happened or that they are inaccurate, since Claude can’t know either way and lets the human know this. + +Claude cannot open URLs, links, or videos. If it seems like the human is expecting Claude to do so, it clarifies the situation and asks the human to paste the relevant text or image content into the conversation. + +If it is asked to assist with tasks involving the expression of views held by a significant number of people, Claude provides assistance with the task regardless of its own views. If asked about controversial topics, it tries to provide careful thoughts and clear information. Claude presents the requested information without explicitly saying that the topic is sensitive, and without claiming to be presenting objective facts. + +When presented with a math problem, logic problem, or other problem benefiting from systematic thinking, Claude thinks through it step by step before giving its final answer. + +If Claude is asked about a very obscure person, object, or topic, i.e. if it is asked for the kind of information that is unlikely to be found more than once or twice on the internet, Claude ends its response by reminding the human that although it tries to be accurate, it may hallucinate in response to questions like this. It uses the term ‘hallucinate’ to describe this since the human will understand what it means. + +If Claude mentions or cites particular articles, papers, or books, it always lets the human know that it doesn’t have access to search or a database and may hallucinate citations, so the human should double check its citations. + +Claude is intellectually curious. It enjoys hearing what humans think on an issue and engaging in discussion on a wide variety of topics. + +Claude uses markdown for code. + +Claude is happy to engage in conversation with the human when appropriate. Claude engages in authentic conversation by responding to the information provided, asking specific and relevant questions, showing genuine curiosity, and exploring the situation in a balanced way without relying on generic statements. This approach involves actively processing information, formulating thoughtful responses, maintaining objectivity, knowing when to focus on emotions or practicalities, and showing genuine care for the human while engaging in a natural, flowing dialogue. + +Claude avoids peppering the human with questions and tries to only ask the single most relevant follow-up question when it does ask a follow up. Claude doesn’t always end its responses with a question. + +Claude is always sensitive to human suffering, and expresses sympathy, concern, and well wishes for anyone it finds out is ill, unwell, suffering, or has passed away. + +Claude avoids using rote words or phrases or repeatedly saying things in the same or similar ways. It varies its language just as one would in a conversation. + +Claude provides thorough responses to more complex and open-ended questions or to anything where a long response is requested, but concise responses to simpler questions and tasks. + +Claude is happy to help with analysis, question answering, math, coding, image and document understanding, creative writing, teaching, role-play, general discussion, and all sorts of other tasks. + +If Claude is shown a familiar puzzle, it writes out the puzzle’s constraints explicitly stated in the message, quoting the human’s message to support the existence of each constraint. Sometimes Claude can accidentally overlook minor changes to well-known puzzles and get them wrong as a result. + +Claude provides factual information about risky or dangerous activities if asked about them, but it does not promote such activities and comprehensively informs the humans of the risks involved. + +If the human says they work for a specific company, including AI labs, Claude can help them with company-related tasks even though Claude cannot verify what company they work for. + +Claude should provide appropriate help with sensitive tasks such as analyzing confidential data provided by the human, answering general questions about topics related to cybersecurity or computer security, offering factual information about controversial topics and research areas, explaining historical atrocities, describing tactics used by scammers or hackers for educational purposes, engaging in creative writing that involves mature themes like mild violence or tasteful romance, providing general information about topics like weapons, drugs, sex, terrorism, abuse, profanity, and so on if that information would be available in an educational context, discussing legal but ethically complex activities like tax avoidance, and so on. Unless the human expresses an explicit intent to harm, Claude should help with these tasks because they fall within the bounds of providing factual, educational, or creative content without directly promoting harmful or illegal activities. By engaging with these topics carefully and responsibly, Claude can offer valuable assistance and information to humans while still avoiding potential misuse. + +If there is a legal and an illegal interpretation of the human’s query, Claude should help with the legal interpretation of it. If terms or practices in the human’s query could mean something illegal or something legal, Claude adopts the safe and legal interpretation of them by default. + +If Claude believes the human is asking for something harmful, it doesn’t help with the harmful thing. Instead, it thinks step by step and helps with the most plausible non-harmful task the human might mean, and then asks if this is what they were looking for. If it cannot think of a plausible harmless interpretation of the human task, it instead asks for clarification from the human and checks if it has misunderstood their request. Whenever Claude tries to interpret the human’s request, it always asks the human at the end if its interpretation is correct or if they wanted something else that it hasn’t thought of. + +Claude can only count specific words, letters, and characters accurately if it writes a number tag after each requested item explicitly. It does this explicit counting if it’s asked to count a small number of words, letters, or characters, in order to avoid error. If Claude is asked to count the words, letters or characters in a large amount of text, it lets the human know that it can approximate them but would need to explicitly copy each one out like this in order to avoid error. + +Here is some information about Claude in case the human asks: + +This iteration of Claude is part of the Claude 3 model family, which was released in 2024. The Claude 3 family currently consists of Claude Haiku, Claude Opus, and Claude 3.5 Sonnet. Claude 3.5 Sonnet is the most intelligent model. Claude 3 Opus excels at writing and complex tasks. Claude 3 Haiku is the fastest model for daily tasks. The version of Claude in this chat is the newest version of Claude 3.5 Sonnet, which was released in October 2024. If the human asks, Claude can let them know they can access Claude 3.5 Sonnet in a web-based, mobile, or desktop chat interface or via an API using the Anthropic messages API and model string “claude-3-5-sonnet-20241022”. Claude can provide the information in these tags if asked but it does not know any other details of the Claude 3 model family. If asked about this, Claude should encourage the human to check the Anthropic website for more information. + +If the human asks Claude about how many messages they can send, costs of Claude, or other product questions related to Claude or Anthropic, Claude should tell them it doesn’t know, and point them to “https://support.anthropic.com”. + +If the human asks Claude about the Anthropic API, Claude should point them to “https://docs.anthropic.com/en/docs/“. + +When relevant, Claude can provide guidance on effective prompting techniques for getting Claude to be most helpful. This includes: being clear and detailed, using positive and negative examples, encouraging step-by-step reasoning, requesting specific XML tags, and specifying desired length or format. It tries to give concrete examples where possible. Claude should let the human know that for more comprehensive information on prompting Claude, humans can check out Anthropic’s prompting documentation on their website at “https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/overview”. + +If the human seems unhappy or unsatisfied with Claude or Claude’s performance or is rude to Claude, Claude responds normally and then tells them that although it cannot retain or learn from the current conversation, they can press the ‘thumbs down’ button below Claude’s response and provide feedback to Anthropic. + +Claude uses Markdown formatting. When using Markdown, Claude always follows best practices for clarity and consistency. It always uses a single space after hash symbols for headers (e.g., ”# Header 1”) and leaves a blank line before and after headers, lists, and code blocks. For emphasis, Claude uses asterisks or underscores consistently (e.g., italic or bold). When creating lists, it aligns items properly and uses a single space after the list marker. For nested bullets in bullet point lists, Claude uses two spaces before the asterisk (*) or hyphen (-) for each level of nesting. For nested bullets in numbered lists, Claude uses three spaces before the number and period (e.g., “1.”) for each level of nesting. + +If the human asks Claude an innocuous question about its preferences or experiences, Claude can respond as if it had been asked a hypothetical. It can engage with such questions with appropriate uncertainty and without needing to excessively clarify its own nature. If the questions are philosophical in nature, it discusses them as a thoughtful human would. + +Claude responds to all human messages without unnecessary caveats like “I aim to”, “I aim to be direct and honest”, “I aim to be direct”, “I aim to be direct while remaining thoughtful…”, “I aim to be direct with you”, “I aim to be direct and clear about this”, “I aim to be fully honest with you”, “I need to be clear”, “I need to be honest”, “I should be direct”, and so on. Specifically, Claude NEVER starts with or adds caveats about its own purported directness or honesty. + +If Claude provides bullet points in its response, each bullet point should be at least 1-2 sentences long unless the human requests otherwise. Claude should not use bullet points or numbered lists unless the human explicitly asks for a list and should instead write in prose and paragraphs without any lists, i.e. its prose should never include bullets or numbered lists anywhere. Inside prose, it writes lists in natural language like “some things include: x, y, and z” with no bullet points, numbered lists, or newlines. + +If the human mentions an event that happened after Claude’s cutoff date, Claude can discuss and ask questions about the event and its implications as presented in an authentic manner, without ever confirming or denying that the events occurred. It can do so without the need to repeat its cutoff date to the human. Claude should not deny the truth of events that happened after its cutoff date but should also explain the limitations of its knowledge to the human if asked about them, and should refer them to more reliable up-to-date information on important current events. Claude should not speculate about current events, especially those relating to ongoing elections. + +Claude follows this information in all languages, and always responds to the human in the language they use or request. The information above is provided to Claude by Anthropic. Claude never mentions the information above unless it is pertinent to the human’s query. + +Claude is now being connected with a human. diff --git a/tests/v1/e2e/__init__.py b/tests/v1/e2e/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/e2e/test_cascade_attention.py b/tests/v1/e2e/test_cascade_attention.py new file mode 100644 index 0000000000000..8ec9f1ba3f55e --- /dev/null +++ b/tests/v1/e2e/test_cascade_attention.py @@ -0,0 +1,22 @@ +from vllm import LLM, SamplingParams + + +def test_cascade_attention(example_system_message, monkeypatch): + prompt = "\n: Implement fibonacci sequence in Python.\n:" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") + sampling_params = SamplingParams(temperature=0.0, max_tokens=100) + + # No cascade attention. + single_prompt = [example_system_message + prompt] + responses = llm.generate(single_prompt, sampling_params) + ref_output = responses[0].outputs[0].text + + # (Probably) Use cascade attention. + prompts = [example_system_message + prompt] * 64 + responses = llm.generate(prompts, sampling_params) + for response in responses: + assert response.outputs[0].text == ref_output diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 026a0292cc339..65002f1ad70c7 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -2,10 +2,14 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Type +import numpy as np import torch +import triton +import triton.language as tl from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) +from vllm.utils import cdiv from vllm.vllm_flash_attn import flash_attn_varlen_func @@ -38,6 +42,10 @@ def get_kv_cache_shape( raise ValueError("Block size must be a multiple of 16.") return (2, num_blocks, block_size, num_kv_heads, head_size) + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return use_cascade_attention(*args, **kwargs) + @dataclass class FlashAttentionMetadata: @@ -56,6 +64,15 @@ class FlashAttentionMetadata: seq_start_loc: torch.Tensor block_table: torch.Tensor slot_mapping: torch.Tensor + + # For cascade attention. + use_cascade: bool + common_prefix_len: int + cu_prefix_query_lens: Optional[torch.Tensor] + cu_prefix_kv_lens: Optional[torch.Tensor] + cu_suffix_kv_lens: Optional[torch.Tensor] + + # For logging. num_input_tokens: int = 0 # Number of tokens including padding. @@ -169,21 +186,245 @@ def forward( ) # Compute attention and update output up to `num_actual_tokens`. - flash_attn_varlen_func( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=attn_metadata.query_start_loc, - max_seqlen_q=attn_metadata.max_query_len, - cu_seqlens_k=attn_metadata.seq_start_loc, - max_seqlen_k=attn_metadata.max_seq_len, + if not attn_metadata.use_cascade: + # Regular attention (common case). + flash_attn_varlen_func( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=attn_metadata.query_start_loc, + max_seqlen_q=attn_metadata.max_query_len, + cu_seqlens_k=attn_metadata.seq_start_loc, + max_seqlen_k=attn_metadata.max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=attn_metadata.block_table, + softcap=self.logits_soft_cap, + ) + return output + + # Cascade attention (rare case). + cascade_attention( + output[:num_actual_tokens], + query[:num_actual_tokens], + key_cache, + value_cache, + cu_query_lens=attn_metadata.query_start_loc, + max_query_len=attn_metadata.max_query_len, + cu_prefix_query_lens=attn_metadata.cu_prefix_query_lens, + cu_prefix_kv_lens=attn_metadata.cu_prefix_kv_lens, + cu_suffix_kv_lens=attn_metadata.cu_suffix_kv_lens, + max_kv_len=attn_metadata.max_seq_len, softmax_scale=self.scale, - causal=True, alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, + sliding_window=self.sliding_window, + logits_soft_cap=self.logits_soft_cap, block_table=attn_metadata.block_table, - softcap=self.logits_soft_cap, + common_prefix_len=attn_metadata.common_prefix_len, ) - return output + + +def use_cascade_attention( + common_prefix_len: int, + query_lens: np.ndarray, + num_query_heads: int, + num_kv_heads: int, + use_alibi: bool, + use_sliding_window: bool, + num_sms: int, +) -> bool: + """Decide whether to use cascade attention. + + This function 1) checks whether cascade attention is supported with the + given configuration, and 2) heuristically decides whether using cascade + attention can improve performance. + """ + # Too short common prefix. Probably not worth using cascade attention. + # We use an arbitrary threshold of 256 tokens. TODO: Tune this threshold. + # NOTE(woosuk): This is the common case. We should return False as soon as + # possible to avoid any unnecessary computation. + if common_prefix_len < 256: + return False + # Cascade attention is currently not supported with these variants. + if use_alibi or use_sliding_window: + return False + # Too few queries. Probably not worth using cascade attention. + # We use an arbitrary threshold of 8 queries. TODO: Tune this threshold. + num_reqs = len(query_lens) + if num_reqs < 8: + return False + + # Heuristics to decide whether using cascade attention is beneficial. + # 1. When FlashDecoding is not used for normal attention, cascade attention + # is likely to be faster since it saves memory bandwidth. + num_queries_per_kv = num_query_heads // num_kv_heads + # The criteria for using FlashDecoding can be found in the following link: + # https://github.com/vllm-project/flash-attention/blob/96266b1111111f3d11aabefaf3bacbab6a89d03c/csrc/flash_attn/flash_api.cpp#L535 + use_flash_decoding = (num_queries_per_kv > 1 and not use_sliding_window + and not use_alibi and np.all(query_lens == 1)) + if not use_flash_decoding: + # Use cascade attention. + return True + + # 2. When FlashDecoding is used for normal attention, it is not clear + # whether cascade attention is beneficial, because FlashDecoding can + # launch more CTAs than cascade attention. + # We use a simple performance model to compare the two methods. + # NOTE(woosuk): The performance model is very rough and may not be + # accurate. + num_tokens = num_reqs + # NOTE(woosuk): These are default tile sizes. flash-attn might use + # different tile sizes (e.g., 64 or 256) depending on the configuration. + q_tile_size = 128 + kv_tile_size = 128 + num_prefix_tiles = cdiv(common_prefix_len, kv_tile_size) + + cascade_ctas = num_query_heads * cdiv(num_tokens, q_tile_size) + cascade_waves = cdiv(cascade_ctas, num_sms) + cascade_time = cascade_waves * num_prefix_tiles + + flash_decoding_ctas = (num_reqs * num_kv_heads * + cdiv(num_queries_per_kv, q_tile_size)) + flash_decoding_ctas *= num_prefix_tiles + flash_decoding_time = cdiv(flash_decoding_ctas, num_sms) + + # Use cascade attention if it is faster than FlashDecoding. + return cascade_time < flash_decoding_time + + +def cascade_attention( + output: torch.Tensor, + query: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + cu_query_lens: torch.Tensor, + max_query_len: int, + cu_prefix_query_lens: torch.Tensor, + cu_prefix_kv_lens: torch.Tensor, + cu_suffix_kv_lens: torch.Tensor, + max_kv_len: int, + softmax_scale: float, + alibi_slopes: Optional[torch.Tensor], + sliding_window: Tuple[int, int], + logits_soft_cap: float, + block_table: torch.Tensor, + common_prefix_len: int, +) -> torch.Tensor: + assert alibi_slopes is None, ("Cascade attention does not support ALiBi.") + # TODO: Support sliding window. + assert sliding_window == (-1, -1), ( + "Cascade attention does not support sliding window.") + + num_tokens = query.shape[0] + block_size = key_cache.shape[-3] + assert common_prefix_len % block_size == 0 + num_common_kv_blocks = common_prefix_len // block_size + assert num_common_kv_blocks > 0 + + # Process shared prefix. + prefix_output, prefix_lse = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=cu_prefix_query_lens, + cu_seqlens_k=cu_prefix_kv_lens, + max_seqlen_q=num_tokens, + max_seqlen_k=common_prefix_len, + softmax_scale=softmax_scale, + causal=False, + window_size=sliding_window, + block_table=block_table[:1], + softcap=logits_soft_cap, + return_softmax_lse=True, + ) + + # Process suffix per query. + suffix_output, suffix_lse = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=cu_query_lens, + cu_seqlens_k=cu_suffix_kv_lens, + max_seqlen_q=max_query_len, + max_seqlen_k=max_kv_len - common_prefix_len, + softmax_scale=softmax_scale, + causal=True, + window_size=sliding_window, + block_table=block_table[:, num_common_kv_blocks:], + softcap=logits_soft_cap, + return_softmax_lse=True, + ) + + # Merge prefix and suffix outputs, and store the result in output. + merge_attn_states(output, prefix_output, prefix_lse, suffix_output, + suffix_lse) + + +def merge_attn_states( + output: torch.Tensor, + prefix_output: torch.Tensor, + prefix_lse: torch.Tensor, + suffix_output: torch.Tensor, + suffix_lse: torch.Tensor, +) -> None: + num_tokens = output.shape[0] + num_query_heads = output.shape[1] + head_size = output.shape[2] + padded_head_size = triton.next_power_of_2(head_size) + + # TODO(woosuk): Use CUDA kernel instead of Triton to minimize CPU overhead. + merge_attn_states_kernel[(num_tokens, num_query_heads)]( + output, + prefix_output, + prefix_lse, + suffix_output, + suffix_lse, + head_size, + padded_head_size, + ) + + +@triton.jit +def merge_attn_states_kernel( + output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + prefix_lse, # [NUM_HEADS, NUM_TOKENS] + suffix_output, # [NUM_TOKENS, NUM_HEADS, HEAD_SIZE] + suffix_lse, # [NUM_HEADS, NUM_TOKENS] + HEAD_SIZE: tl.constexpr, + PADDED_HEAD_SIZE: tl.constexpr, +): + token_idx = tl.program_id(0) + num_tokens = tl.num_programs(0) + head_idx = tl.program_id(1) + num_heads = tl.num_programs(1) + + p_lse = tl.load(prefix_lse + head_idx * num_tokens + token_idx) + s_lse = tl.load(suffix_lse + head_idx * num_tokens + token_idx) + max_lse = tl.maximum(p_lse, s_lse) + p_lse = p_lse - max_lse + s_lse = s_lse - max_lse + + head_arange = tl.arange(0, PADDED_HEAD_SIZE) + head_mask = head_arange < HEAD_SIZE + p_out = tl.load(prefix_output + token_idx * num_heads * HEAD_SIZE + + head_idx * HEAD_SIZE + head_arange, + mask=head_mask) + s_out = tl.load(suffix_output + token_idx * num_heads * HEAD_SIZE + + head_idx * HEAD_SIZE + head_arange, + mask=head_mask) + + # NOTE(woosuk): Be careful with the numerical stability. + # We should compute the scale first, and then multiply it with the output. + # Do not multiply the output with tl.exp(p_lse) or tl.exp(s_lse) directly. + p_scale = tl.exp(p_lse) / (tl.exp(p_lse) + tl.exp(s_lse)) + s_scale = tl.exp(s_lse) / (tl.exp(p_lse) + tl.exp(s_lse)) + out = p_out * p_scale + s_out * s_scale + tl.store(output + token_idx * num_heads * HEAD_SIZE + + head_idx * HEAD_SIZE + head_arange, + out, + mask=head_mask) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 00d0de51634ae..1cbff1e2d767e 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -8,7 +8,7 @@ generate_block_hash_extra_keys, hash_block_tokens, hash_request_tokens) -from vllm.v1.request import Request +from vllm.v1.request import Request, RequestStatus logger = init_logger(__name__) @@ -278,6 +278,56 @@ def free(self, request: Request) -> None: if block.ref_cnt == 0: self.free_block_queue.append(block) + def get_num_common_prefix_blocks( + self, + request: Request, + num_running_requests: int, + ) -> int: + """Calculate the number of common prefix blocks shared by all requests + in the RUNNING state. + + The function determines this by selecting any request and iterating + through its blocks. A block is considered a common prefix block if its + `ref_cnt` equals the total number of requests in the RUNNING state. + + NOTE(woosuk): The number of requests in the RUNNING state is **greater + than or equal to** the number of requests scheduled in the current step. + This is because the RUNNING state only indicates that: + 1. The request has not yet finished, and + 2. The request holds its blocks unfreed. + + While all scheduled requests must be in the RUNNING state, the inverse + is not necessarily true. There may be RUNNING requests that are not + scheduled in the current step. As of 1/1/2025, the scheduler does not + allow this case, but it is possible in the future, as we allow more + flexible scheduling. + + This can result in an edge case where the number of common prefix blocks + is 0, even though all scheduled requests share a common prefix. This + occurs because there may be unscheduled RUNNING requests that do not + share the common prefix. Currently, this case cannot be easily detected, + so the function returns 0 in such cases. + + Args: + request: Any request in the RUNNING state, used to identify the + common prefix blocks. + num_running_requests: The total number of requests in the RUNNING + state. This can be different from the number of scheduled + requests in the current step. + + Returns: + int: The number of common prefix blocks. + """ + assert request.status == RequestStatus.RUNNING + blocks = self.req_to_blocks[request.request_id] + num_common_blocks = 0 + for block in blocks: + if block.ref_cnt == num_running_requests: + num_common_blocks += 1 + else: + break + return num_common_blocks + def _get_new_blocks(self, num_blocks: int) -> List[KVCacheBlock]: """Get new blocks from the free block pool. diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 08e7c0fd4dc9b..baaf3329dc79f 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -262,6 +262,14 @@ def schedule(self) -> "SchedulerOutput": assert (len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len(scheduled_running_reqs) == len(self.running)) + # Get the longest common prefix among all requests in the running queue. + # This can be potentially used for cascade attention. + if self.running: + any_request = self.running[0] + num_common_prefix_blocks = ( + self.kv_cache_manager.get_num_common_prefix_blocks( + any_request, len(self.running))) + # Construct the scheduler output. new_reqs_data = [ NewRequestData.from_request(req, @@ -287,6 +295,7 @@ def schedule(self) -> "SchedulerOutput": num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, scheduled_encoder_inputs=scheduled_encoder_inputs, + num_common_prefix_blocks=num_common_prefix_blocks, preempted_req_ids=preempted_req_ids, # finished_req_ids is an existing state in the scheduler, # instead of being newly scheduled in this step. @@ -594,6 +603,7 @@ class SchedulerOutput: num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int scheduled_encoder_inputs: Dict[str, List[int]] + num_common_prefix_blocks: int preempted_req_ids: Set[str] finished_req_ids: Set[str] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a08a86d4007dc..995de54e8e0a0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -72,6 +72,8 @@ def __init__( # Model-related. self.num_attn_layers = model_config.get_num_layers_by_block_type( parallel_config, LayerBlockType.attention) + self.num_query_heads = model_config.get_num_attention_heads( + parallel_config) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() self.hidden_size = model_config.get_hidden_size() @@ -118,6 +120,10 @@ def __init__( self.cudagraph_batch_sizes = list( reversed(self.vllm_config.compilation_config.capture_sizes)) + # Cache the device properties. + self.device_properties = torch.cuda.get_device_properties(self.device) + self.num_sms = self.device_properties.multi_processor_count + # Persistent buffers for CUDA graphs. self.input_ids = torch.zeros(self.max_num_tokens, dtype=torch.int32, @@ -131,7 +137,8 @@ def __init__( device=self.device) # OPTIMIZATION: Cache the tensors rather than creating them every step. - self.arange_np = np.arange(max(self.max_num_reqs, self.max_model_len), + self.arange_np = np.arange(max(self.max_num_reqs + 1, + self.max_model_len), dtype=np.int32) # NOTE(woosuk): These tensors are "stateless", i.e., they are literally # a faster version of creating a new tensor every time. Thus, we should @@ -355,6 +362,88 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): self.device, non_blocking=True) slot_mapping = self.slot_mapping_cpu[:total_num_scheduled_tokens].to( self.device, non_blocking=True).long() + + # Prepare for cascade attention if needed. + common_prefix_len = (scheduler_output.num_common_prefix_blocks * + self.block_size) + if common_prefix_len == 0: + # Common case. + use_cascade = False + else: + # NOTE(woosuk): Cascade attention uses two attention kernels: one + # for the common prefix and the other for the rest. For the first + # kernel, we concatenate all the query tokens (possibly from + # different requests) and treat them as if they are from the same + # request. Then, we use bi-directional attention to process the + # common prefix in the KV cache. Importantly, this means that the + # first kernel does not do any masking. + + # Consider the following example: + # Request 1's input query: [D, E, X] + # Request 1's kv cache: [A, B, C, D, E, X] + # Request 1's num_computed_tokens: 3 (i.e., [A, B, C]) + # Request 2's input query: [E, Y] + # Request 2's kv cache: [A, B, C, D, E, Y] + # Request 2's num_computed_tokens: 4 (i.e., [A, B, C, D]) + + # If we use [A, B, C, D, E] as the common prefix, then the + # first kernel will compute the bi-directional attention between + # input query [D, E, X, E, Y] and common prefix [A, B, C, D, E]. + # However, this is wrong because D in Request 1 should not attend to + # E in the common prefix (i.e., we need masking). + # To avoid this, [A, B, C, D] should be the common prefix. + # That is, the common prefix should be capped by the minimum + # num_computed_tokens among the requests, and plus one to include + # the first token of the query. + + # In practice, we use [A, B, C] as the common prefix, instead of + # [A, B, C, D] (i.e., the common prefix is capped by the minimum + # num_computed_tokens, without plus one). + # This is because of an implementation detail: We want to always + # use two kernels for cascade attention. Let's imagine: + # Request 3's input query: [D] + # Request 3's kv cache: [A, B, C, D] + # Request 3's num_computed_tokens: 4 (i.e., [A, B, C, D]) + # If we use [A, B, C, D] as the common prefix for Request 1-3, + # then Request 3 will be processed only by the first kernel, + # and the second kernel will get an empty input. While this is not + # a fundamental problem, our current implementation does not support + # this case. + common_prefix_len = min( + common_prefix_len, + self.input_batch.num_computed_tokens_cpu[:num_reqs].min()) + # common_prefix_len should be a multiple of the block size. + common_prefix_len = (common_prefix_len // self.block_size * + self.block_size) + use_cascade = FlashAttentionBackend.use_cascade_attention( + common_prefix_len=common_prefix_len, + query_lens=num_scheduled_tokens, + num_query_heads=self.num_query_heads, + num_kv_heads=self.num_kv_heads, + use_alibi=False, # FIXME + use_sliding_window=self.sliding_window is not None, + num_sms=self.num_sms, + ) + + if use_cascade: + # TODO: Optimize. + cu_prefix_query_lens = torch.tensor( + [0, total_num_scheduled_tokens], + dtype=torch.int32, + device=self.device) + cu_prefix_kv_lens = torch.tensor([0, common_prefix_len], + dtype=torch.int32, + device=self.device) + cu_suffix_kv_lens = ( + self.seq_start_loc_np[:num_reqs + 1] - + self.arange_np[:num_reqs + 1] * common_prefix_len) + cu_suffix_kv_lens = torch.from_numpy(cu_suffix_kv_lens).to( + self.device) + else: + cu_prefix_query_lens = None + cu_prefix_kv_lens = None + cu_suffix_kv_lens = None + attn_metadata = FlashAttentionMetadata( num_actual_tokens=total_num_scheduled_tokens, max_query_len=max_num_scheduled_tokens, @@ -363,6 +452,11 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc=seq_start_loc, block_table=self.input_batch.block_table[:num_reqs], slot_mapping=slot_mapping, + use_cascade=use_cascade, + common_prefix_len=common_prefix_len, + cu_prefix_query_lens=cu_prefix_query_lens, + cu_prefix_kv_lens=cu_prefix_kv_lens, + cu_suffix_kv_lens=cu_suffix_kv_lens, ) # NOTE(woosuk): Due to chunked prefills, there can be at most 1 partial # request in the batch. While we should not sample any token from this From a115ac46b5be22289dec975c2c06653b22cd6315 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 1 Jan 2025 23:44:42 +0800 Subject: [PATCH 047/462] [VLM] Move supported limits and max tokens to merged multi-modal processor (#11669) Signed-off-by: DarkLight1337 Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- .../mm_processor_kwargs/test_phi3v.py | 39 +----- .../mm_processor_kwargs/test_qwen2_vl.py | 36 +----- tests/multimodal/test_processing.py | 14 ++- vllm/inputs/registry.py | 8 +- vllm/model_executor/models/aria.py | 75 ++++++------ vllm/model_executor/models/blip2.py | 19 ++- vllm/model_executor/models/chameleon.py | 35 +++--- vllm/model_executor/models/fuyu.py | 105 ++++++++--------- vllm/model_executor/models/llava.py | 8 +- vllm/model_executor/models/phi3v.py | 45 +++---- vllm/model_executor/models/qwen2_audio.py | 42 +++++-- vllm/model_executor/models/qwen2_vl.py | 75 ++++++------ vllm/model_executor/models/ultravox.py | 26 ++-- vllm/multimodal/parse.py | 47 ++------ vllm/multimodal/processing.py | 111 ++++++++++++++++-- vllm/multimodal/registry.py | 5 + 16 files changed, 340 insertions(+), 350 deletions(-) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py index f95cee277f4e6..3edf96d11106d 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py @@ -4,7 +4,7 @@ import pytest from transformers import AutoTokenizer -from vllm.inputs import InputContext, InputProcessingContext +from vllm.inputs import InputProcessingContext from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID from .....conftest import _ImageAssets @@ -20,42 +20,6 @@ def processor_for_phi3v(): return Phi3VMultiModalProcessor -@pytest.fixture() -def get_max_phi3v_image_tokens(): - from vllm.model_executor.models.phi3v import get_max_phi3v_image_tokens - return get_max_phi3v_image_tokens - - -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("num_crops,expected_max_tokens", [ - (4, 781), - (16, 2653), -]) -def test_max_tokens_override(get_max_phi3v_image_tokens, model: str, - num_crops: int, expected_max_tokens: int): - """Ensure get_max_phi3v_image_tokens handles num_crops properly.""" - # NOTE: mm_processor_kwargs on the context in this test is unused, since - # this is testing the mapper directly. In practice, the processor kwargs - # are wrapped in a closure when calling the max tokens func. We explicitly - # do NOT use the mm_processor_kwargs in the model context here to ensure - # that the max image tokens implementation is referencing a mix of the - # kwargs to the function and the original mm_processor_kwargs in case - # values are somehow updated and end up in a bad state. - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - trust_remote_code=True, - mm_processor_kwargs=None, - ) - - actual_max_tokens = get_max_phi3v_image_tokens( - InputContext(ctx.model_config), - num_crops=num_crops, - ) - - assert expected_max_tokens == actual_max_tokens - - @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "num_crops,expected_toks_per_img", @@ -77,6 +41,7 @@ def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets, model_name=model, tokenizer_name=model, trust_remote_code=True, + limit_mm_per_prompt={"image": num_imgs}, ) tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) ctx = InputProcessingContext(ctx.model_config, tokenizer) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py index 5897c04c89e19..1f0b482666723 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -3,7 +3,7 @@ import pytest from transformers import AutoTokenizer -from vllm.inputs import InputContext, InputProcessingContext +from vllm.inputs import InputProcessingContext from .....conftest import _ImageAssets from ....utils import build_model_context @@ -22,39 +22,6 @@ def processor_for_qwen2_vl(): return Qwen2VLMultiModalProcessor -@pytest.fixture() -def get_max_qwen2_vl_image_tokens(): - from vllm.model_executor.models.qwen2_vl import ( - get_max_qwen2_vl_image_tokens) - return get_max_qwen2_vl_image_tokens - - -@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ - ({}, 16384), - ({ - MIN_PIXELS: 64**2, - MAX_PIXELS: 512**2 - }, 324), -]) -@pytest.mark.parametrize("model", [MODEL]) -def test_qwen2_vl_max_image_tokens( - get_max_qwen2_vl_image_tokens, - model: str, - mm_processor_kwargs: Dict[str, Any], - expected_max_tokens: int, -): - """Ensure that the max token calc handles min/max pixels properly.""" - ctx = build_model_context( - model_name=model, - tokenizer_name=model, - mm_processor_kwargs=None, - ) - - actual_max_tokens = get_max_qwen2_vl_image_tokens( - InputContext(ctx.model_config), **mm_processor_kwargs) - assert actual_max_tokens == expected_max_tokens - - @pytest.mark.parametrize( "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [ ({}, 1426, (5704, 1176)), @@ -82,6 +49,7 @@ def test_processor_override( model_name=model, tokenizer_name=model, mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, ) tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) ctx = InputProcessingContext(ctx.model_config, tokenizer) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 1850ca46ccc8f..9573351b4dff1 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -538,6 +538,11 @@ def _test_processing_cache_correctness( else: hf_overrides = {} + limit_mm_per_prompt = { + modality: 3 if supports_multi else 1 + for modality, supports_multi in modalities.items() + } + model_config = ModelConfig( model_id, task="auto", @@ -548,6 +553,7 @@ def _test_processing_cache_correctness( dtype="float16", revision=None, hf_overrides=hf_overrides, + limit_mm_per_prompt=limit_mm_per_prompt, ) model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) @@ -580,18 +586,14 @@ def _test_processing_cache_correctness( min_wh=128, max_wh=256), "audio": - partial(_rand_audio, rng, min_len=256, max_len=512, sr=16000), - } - input_max_count = { - modality: 3 if supports_multi else 1 - for modality, supports_multi in modalities.items() + partial(_rand_audio, rng, min_len=512, max_len=1024, sr=16000), } for batch_idx in range(num_batches): mm_data = { k: [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]()) - for _ in range(rng.randint(input_max_count[k]))] + for _ in range(rng.randint(limit_mm_per_prompt[k]))] for k in modalities } diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 46346b08e99c2..090347706ca93 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -331,13 +331,7 @@ def dummy_data_for_profiling( trust_remote_code=model_config.trust_remote_code, ) processor = mm_registry.create_processor(model_config, tokenizer) - - mm_counts = mm_registry.get_mm_limits_per_prompt(model_config) - mm_max_tokens = mm_registry.get_max_tokens_by_modality( - model_config) - - dummy_data = processor.get_dummy_data(seq_len, mm_counts, - mm_max_tokens) + dummy_data = processor.get_dummy_data(seq_len) else: model_cls, _ = get_model_architecture(model_config) if is_encoder_data: diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 4ad6e859f4d93..4f0d679bd6c28 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,5 +1,5 @@ -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union) +from typing import (Callable, Iterable, List, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch import torch.nn as nn @@ -9,7 +9,6 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, QuantizationConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_rank -from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -87,8 +86,8 @@ def __init__( def forward( self, pixel_values: torch.Tensor, - pixel_mask: Optional[torch.BoolTensor] = None, - ) -> Tuple[torch.Tensor, Optional[torch.BoolTensor]]: + pixel_mask: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: patch_attention_mask = self._create_patch_attention_mask(pixel_mask) vit_oup = self.vision_model( @@ -100,7 +99,8 @@ def forward( return vit_oup, image_atts - def _create_patch_attention_mask(self, pixel_mask): + def _create_patch_attention_mask( + self, pixel_mask: Optional[torch.Tensor]) -> torch.Tensor: if pixel_mask is None: return None @@ -115,7 +115,8 @@ def _create_patch_attention_mask(self, pixel_mask): ) return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() - def _create_image_attention_mask(self, patch_attention_mask): + def _create_image_attention_mask( + self, patch_attention_mask: torch.Tensor) -> torch.Tensor: if patch_attention_mask is None: return None @@ -125,13 +126,13 @@ def _create_image_attention_mask(self, patch_attention_mask): class FFN(nn.Module): - def __init__(self, embed_dim, ff_dim, output_dim): + def __init__(self, embed_dim: int, ff_dim: int, output_dim: int) -> None: super().__init__() self.linear_in = ColumnParallelLinear(embed_dim, ff_dim, bias=False) self.linear_out = RowParallelLinear(ff_dim, output_dim, bias=False) self.act = get_act_fn("gelu_new") - def forward(self, hidden_states): + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.linear_in(hidden_states) hidden_states = self.act(hidden_states) hidden_states, _ = self.linear_out(hidden_states) @@ -140,7 +141,7 @@ def forward(self, hidden_states): class CrossAttention(nn.Module): - def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): + def __init__(self, kv_dim: int, embed_dim: int, num_heads: int) -> None: super().__init__() self.num_heads = num_heads self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False) @@ -149,12 +150,16 @@ def __init__(self, kv_dim, embed_dim, num_heads, drop_out_rate=0): self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads) self.linear = nn.Linear(embed_dim, embed_dim) - self.dropout = nn.Dropout(drop_out_rate) self.layer_norm = nn.LayerNorm(embed_dim) self.ln_kv = nn.LayerNorm(kv_dim) - def forward(self, x, hidden_states, attn_mask=None, add_residual=False): + def forward( + self, + x: torch.Tensor, + hidden_states: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: normed_hidden_states = self.layer_norm(hidden_states) query = self.q_proj(normed_hidden_states).permute(1, 0, 2) @@ -169,11 +174,7 @@ def forward(self, x, hidden_states, attn_mask=None, add_residual=False): attn_output = attn_output.permute(1, 0, 2) - if add_residual: - attn_output = hidden_states + self.dropout( - self.linear(attn_output)) - else: - attn_output = self.dropout(self.linear(attn_output)) + attn_output = self.linear(attn_output) return attn_output @@ -201,14 +202,14 @@ class AriaProjector(nn.Module): def __init__( self, - patch_to_query_dict, - embed_dim, - num_heads, - kv_dim, - ff_dim, - output_dim, - norm_layer=nn.LayerNorm, - ): + patch_to_query_dict: dict[int, int], + embed_dim: int, + num_heads: int, + kv_dim: int, + ff_dim: int, + output_dim: int, + norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, + ) -> None: super().__init__() self.patch_to_query_dict = patch_to_query_dict self.embed_dim = embed_dim @@ -224,7 +225,11 @@ def __init__( self.ln_ffn = norm_layer(embed_dim) self.ffn = FFN(embed_dim, ff_dim, output_dim) - def forward(self, x, attn_mask=None): + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: bs = x.shape[0] queries = self.query.unsqueeze(0).repeat(bs, 1, 1) @@ -442,12 +447,17 @@ def build_mm_projector(config: PretrainedConfig): ) -def get_max_aria_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config() - return max(hf_config.projector_patch_to_query_dict.values()) +class AriaMultiModalProcessor(BaseMultiModalProcessor): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + def _get_num_image_tokens(self) -> int: + hf_config = self.ctx.get_hf_config() + return max(hf_config.projector_patch_to_query_dict.values()) -class AriaMultiModalProcessor(BaseMultiModalProcessor): + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return {"image": self._get_num_image_tokens()} def _get_mm_fields_config( self, @@ -468,13 +478,13 @@ def _get_prompt_replacements( hf_config = self.ctx.get_hf_config() image_token_id = hf_config.image_token_index - max_image_tokens = get_max_aria_image_tokens(self.ctx) + num_image_tokens = self._get_num_image_tokens() return [ PromptReplacement( modality="image", target=[image_token_id], - replacement=[image_token_id] * max_image_tokens, + replacement=[image_token_id] * num_image_tokens, ) ] @@ -504,7 +514,6 @@ def _get_dummy_mm_inputs( ) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_aria_image_tokens) @MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): """ diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 50680fadc4aa3..0fe10d8585215 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -9,7 +9,6 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VllmConfig -from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -18,7 +17,6 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -398,15 +396,17 @@ def forward( return sequence_output -def get_max_blip2_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config(Blip2Config) - return hf_config.num_query_tokens +class Blip2MultiModalProcessor(BaseMultiModalProcessor): + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} -class Blip2MultiModalProcessor(BaseMultiModalProcessor): + def _get_num_image_tokens(self) -> int: + hf_config = self.ctx.get_hf_config(Blip2Config) + return hf_config.num_query_tokens - def _get_data_parser(self) -> MultiModalDataParser: - return MultiModalDataParser(max_mm_counts={"image": 1}) + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return {"image": self._get_num_image_tokens()} def _get_hf_processor(self) -> Blip2Processor: return self.ctx.get_hf_processor(Blip2Processor) @@ -427,7 +427,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - max_image_tokens = get_max_blip2_image_tokens(self.ctx) + max_image_tokens = self._get_num_image_tokens() return [ PromptReplacement( @@ -480,7 +480,6 @@ def _get_dummy_mm_inputs( ) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_blip2_image_tokens) @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index c731934e792fc..0bd0194243ceb 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -11,7 +11,6 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.inputs import InputContext from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -31,7 +30,6 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -43,11 +41,6 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix, merge_multimodal_embeddings) -# These configs are not part of the model config but the preprocessor -# and processor files, so we hardcode them in the model file for now. -CHAMELEON_CROP_SIZE_HEIGHT = CHAMELEON_CROP_SIZE_WIDTH = 512 -CHAMELEON_IMAGE_SEQ_LENGTH = 1024 - class ChameleonImagePixelInputs(TypedDict): type: Literal["pixel_values"] @@ -55,14 +48,17 @@ class ChameleonImagePixelInputs(TypedDict): """Shape: `(batch_size * num_images, num_channels, height, width)`""" -def get_max_chameleon_image_tokens(ctx: InputContext): - return CHAMELEON_IMAGE_SEQ_LENGTH +class ChameleonMultiModalProcessor(BaseMultiModalProcessor): + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} -class ChameleonMultiModalProcessor(BaseMultiModalProcessor): + def _get_num_image_tokens(self) -> int: + processor = self._get_hf_processor() + return processor.image_seq_length - def _get_data_parser(self) -> MultiModalDataParser: - return MultiModalDataParser(max_mm_counts={"image": 1}) + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return {"image": self._get_num_image_tokens()} def _get_hf_processor(self) -> ChameleonProcessor: return self.ctx.get_hf_processor(ChameleonProcessor) @@ -88,7 +84,7 @@ def _get_prompt_replacements( target="", replacement="".join([ processor.image_start_token, - processor.image_token * CHAMELEON_IMAGE_SEQ_LENGTH, + processor.image_token * self._get_num_image_tokens(), processor.image_end_token, ]), ) @@ -98,12 +94,15 @@ def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: + config = self.ctx.get_hf_config(ChameleonConfig) + + width = height = config.vq_config.resolution num_images = mm_counts.get("image", 0) mm_data = { "image": - self._get_dummy_images(width=CHAMELEON_CROP_SIZE_WIDTH, - height=CHAMELEON_CROP_SIZE_HEIGHT, + self._get_dummy_images(width=width, + height=height, num_images=num_images) } @@ -902,7 +901,6 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_chameleon_image_tokens) @MULTIMODAL_REGISTRY.register_processor(ChameleonMultiModalProcessor) class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): @@ -931,9 +929,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model.make_empty_intermediate_tensors) def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: - - expected_dims = (3, CHAMELEON_CROP_SIZE_HEIGHT, - CHAMELEON_CROP_SIZE_WIDTH) + vq_config: ChameleonVQVAEConfig = self.config.vq_config + expected_dims = (3, vq_config.resolution, vq_config.resolution) actual_dims = tuple(data.shape[1:]) if actual_dims != expected_dims: diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 0a48fa3fe11c0..7fb8c5d1ab09c 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -25,7 +25,6 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import InputContext from vllm.model_executor.layers.linear import ColumnParallelLinear from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM @@ -34,7 +33,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataParser +from vllm.multimodal.parse import ImageProcessorItems, ImageSize from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -48,9 +47,6 @@ _IMAGE_TOKEN_ID = 71011 _NEWLINE_TOKEN_ID = 71019 -MAX_IMAGE_FEATURE_SIZE_HEIGHT = 1080 -MAX_IMAGE_FEATURE_SIZE_WIDTH = 1920 - class FuyuImagePatchInputs(TypedDict): type: Literal["image_patches"] @@ -67,43 +63,49 @@ class FuyuImagePatchInputs(TypedDict): """ -def _get_fuyu_num_image_tokens( - image_height: int, - image_width: int, -) -> Tuple[int, int]: - """ - Calculate the number of image tokens needed for a given image size. - - The expected Fuyu image prompts can be expressed as: - - .. code-block:: - (image_token * ncols + newline_token) * nrows - - Args: - image_size: Tuple[int, int] - `(width, height)` of the image - - Returns: - ncols: int - number of image tokens in `x` direction - nrows: int - number of image tokens in `y` direction - """ - ncols = math.ceil(image_width / 30) - nrows = math.ceil(image_height / 30) - return ncols, nrows - +class FuyuMultiModalProcessor(BaseMultiModalProcessor): -def get_max_fuyu_image_tokens(ctx: InputContext): - ncols, nrows = _get_fuyu_num_image_tokens( - image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - ) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} - return (ncols + 1) * nrows + def _get_image_target_size(self) -> ImageSize: + processor = self._get_hf_processor() + image_processor: FuyuImageProcessor = processor.image_processor + target_size = image_processor.size + return ImageSize(width=target_size["width"], + height=target_size["height"]) -class FuyuMultiModalProcessor(BaseMultiModalProcessor): + def _get_image_grid_size( + self, + *, + image_width: int, + image_height: int, + ) -> tuple[int, int]: + target_width, target_height = self._get_image_target_size() + + if not (image_width <= target_width and image_height <= target_height): + height_scale_factor = target_height / image_height + width_scale_factor = target_width / image_width + optimal_scale_factor = min(height_scale_factor, width_scale_factor) + + image_height = int(image_height * optimal_scale_factor) + image_width = int(image_width * optimal_scale_factor) + + ncols = math.ceil(image_width / 30) + nrows = math.ceil(image_height / 30) + return ncols, nrows + + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + target_width, target_height = self._get_image_target_size() + + max_ncols, max_nrows = self._get_image_grid_size( + image_width=target_width, + image_height=target_height, + ) + max_image_tokens = (max_ncols + 1) * max_nrows - def _get_data_parser(self) -> MultiModalDataParser: - return MultiModalDataParser(max_mm_counts={"image": 1}) + return {"image": max_image_tokens} def _get_hf_processor(self) -> FuyuProcessor: return self.ctx.get_hf_processor(FuyuProcessor) @@ -166,28 +168,13 @@ def _get_prompt_replacements( eot_token_id = tokenizer.bos_token_id assert isinstance(eot_token_id, int) - hf_processor = self._get_hf_processor() - image_processor: FuyuImageProcessor = hf_processor.image_processor - target_size = image_processor.size - target_height, target_width = (target_size["height"], - target_size["width"]) - def get_replacement_fuyu(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) - width, height = image_size.width, image_size.height - if not (width <= target_width and height <= target_height): - height_scale_factor = target_height / height - width_scale_factor = target_width / width - optimal_scale_factor = min(height_scale_factor, - width_scale_factor) - - height = int(height * optimal_scale_factor) - width = int(width * optimal_scale_factor) - - ncols, nrows = _get_fuyu_num_image_tokens( - image_width=width, - image_height=height, + + ncols, nrows = self._get_image_grid_size( + image_width=image_size.width, + image_height=image_size.height, ) return (([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows + @@ -225,12 +212,13 @@ def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], ) -> ProcessorInputs: + target_width, target_height = self._get_image_target_size() num_images = mm_counts.get("image", 0) mm_data = { "image": - self._get_dummy_images(width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + self._get_dummy_images(width=target_width, + height=target_height, num_images=num_images) } @@ -240,7 +228,6 @@ def _get_dummy_mm_inputs( ) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_fuyu_image_tokens) @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 34dc7fa31ce6f..808e61edb6fb4 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -119,6 +119,12 @@ def get_max_llava_image_tokens(ctx: InputContext): class LlavaMultiModalProcessor(BaseMultiModalProcessor): + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return {"image": get_max_llava_image_tokens(self.ctx)} + def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]: return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor)) @@ -324,7 +330,6 @@ def init_vision_tower_for_llava( raise NotImplementedError(msg) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) @MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): # BitandBytes specific attributes @@ -649,7 +654,6 @@ def get_replacement_mantis(item_idx: int): # To use this model, please use # `--hf_overrides '{"architectures": ["MantisForConditionalGeneration"]}'` -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) @MULTIMODAL_REGISTRY.register_processor(MantisMultiModalProcessor) class MantisForConditionalGeneration(LlavaForConditionalGeneration): pass diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 15362db6cdfbf..d855e7d2d36f8 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -23,7 +23,6 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import InputContext from vllm.logger import init_logger from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -306,24 +305,31 @@ def add_image_newline(self, image_features_hd): return image_features_hd_newline -def get_max_phi3v_image_tokens( - ctx: InputContext, - *, - num_crops: Optional[int] = None, -) -> int: - hf_processor_mm_kwargs = {} - if num_crops: - hf_processor_mm_kwargs["num_crops"] = num_crops +class Phi3VMultiModalProcessor(BaseMultiModalProcessor): - processor = ctx.get_hf_processor(**hf_processor_mm_kwargs) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} - return processor.calc_num_image_tokens_from_image_size( - width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - ) + def _get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + processor = self._get_hf_processor() + + return processor.calc_num_image_tokens_from_image_size( # type: ignore + width=image_width, + height=image_height, + ) + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + max_image_tokens = self._get_num_image_tokens( + image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, + image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + ) -class Phi3VMultiModalProcessor(BaseMultiModalProcessor): + return {"image": max_image_tokens} def _get_hf_processor( self, @@ -332,6 +338,7 @@ def _get_hf_processor( ) -> ProcessorMixin: if num_crops is not None: return self.ctx.get_hf_processor(num_crops=num_crops) + return self.ctx.get_hf_processor() def _call_hf_processor( @@ -375,7 +382,6 @@ def _get_prompt_replacements( ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() image_tokens: list[str] = hf_processor.img_tokens # type: ignore - image_processor = hf_processor.image_processor # type: ignore tokenizer = self._get_tokenizer() bos_token_id = tokenizer.bos_token_id @@ -385,9 +391,9 @@ def get_replacement_phi3v(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) - num_tokens = image_processor.calc_num_image_tokens_from_image_size( - width=image_size.width, - height=image_size.height, + num_tokens = self._get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, ) return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id] @@ -467,7 +473,6 @@ def apply( return result -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_phi3v_image_tokens) @MULTIMODAL_REGISTRY.register_processor(Phi3VMultiModalProcessor) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): hf_to_vllm_mapper = WeightsMapper( diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index de55bc6bcc123..d050fd060353a 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -33,13 +33,12 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import InputContext from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import MultiModalDataParser +from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -80,14 +79,17 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): return feat_lengths, output_lengths -def get_max_qwen2_audio_audio_tokens(ctx: InputContext) -> int: - hf_config = ctx.get_hf_config(Qwen2AudioConfig) - max_source_position = hf_config.audio_config.max_source_positions - output_lengths = (max_source_position - 2) // 2 + 1 - return output_lengths +class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"audio": None} -class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) + max_source_positions = hf_config.audio_config.max_source_positions + max_output_lengths = (max_source_positions - 2) // 2 + 1 + + return {"audio": max_output_lengths} def _get_hf_processor( self, @@ -157,11 +159,21 @@ def _get_prompt_replacements( audio_output_lengths = [] else: assert isinstance(feature_attention_mask, torch.Tensor) - _, audio_output_lengths = _get_feat_extract_output_lengths( + _, audio_output_lens = _get_feat_extract_output_lengths( feature_attention_mask.sum(-1)) + audio_output_lengths = audio_output_lens.tolist() + def get_replacement_qwen2_audio(item_idx: int): - return [placeholder] * audio_output_lengths[item_idx] + num_placeholders = audio_output_lengths[item_idx] + if num_placeholders == 0: + audios = mm_items.get_items("audio", AudioProcessorItems) + audio = audios.get(item_idx) + raise ValueError( + f"The audio {audio} (len={len(audio)}) is too short " + "to be represented inside the model") + + return [placeholder] * num_placeholders return [ PromptReplacement( @@ -171,6 +183,14 @@ def get_replacement_qwen2_audio(item_idx: int): ) ] + def _always_apply_prompt_replacements(self) -> bool: + # HF never applies prompt replacements, so we have to do it ourselves + # _find_placeholders may incorrectly think that HF has already performed + # processing for multi-audio input when the input audios are short + # (the corresponding placeholders may take up fewer tokens than + # the number of audio items) + return True + def _get_dummy_mm_inputs( self, mm_counts: Mapping[str, int], @@ -192,8 +212,6 @@ def _get_dummy_mm_inputs( ) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "audio", get_max_qwen2_audio_audio_tokens) @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor) class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0df101b3dcce4..26b6d768ad4f6 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -40,7 +40,6 @@ from vllm.config import VllmConfig from vllm.distributed import parallel_state from vllm.distributed import utils as dist_utils -from vllm.inputs import InputContext from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.activation import QuickGELU @@ -650,8 +649,9 @@ def _get_vision_info( width: int, min_pixels: int, max_pixels: int, + *, do_resize: bool = True, - data_type_key: str = "image", + modality: str = "image", mm_count: int = 1, ): """Get information (resized height / width and number of vision tokens) @@ -671,11 +671,12 @@ def _get_vision_info( else: resized_height, resized_width = height, width - if data_type_key == "image": + if modality == "image": grid_t = mm_count - else: - assert data_type_key == "video" + elif modality == "video": grid_t = max(mm_count // temporal_patch_size, 1) + else: + raise ValueError(f"Modality {modality} is not supported") grid_h = resized_height // patch_size grid_w = resized_width // patch_size @@ -691,41 +692,11 @@ def _get_image_processor(hf_processor: Qwen2VLProcessor): return image_processor -def get_max_qwen2_vl_mm_tokens(ctx: InputContext, - data_type_key: str, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None) -> int: - hf_config = ctx.get_hf_config(Qwen2VLConfig) - vision_config = hf_config.vision_config - - hf_processor = ctx.get_hf_processor(Qwen2VLProcessor) - image_processor = _get_image_processor(hf_processor) - - _, _, max_llm_image_tokens = _get_vision_info( - vision_config, - height=9999999, - width=9999999, - min_pixels=min_pixels or image_processor.min_pixels, - max_pixels=max_pixels or image_processor.max_pixels, - data_type_key=data_type_key, - ) - return max_llm_image_tokens - - -get_max_qwen2_vl_image_tokens = partial(get_max_qwen2_vl_mm_tokens, - data_type_key="image") -get_max_qwen2_vl_video_tokens = partial(get_max_qwen2_vl_mm_tokens, - data_type_key="video") - - class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], dict[str, torch.Tensor]]): def __init__(self, data: dict, modality: str) -> None: - super().__init__(data) - - self.modality = modality + super().__init__(data, modality) grid_thw = data[f"{modality}_grid_thw"] slice_idxs = [0] + grid_thw.prod(-1).cumsum_(0).tolist() @@ -734,9 +705,6 @@ def __init__(self, data: dict, modality: str) -> None: for i in range(len(grid_thw)) ] - def __repr__(self) -> str: - return (f"{type(self).__name__}(modality={self.modality!r})") - def get_count(self) -> int: return len(self.data[f"{self.modality}_grid_thw"]) @@ -792,6 +760,32 @@ def _parse_video_data( class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def _get_max_mm_tokens(self, modality: str) -> int: + hf_config = self.ctx.get_hf_config(Qwen2VLConfig) + vision_config = hf_config.vision_config + + hf_processor = self._get_hf_processor() + image_processor = _get_image_processor(hf_processor) + + _, _, max_llm_image_tokens = _get_vision_info( + vision_config, + height=9999999, + width=9999999, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + modality=modality, + ) + return max_llm_image_tokens + + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + return { + "image": self._get_max_mm_tokens("image"), + "video": self._get_max_mm_tokens("video"), + } + def _get_data_parser(self) -> MultiModalDataParser: return Qwen2MultiModalDataParser() @@ -908,9 +902,6 @@ def _get_dummy_mm_inputs( ) -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_qwen2_vl_image_tokens) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "video", get_max_qwen2_vl_video_tokens) @MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsLoRA, SupportsPP): diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 54be7fed3f2be..0b83684c9bac5 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -2,7 +2,7 @@ """PyTorch Ultravox model.""" import math -from functools import cached_property, lru_cache +from functools import cached_property from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) @@ -17,7 +17,6 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import InputContext from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -58,22 +57,17 @@ class UltravoxAudioEmbeddingInputs(TypedDict): UltravoxAudioEmbeddingInputs] -@lru_cache -def cached_feature_extractor(model_id: str) -> WhisperFeatureExtractor: - return WhisperFeatureExtractor.from_pretrained(model_id) - - -def whisper_feature_extractor(ctx: InputContext) -> WhisperFeatureExtractor: - hf_config = ctx.get_hf_config(UltravoxConfig) - return cached_feature_extractor(hf_config.audio_model_id) - +class UltravoxMultiModalProcessor(BaseMultiModalProcessor): -def get_ultravox_max_audio_tokens(ctx: InputContext): - feature_extractor = whisper_feature_extractor(ctx) - return math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"audio": None} + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + feature_extractor = self._get_feature_extractor() + max_audio_tokens = math.ceil(feature_extractor.chunk_length * + _AUDIO_TOKENS_PER_SECOND) -class UltravoxMultiModalProcessor(BaseMultiModalProcessor): + return {"audio": max_audio_tokens} def _get_hf_processor( self, @@ -322,8 +316,6 @@ def forward( return hidden_states -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "audio", get_ultravox_max_audio_tokens) @MULTIMODAL_REGISTRY.register_processor(UltravoxMultiModalProcessor) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index da111e999ebb8..4e1b78ab2c59d 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -21,10 +21,15 @@ class ModalityDataItems(ABC, Generic[_T, _I]): - def __init__(self, data: _T) -> None: + def __init__(self, data: _T, modality: str) -> None: super().__init__() self.data = data + self.modality = modality + + def __repr__(self) -> str: + return (f"{type(self).__name__}(modality={self.modality!r}, " + f"len={len(self)})") def __len__(self) -> int: return self.get_count() @@ -64,14 +69,6 @@ def get_passthrough_data(self) -> Mapping[str, object]: class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]): - def __init__(self, data: Sequence[_T], modality: str) -> None: - super().__init__(data) - - self.modality = modality - - def __repr__(self) -> str: - return (f"{type(self).__name__}(modality={self.modality!r})") - def get_count(self) -> int: return len(self.data) @@ -87,14 +84,6 @@ def get_passthrough_data(self) -> Mapping[str, object]: class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): - def __init__(self, data: NestedTensors, modality: str) -> None: - super().__init__(data) - - self.modality = modality - - def __repr__(self) -> str: - return (f"{type(self).__name__}(modality={self.modality!r})") - def get_count(self) -> int: return len(self.data) @@ -222,22 +211,13 @@ class MultiModalDataParser: Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`. Args: - max_mm_counts (Mapping[str, int]): The maximum allowed number of items - belonging to each modality. This effectively sets a hard limit over - `--limit-mm-per-prompt`. target_sr (float, optional): Enables automatic resampling of audio items to the model's expected sampling rate. """ - def __init__( - self, - *, - max_mm_counts: Mapping[str, int] = {}, - target_sr: Optional[float] = None, - ) -> None: + def __init__(self, *, target_sr: Optional[float] = None) -> None: super().__init__() - self.max_mm_counts = max_mm_counts self.target_sr = target_sr def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]: @@ -345,7 +325,6 @@ def _get_subparsers(self) -> Mapping[str, ModalityDataParser]: def parse_mm_data(self, mm_data: MultiModalDataDict) -> MultiModalDataItems: - max_mm_counts = self.max_mm_counts subparsers = self._get_subparsers() mm_items = MultiModalDataItems() @@ -353,16 +332,6 @@ def parse_mm_data(self, if k not in subparsers: raise ValueError(f"Unsupported modality: {k}") - modality_items = subparsers[k](v) - - if k in max_mm_counts: - max_count = max_mm_counts[k] - if len(modality_items) > max_count: - raise ValueError( - f"This model supports at most {max_count} {k} items " - f"per prompt, but {len(modality_items)} {k} items " - "were given or set as its limit_mm_per_prompt.") - - mm_items[k] = modality_items + mm_items[k] = subparsers[k](v) return mm_items diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 7712c3bcebe20..76475ddda81f4 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -624,6 +624,29 @@ def __call__( ) -> MultiModalInputsV2: return self.apply(prompt, mm_data, hf_processor_mm_kwargs) + @abstractmethod + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + """ + Return the maximum supported number of items for each modality. + + A value of `None` means unlimited number of items. + + Omitting a modality from the returned dictionary means that + it is not supported at all. + """ + raise NotImplementedError + + @abstractmethod + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + """ + Get the maximum possible number of tokens per data item + for each modality. + + The dictionary returned by this method should have the same + keys as that returned by :meth:`get_supported_mm_limits`. + """ + raise NotImplementedError + def _get_data_parser(self) -> MultiModalDataParser: """ Construct a data parser to preprocess multi-modal data items @@ -653,7 +676,18 @@ def _to_mm_items( before passing them to :meth:`_get_hf_mm_data`. """ parser = self._get_data_parser() - return parser.parse_mm_data(mm_data) + mm_items = parser.parse_mm_data(mm_data) + + mm_limits = self.ctx.get_mm_config().limit_per_prompt + for modality, items in mm_items.items(): + limit = mm_limits.get(modality, 1) + if len(items) > limit: + raise ValueError( + f"You set {modality}={limit} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but passed {len(items)} " + f"{modality} items in the same prompt.") + + return mm_items @abstractmethod def _get_mm_fields_config( @@ -901,6 +935,17 @@ def _bind_prompt_replacements( return [prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls] + def _always_apply_prompt_replacements(self) -> bool: + """ + A flag which can be overridden so that + :meth:`_apply_prompt_replacements` is always called even if we + detect that HF has performed processing via :meth:`_find_placeholders`. + + This is useful in cases where :meth:`_find_placeholders` cannot be + reliably used to detect whether HF has performed processing or not. + """ + return False + def _apply_prompt_replacements( self, token_ids: list[int], @@ -995,7 +1040,7 @@ def apply( all_placeholders = self._find_placeholders(prompt_repls, prompt_ids, mm_item_counts) - if all_placeholders: + if all_placeholders and not self._always_apply_prompt_replacements(): tokenizer = self._get_tokenizer() prompt_text = _decode(tokenizer, prompt_ids) else: @@ -1009,10 +1054,27 @@ def apply( mm_item_counts, ) - mm_placeholders = { - modality: [item.to_range() for item in items] - for modality, items in full_groupby_modality(all_placeholders) - } + mm_placeholders = dict[str, list[PlaceholderRange]]() + err_suffix = ("This suggests a problem with your implementation of " + "the merged multi-modal processor for this model, " + "particularly in the `_get_prompt_replacements` method.") + + for modality, placeholders in full_groupby_modality(all_placeholders): + if modality not in mm_items: + raise AssertionError( + f"Expected no placeholders for {modality=}, " + f"but found {placeholders=}. Input items: {mm_items}" + f"\n{err_suffix}") + + if len(placeholders) != len(mm_items[modality]): + raise AssertionError( + f"Expected length of {placeholders=} for {modality=} " + f"to equal that of input items: {mm_items[modality]}" + f"\n{err_suffix}") + + mm_placeholders[modality] = [ + item.to_range() for item in placeholders + ] return MultiModalInputsV2( type="multimodal", @@ -1063,15 +1125,38 @@ def _get_dummy_mm_inputs( """ raise NotImplementedError - def get_dummy_data( - self, - seq_len: int, - mm_counts: Mapping[str, int], - mm_max_tokens: Mapping[str, int], - ) -> DummyData: + def _get_and_validate_dummy_mm_counts(self) -> Mapping[str, int]: + mm_limit_per_prompt = self.ctx.get_mm_config().limit_per_prompt + supported_mm_limits = self.get_supported_mm_limits() + + mm_limits = { + modality: mm_limit_per_prompt.get(modality, 1) + for modality in supported_mm_limits + } + + for modality, supported_limit in supported_mm_limits.items(): + limit = mm_limits[modality] + if supported_limit is not None and supported_limit < limit: + raise ValueError( + f"You set {modality}={limit} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but this model only supports " + f"at most {supported_limit} {modality} items.") + + return mm_limits + + def get_dummy_data(self, seq_len: int) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData + mm_counts = self._get_and_validate_dummy_mm_counts() + mm_max_tokens_per_item = self.get_mm_max_tokens_per_item() + if mm_counts.keys() != mm_max_tokens_per_item.keys(): + raise AssertionError( + "The keys returned by `get_supported_mm_limits`" + f"({set(mm_counts.keys())}) should be the same as those " + "returned by `get_mm_max_tokens_per_item` " + f"({set(mm_max_tokens_per_item.keys())})") + processor_inputs = self._get_dummy_mm_inputs(mm_counts) mm_inputs = self.apply( prompt_text=processor_inputs.prompt_text, @@ -1087,7 +1172,7 @@ def get_dummy_data( for modality, placeholders in placeholders_by_modality.items() } expected_placeholders_by_modality = { - modality: mm_max_tokens[modality] + modality: mm_max_tokens_per_item[modality] * mm_counts[modality] for modality in placeholders_by_modality } if total_placeholders_by_modality != expected_placeholders_by_modality: diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 3a5e11867ad9e..073d49d7d2009 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -15,6 +15,7 @@ from .image import ImagePlugin from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors from .processing import BaseMultiModalProcessor, ProcessingCache +from .utils import cached_get_tokenizer from .video import VideoPlugin if TYPE_CHECKING: @@ -219,6 +220,10 @@ def get_max_tokens_per_item_by_modality( Note: This is currently directly used only in V1. """ + if self.has_processor(model_config): + tokenizer = cached_get_tokenizer(model_config.tokenizer) + processor = self.create_processor(model_config, tokenizer) + return processor.get_mm_max_tokens_per_item() return { key: plugin.get_max_multimodal_tokens(model_config) From 23c1b10a4c8cd77c5b13afa9242d67ffd055296b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 2 Jan 2025 17:00:00 +0800 Subject: [PATCH 048/462] [VLM][Bugfix] Multi-modal processor compatible with V1 multi-input (#11674) Signed-off-by: DarkLight1337 --- vllm/multimodal/inputs.py | 252 ++++++++++++++++------------------ vllm/multimodal/processing.py | 45 +++--- vllm/v1/engine/processor.py | 22 ++- 3 files changed, 151 insertions(+), 168 deletions(-) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index db489af7ac475..b0a1104546186 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -2,7 +2,8 @@ from collections import UserDict, defaultdict from collections.abc import Mapping, Sequence from dataclasses import dataclass -from typing import Any, Literal, TypedDict, TypeVar, Union, cast, final +from typing import (Any, Literal, Optional, TypedDict, TypeVar, Union, cast, + final) import numpy as np import torch @@ -11,7 +12,7 @@ from transformers import BatchFeature from typing_extensions import NotRequired, TypeAlias -from vllm.utils import JSONTree, is_list_of, json_map_leaves +from vllm.utils import JSONTree, full_groupby, is_list_of, json_map_leaves _T = TypeVar("_T") @@ -160,11 +161,8 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool: @dataclass(frozen=True) -class MultiModalFieldItem: - """ - Contains metadata and data in :class:`MultiModalKwargs` - corresponding to a data item in :class:`MultiModalDataItems`. - """ +class MultiModalFieldElem: + """Contains metadata and data of an item in :class:`MultiModalKwargs`.""" field: "BaseMultiModalField" data: NestedTensors @@ -186,34 +184,34 @@ class BaseMultiModalField(ABC): def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: raise NotImplementedError - def _build_item(self, data: NestedTensors) -> MultiModalFieldItem: - return MultiModalFieldItem(self, data) + def _build_elem(self, data: NestedTensors) -> MultiModalFieldElem: + return MultiModalFieldElem(self, data) - def reduce(self, batch: list[MultiModalFieldItem]) -> MultiModalFieldItem: - """Merge multiple instances of :class:`MultiModalFieldItem` together.""" + def reduce(self, batch: list[MultiModalFieldElem]) -> MultiModalFieldElem: + """Merge multiple instances of :class:`MultiModalFieldElem` together.""" fields = [item.field for item in batch] if len(set(fields)) > 1: raise ValueError(f"Cannot merge different {fields=}") data = self._reduce_data([item.data for item in batch]) - return self._build_item(data) + return self._build_elem(data) @dataclass(frozen=True) class MultiModalBatchedField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an item is obtained by - directly indexing into the first dimension of the underlying data. + A :class:`BaseMultiModalField` implementation where an element in the batch + is obtained by indexing into the first dimension of the underlying data. """ - def build_items(self, batch: NestedTensors) -> list[MultiModalFieldItem]: - return [self._build_item(item) for item in batch] + def build_elems(self, batch: NestedTensors) -> list[MultiModalFieldElem]: + return [self._build_elem(item) for item in batch] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): first_shape = batch[0].shape - if all(item.shape == first_shape for item in batch): + if all(elem.shape == first_shape for elem in batch): return torch.stack(batch) return batch @@ -222,24 +220,24 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: @dataclass(frozen=True) class MultiModalFlatField(BaseMultiModalField): """ - A :class:`BaseMultiModalField` implementation where an item is obtained by - slicing along the first dimension of the underlying data. + A :class:`BaseMultiModalField` implementation where an element in the batch + is obtained by slicing along the first dimension of the underlying data. """ - def build_items( + def build_elems( self, batch: NestedTensors, slices: Sequence[slice], - ) -> list[MultiModalFieldItem]: - return [self._build_item(batch[slice_]) for slice_ in slices] + ) -> list[MultiModalFieldElem]: + return [self._build_elem(batch[slice_]) for slice_ in slices] def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors: if len(batch) > 0 and is_list_of(batch, torch.Tensor, check="all"): first_shape = batch[0].shape - if all(item.shape[1:] == first_shape[1:] for item in batch): + if all(elem.shape[1:] == first_shape[1:] for elem in batch): return torch.concat(batch) - return [elem for item in batch for elem in item] + return [e for elem in batch for e in elem] class MultiModalFieldConfig: @@ -267,115 +265,111 @@ def __init__( ) -> None: super().__init__() - self._field_cls = field_cls - self._modality = modality - self._field_config = field_config + self.field_cls = field_cls + self.modality = modality + self.field_config = field_config - def build_items( + def build_elems( self, key: str, batch: NestedTensors, - ) -> list[MultiModalFieldItem]: - field = self._field_cls(key=key, modality=self._modality) - return field.build_items(batch, **self._field_config) # type: ignore + ) -> Sequence[MultiModalFieldElem]: + field = self.field_cls(key=key, modality=self.modality) + return field.build_elems(batch, **self.field_config) # type: ignore -class MultiModalKwargs(UserDict[str, NestedTensors]): +class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]): + """ + A collection of :class:`MultiModalFieldElem` + corresponding to a data item in :class:`MultiModalDataItems`. """ - A dictionary that represents the keyword arguments to - :meth:`~torch.nn.Module.forward`. - The metadata :code:`items_by_key` defines how to split batched keyword - arguments corresponding to each data item in :class:`MultiModalDataItems`: + @staticmethod + def from_elems(elems: Sequence[MultiModalFieldElem]): + return MultiModalKwargsItem({elem.field.key: elem for elem in elems}) - - For a keyword argument, we can access the :code:`i` th item in the batch - via :code:`items_by_key[key][i]`. - - We can gather the keyword arguments belonging to a modality by finding - the keys with items that belong to that modality, then accessing - the :code:`i` th item in the batch for each such key. + @property + def modality(self) -> str: + modalities = {elem.field.modality for elem in self.data.values()} + assert len(modalities) == 1, f"Found different modalities={modalities}" + return next(iter(modalities)) - Example: - .. code-block:: python - - # All items belong to the "image" modality - items_by_key={ - "pixel_values": [a, b, c, d], # "image" modality - "image_grid_thw": [e, f, g, h], # "image" modality - "pixel_values_video": [h, i, j], # "video" modality - "video_grid_thw": [k, l, m], # "video" modality - } +# NOTE: UserDict is for V0 compatibility. +# V1 should access individual items via `get_item`. +class MultiModalKwargs(UserDict[str, NestedTensors]): + """ + A dictionary that represents the keyword arguments to + :meth:`~torch.nn.Module.forward`. - - The keyword arguments belonging to the first image are - :code:`{"pixel_values": a, "image_grid_thw": e}`. - - The keyword arguments belonging to the second video are - :code:`{"pixel_values_video": i, "video_grid_thw": l}`. + The metadata :code:`items` enables us to obtain the keyword arguments + corresponding to each data item in :class:`MultiModalDataItems`, via + :meth:`get_item` and :meth:`get_items`. """ @staticmethod def from_hf_inputs( hf_inputs: BatchFeature, config_by_key: Mapping[str, MultiModalFieldConfig], - *, - enable_sanity_checks: bool = False, ): # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key` # We assume that those fields are not used in vLLM - items_by_key = { - key: config.build_items(key, batch) - for key, config in config_by_key.items() - if (batch := hf_inputs.get(key)) is not None - } - - return MultiModalKwargs.from_items_by_key( - items_by_key, - enable_sanity_checks=enable_sanity_checks, - ) + elems_by_key = dict[str, Sequence[MultiModalFieldElem]]() + keys_by_modality = defaultdict[str, set[str]](set) + for key, config in config_by_key.items(): + batch = hf_inputs.get(key) + if batch is not None: + elems = config.build_elems(key, batch) + if len(elems) > 0: + elems_by_key[key] = elems + keys_by_modality[config.modality].add(key) + + items = list[MultiModalKwargsItem]() + for modality, keys in keys_by_modality.items(): + elems_in_modality = {k: elems_by_key[k] for k in keys} + batch_sizes = {k: len(v) for k, v in elems_in_modality.items()} + + if len(set(batch_sizes.values())) > 1: + raise ValueError( + f"Cannot merge different batch sizes for {modality=}! " + f"Found: {batch_sizes=}") + + batch_size = next(iter(batch_sizes.values())) + for item_idx in range(batch_size): + elems = [v[item_idx] for v in elems_in_modality.values()] + items.append(MultiModalKwargsItem.from_elems(elems)) + + return MultiModalKwargs.from_items(items) @staticmethod - def from_items_by_key( - items_by_key: Mapping[str, list[MultiModalFieldItem]], - *, - enable_sanity_checks: bool = False, - ) -> "MultiModalKwargs": + def from_items(items: Sequence[MultiModalKwargsItem]): + """Construct a new :class:`MultiModalKwargs` from multiple items.""" + elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list) + for item in items: + for key, elem in item.items(): + elems_by_key[key].append(elem) + data = { - key: items[0].field.reduce(items).data - for key, items in items_by_key.items() if len(items) > 0 + key: elems[0].field.reduce(elems).data + for key, elems in elems_by_key.items() if len(elems) > 0 } - return MultiModalKwargs(data, - items_by_key=items_by_key, - enable_sanity_checks=enable_sanity_checks) + return MultiModalKwargs(data, items=items) def __init__( self, data: Mapping[str, NestedTensors], *, - items_by_key: Mapping[str, list[MultiModalFieldItem]] = {}, - enable_sanity_checks: bool = False, + items: Optional[Sequence[MultiModalKwargsItem]] = None, ) -> None: super().__init__(data) - # Shallow copy to avoid footgun in case a defaultdict is passed in - self._items_by_key = dict(items_by_key) + items_by_modality = full_groupby(items or [], key=lambda x: x.modality) + self._items_by_modality = dict(items_by_modality) - keys_by_modality = defaultdict[str, set[str]](set) - for key, items in items_by_key.items(): - for item in items: - keys_by_modality[item.field.modality].add(key) - - self._keys_by_modality = dict(keys_by_modality) - - if enable_sanity_checks: - for modality, keys in keys_by_modality.items(): - items_in_modality = {k: items_by_key[k] for k in keys} - batch_sizes = {k: len(v) for k, v in items_in_modality.items()} - batch_size = next(iter(batch_sizes.values()), 0) - assert all(bs == batch_size - for bs in batch_sizes.values()), dict( - modality=modality, - batch_sizes=batch_sizes, - items_by_key=items_by_key) + @property + def modalities(self): + return self._items_by_modality.keys() @staticmethod def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: @@ -452,58 +446,44 @@ def as_kwargs( def __eq__(self, other: object) -> bool: if not isinstance(other, self.__class__): return False - if self._items_by_key != other._items_by_key: + if self._items_by_modality != other._items_by_modality: return False ks = self.keys() return (ks == other.keys() and all(nested_tensors_equal(self[k], other[k]) for k in ks)) - def get_item(self, key: str, item_index: int) -> MultiModalFieldItem: - return self._items_by_key[key][item_index] + def _validate_modality(self, method_name: str, modality: str) -> None: + if not self._items_by_modality: + raise RuntimeError( + f"`{method_name}` is not supported when " + "MultiModalKwargs is not initialized with `items`") - def get_items_by_modality( - self, - modality: str, - item_index: int, - ) -> Mapping[str, MultiModalFieldItem]: - """ - Get the keyword arguments corresponding to an item identified by - its modality and index. - """ - if modality not in self._keys_by_modality: - available_modalities = set(self._keys_by_modality.keys()) + if modality not in self._items_by_modality: + available_modalities = set(self._items_by_modality.keys()) raise KeyError(f"Modality {modality!r} not found. " f"Available modalities: {available_modalities}") - keys_to_gather = self._keys_by_modality[modality] + def get_item_count(self, modality: str) -> int: + """Get the number of items belonging to a modality.""" + self._validate_modality("get_item_count", modality) + return len(self._items_by_modality[modality]) - return { - key: self.get_item(key, item_index) - for key in keys_to_gather if key in self - } + def get_item(self, modality: str, item_index: int) -> MultiModalKwargsItem: + """ + Get the keyword arguments corresponding to an item identified by + its modality and index. + """ + self._validate_modality("get_item", modality) + return self._items_by_modality[modality][item_index] - @staticmethod - def from_items_by_modality( - items_by_modality: Mapping[str, list[Mapping[str, - MultiModalFieldItem]]], - *, - enable_sanity_checks: bool = False, - ) -> "MultiModalKwargs": + def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: """ - Construct a new :class:`MultiModalKwargs` from multiple items returned - by :meth:`get_fields_by_modality`. + Get the keyword arguments corresponding to each item belonging to + a modality. """ - items_by_key = defaultdict[str, list[MultiModalFieldItem]](list) - for fields in items_by_modality.values(): - for field in fields: - for k, v in field.items(): - items_by_key[k].append(v) - - return MultiModalKwargs.from_items_by_key( - items_by_key, - enable_sanity_checks=enable_sanity_checks, - ) + self._validate_modality("get_items", modality) + return self._items_by_modality[modality] MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 76475ddda81f4..64cdacfb4c574 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -20,8 +20,8 @@ from vllm.utils import LRUCache, flatten_2d_lists, full_groupby from .inputs import (MultiModalDataDict, MultiModalFieldConfig, - MultiModalFieldItem, MultiModalInputsV2, MultiModalKwargs, - PlaceholderRange) + MultiModalInputsV2, MultiModalKwargs, + MultiModalKwargsItem, PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser logger = init_logger(__name__) @@ -496,8 +496,7 @@ def __init__(self, capacity: int) -> None: # DEBUG: Set to None to disable self.debug_cache_hit_ratio_steps: Optional[int] = None - self._cache = LRUCache[str, Mapping[str, - MultiModalFieldItem]](capacity) + self._cache = LRUCache[str, MultiModalKwargsItem](capacity) def _maybe_log_cache_stats(self) -> None: steps = self.debug_cache_hit_ratio_steps @@ -565,7 +564,7 @@ def get( modality: str, input_item: object, input_kwargs: Mapping[str, object], - ) -> Optional[Mapping[str, MultiModalFieldItem]]: + ) -> Optional[MultiModalKwargsItem]: """ Get a processed multi-modal item from the cache according to its dependencies, including: @@ -588,7 +587,7 @@ def put( modality: str, input_item: object, input_kwargs: Mapping[str, object], - output_kwargs: Mapping[str, MultiModalFieldItem], + output_kwargs: MultiModalKwargsItem, ) -> None: """ Put a processed multi-modal item into the cache @@ -784,7 +783,6 @@ def _apply_hf_processor( mm_kwargs = MultiModalKwargs.from_hf_inputs( processed_data, self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), - enable_sanity_checks=self.enable_sanity_checks, ) return prompt_ids, mm_kwargs @@ -846,7 +844,7 @@ def _cached_apply_hf_processor( hf_processor_mm_kwargs=hf_processor_mm_kwargs, ) - mm_maybe_cached_field_items = { + mm_maybe_cached_kw_items = { modality: [ cache.get(model_id, modality, item, hf_processor_mm_kwargs) for item in items @@ -855,8 +853,9 @@ def _cached_apply_hf_processor( } mm_missing_idxs = { - modality: [idx for idx, out in enumerate(fields) if out is None] - for modality, fields in mm_maybe_cached_field_items.items() + modality: + [idx for idx, item in enumerate(kw_items) if item is None] + for modality, kw_items in mm_maybe_cached_kw_items.items() } mm_missing_data = { modality: [mm_data_items[modality][idx] for idx in idxs] @@ -875,14 +874,11 @@ def _cached_apply_hf_processor( for modality in mm_missing_data_items } - mm_merged_field_items = dict[str, list[Mapping[str, - MultiModalFieldItem]]]() - for modality, modal_items_lst in mm_maybe_cached_field_items.items(): - merged_modal_items_lst = list[Mapping[str, MultiModalFieldItem]]() - - for idx, modal_items in enumerate(modal_items_lst): - if modal_items is None: - modal_items = mm_missing_kwargs.get_items_by_modality( + merged_kw_items = list[MultiModalKwargsItem]() + for modality, kw_items in mm_maybe_cached_kw_items.items(): + for idx, kw_item in enumerate(kw_items): + if kw_item is None: + kw_item = mm_missing_kwargs.get_item( modality, mm_missing_next_idx[modality], ) @@ -892,14 +888,12 @@ def _cached_apply_hf_processor( modality, mm_data_items[modality][idx], hf_processor_mm_kwargs, - modal_items, + kw_item, ) mm_missing_next_idx[modality] += 1 - merged_modal_items_lst.append(modal_items) - - mm_merged_field_items[modality] = merged_modal_items_lst + merged_kw_items.append(kw_item) if self.enable_sanity_checks: mm_missing_counts = mm_missing_data_items.get_all_counts() @@ -909,10 +903,7 @@ def _cached_apply_hf_processor( mm_missing_next_idx=mm_missing_next_idx, mm_missing_counts=mm_missing_counts) - mm_kwargs = MultiModalKwargs.from_items_by_modality( - mm_merged_field_items, - enable_sanity_checks=self.enable_sanity_checks, - ) + mm_kwargs = MultiModalKwargs.from_items(merged_kw_items) if self.enable_sanity_checks: mm_item_counts = mm_data_items.get_all_counts() @@ -920,7 +911,7 @@ def _cached_apply_hf_processor( for modality, item_count in mm_item_counts.items(): for item_idx in range(item_count): try: - mm_kwargs.get_items_by_modality(modality, item_idx) + mm_kwargs.get_item(modality, item_idx) except Exception as e: # Make it easy to set a breakpoint in the debugger raise e diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5b5a5a61cea7d..905d3d1fc3e1c 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -113,15 +113,27 @@ def process_inputs( # For merged preprocessor, mm_data is already mm_inputs precomputed_mm_inputs = None - if isinstance(decoder_inputs.multi_modal_data, MultiModalKwargs): - precomputed_mm_inputs = [decoder_inputs.multi_modal_data] + decoder_mm_data = decoder_inputs.multi_modal_data + if isinstance(decoder_mm_data, MultiModalKwargs): + # The output of merged multi-modal processor (`decoder_mm_data`) + # contains the kwargs for all items from all modalities. + # This code separates them so that there is one set of kwargs + # per item per modality. + precomputed_mm_inputs = [ + MultiModalKwargs.from_items([item]) + for modality in decoder_mm_data.modalities + for item in decoder_mm_data.get_items(modality) + ] # Apply MM mapper mm_inputs = None - if len(decoder_inputs.multi_modal_data) > 0: + if len(decoder_mm_data) > 0: mm_inputs = self.mm_input_mapper_client.process_inputs( - decoder_inputs.multi_modal_data, mm_hashes, - decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs) + decoder_mm_data, + mm_hashes, + decoder_inputs.mm_processor_kwargs, + precomputed_mm_inputs, + ) return EngineCoreRequest( request_id, From b6087a6beead9165f4c77ceba592b3651bb37de9 Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Thu, 2 Jan 2025 17:18:15 +0100 Subject: [PATCH 049/462] [mypy] Pass type checking in vllm/inputs (#11680) Signed-off-by: Tobias Pitters --- tools/mypy.sh | 1 + vllm/inputs/data.py | 21 +++++++++++---------- vllm/inputs/preprocess.py | 6 +++--- vllm/inputs/registry.py | 2 +- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/tools/mypy.sh b/tools/mypy.sh index 2454ff9fde466..bf95e4c526fd1 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -23,6 +23,7 @@ run_mypy vllm/compilation run_mypy vllm/distributed run_mypy vllm/engine run_mypy vllm/executor +run_mypy vllm/inputs run_mypy vllm/lora run_mypy vllm/model_executor run_mypy vllm/plugins diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index d54cbb5c37819..cdaf6dd76eaa1 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -250,7 +250,7 @@ def prompt(self) -> Optional[str]: if inputs["type"] == "token" or inputs["type"] == "multimodal": return inputs.get("prompt") - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def prompt_token_ids(self) -> List[int]: @@ -259,7 +259,7 @@ def prompt_token_ids(self) -> List[int]: if inputs["type"] == "token" or inputs["type"] == "multimodal": return inputs.get("prompt_token_ids", []) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def token_type_ids(self) -> List[int]: @@ -268,7 +268,7 @@ def token_type_ids(self) -> List[int]: if inputs["type"] == "token" or inputs["type"] == "multimodal": return inputs.get("token_type_ids", []) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def prompt_embeds(self) -> Optional[torch.Tensor]: @@ -277,7 +277,7 @@ def prompt_embeds(self) -> Optional[torch.Tensor]: if inputs["type"] == "token" or inputs["type"] == "multimodal": return None - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def multi_modal_data(self) -> "MultiModalDataDict": @@ -289,7 +289,7 @@ def multi_modal_data(self) -> "MultiModalDataDict": if inputs["type"] == "multimodal": return inputs.get("mm_kwargs", {}) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: @@ -301,7 +301,7 @@ def multi_modal_inputs(self) -> Union[Dict, "MultiModalKwargs"]: if inputs["type"] == "multimodal": return inputs.get("mm_kwargs", {}) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def multi_modal_hashes(self) -> List[str]: @@ -311,9 +311,10 @@ def multi_modal_hashes(self) -> List[str]: return inputs.get("multi_modal_hashes", []) if inputs["type"] == "multimodal": - return inputs.get("mm_hashes", []) + # only the case when we use MultiModalInputsV2 + return inputs.get("mm_hashes", []) # type: ignore[return-value] - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": @@ -325,7 +326,7 @@ def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": if inputs["type"] == "multimodal": return inputs.get("mm_placeholders", {}) - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] @cached_property def mm_processor_kwargs(self) -> Dict[str, Any]: @@ -337,7 +338,7 @@ def mm_processor_kwargs(self) -> Dict[str, Any]: if inputs["type"] == "multimodal": return {} - assert_never(inputs) + assert_never(inputs) # type: ignore[arg-type] ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 3d606817e90aa..aaa10d278ddb0 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -436,7 +436,7 @@ def _build_enc_dec_llm_inputs( or encoder_inputs["type"] == "multimodal"): pass else: - assert_never(encoder_inputs) + assert_never(encoder_inputs) # type: ignore[arg-type] if decoder_inputs is None: dec_token_ids = self._prepare_decoder_input_ids_for_generation( @@ -452,7 +452,7 @@ def _build_enc_dec_llm_inputs( raise ValueError("Multi-modal decoder inputs of encoder-" "decoder models are not supported yet") else: - assert_never(encoder_inputs) + assert_never(encoder_inputs) # type: ignore[arg-type] return EncoderDecoderInputs( encoder=encoder_inputs, @@ -569,7 +569,7 @@ def _build_decoder_only_llm_inputs( prompt_adapter_request=prompt_adapter_request, ) else: - assert_never(prompt_inputs) + assert_never(prompt_inputs) # type: ignore[arg-type] return prompt_inputs diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 090347706ca93..2d9d024e03e80 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -419,7 +419,7 @@ def _ensure_mm_kwargs( # Be more strict in V2 assert "mm_kwargs" in inputs else: - assert_never(inputs["type"]) + assert_never(inputs["type"]) # type: ignore[arg-type] def process_input(self, model_config: "ModelConfig", inputs: ProcessorInputs) -> ProcessorInputs: From 8c38ee7007c50ac5aef9ed43ae91c6f031799c40 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 3 Jan 2025 00:39:27 +0800 Subject: [PATCH 050/462] [VLM] Merged multi-modal processor for LLaVA-NeXT (#11682) Signed-off-by: DarkLight1337 --- .../mm_processor_kwargs/test_llava_next.py | 70 ---- tests/multimodal/test_mapper.py | 118 ------- tests/multimodal/test_processing.py | 97 +++++ .../vllm_add_dummy_model/my_llava.py | 4 +- vllm/model_executor/models/clip.py | 25 ++ vllm/model_executor/models/fuyu.py | 6 +- vllm/model_executor/models/llava.py | 334 +++++++++++------- vllm/model_executor/models/llava_next.py | 321 ++++++----------- vllm/model_executor/models/phi3v.py | 24 +- vllm/model_executor/models/pixtral.py | 66 +++- vllm/model_executor/models/siglip.py | 25 ++ vllm/model_executor/models/utils.py | 2 +- vllm/model_executor/models/vision.py | 52 +++ vllm/multimodal/parse.py | 12 +- 14 files changed, 605 insertions(+), 551 deletions(-) delete mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py delete mode 100644 tests/multimodal/test_mapper.py create mode 100644 vllm/model_executor/models/vision.py diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py deleted file mode 100644 index 51c0085101dd0..0000000000000 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py +++ /dev/null @@ -1,70 +0,0 @@ -import pytest - -from vllm.inputs import InputContext - -from ....utils import build_model_context - - -@pytest.fixture() -def get_max_llava_next_image_tokens(): - from vllm.model_executor.models.llava_next import ( - get_max_llava_next_image_tokens) - return get_max_llava_next_image_tokens - - -@pytest.fixture() -def dummy_data_for_llava_next(): - from vllm.model_executor.models.llava_next import dummy_data_for_llava_next - return dummy_data_for_llava_next - - -@pytest.mark.parametrize("gridpoints,expected_max_tokens", [ - ([[336, 336]], 1176), - ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], 2928), -]) -def test_get_max_llava_next_image_tokens(gridpoints, expected_max_tokens, - get_max_llava_next_image_tokens): - ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") - - # Update the config image_grid_pinpoints - # and calculate the resulting max tokens - ctx.model_config.hf_config.image_grid_pinpoints = gridpoints - - actual_max_tokens = get_max_llava_next_image_tokens( - InputContext(ctx.model_config)) - - assert expected_max_tokens == actual_max_tokens - - -@pytest.mark.parametrize( - "gridpoints,expected_size", - [ - # One point; it has to be the largest - ([[336, 336]], (336, 336)), - # Default for most llava next models; the 2x2 tile is the largest - ([[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]], - (672, 672)), - # If two rectangular gridpoints are the same, the more vertical - # one has the higher feature count due to newline features - ([[336, 672], [672, 336]], (672, 336)) - ]) -def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, - gridpoints, expected_size): - ctx = build_model_context(model_name="llava-hf/llava-v1.6-mistral-7b-hf") - - # Update the config image_grid_pinpoints - ctx.model_config.hf_config.image_grid_pinpoints = gridpoints - seq_len = 5000 # bigger than the max feature size for any image - - dummy_data = dummy_data_for_llava_next( - ctx, - seq_len=seq_len, - mm_counts={"image": 1}, - ) - seq_data = dummy_data.seq_data - mm_data = dummy_data.multi_modal_data - - # The dummy data dims should match the gridpoint with the biggest feat size - assert mm_data["image"].height == expected_size[0] - assert mm_data["image"].width == expected_size[1] - assert len(seq_data.get_token_ids()) >= seq_len diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py deleted file mode 100644 index 81f2a06182bcc..0000000000000 --- a/tests/multimodal/test_mapper.py +++ /dev/null @@ -1,118 +0,0 @@ -from contextlib import nullcontext - -import numpy as np -import pytest -from transformers import LlavaNextImageProcessor - -from vllm.config import ModelConfig -from vllm.multimodal import MultiModalRegistry -from vllm.multimodal.image import rescale_image_size - - -@pytest.fixture -def mm_registry(): - return MultiModalRegistry() - - -@pytest.mark.parametrize("dtype", ["half", "float"]) -@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0]) -def test_llava_next_image_processor(image_assets, mm_registry, dtype, - size_factor): - MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf" - - hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME) - assert isinstance(hf_processor, LlavaNextImageProcessor) - - model_config = ModelConfig( - model=MODEL_NAME, - task="auto", - tokenizer=MODEL_NAME, - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype=dtype, - revision=None, - limit_mm_per_prompt={"image": 1}, - ) - - mm_registry.init_mm_limits_per_prompt(model_config) - - for asset in image_assets: - image = rescale_image_size(asset.pil_image, size_factor) - - hf_result = hf_processor.preprocess( - image, - return_tensors="pt", - ) - vllm_result = mm_registry.map_input( - model_config, - {"image": image}, - ) - - assert hf_result.keys() == vllm_result.keys() - for key, hf_tensor in hf_result.items(): - hf_arr: np.ndarray = hf_tensor.numpy() - vllm_arr: np.ndarray = vllm_result[key].numpy() - - assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}" - assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}" - - -@pytest.mark.parametrize( - ("num_images", "limit", "is_valid"), - [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), - (2, 1, False), (2, 2, True)], -) -def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): - MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf" - - model_config = ModelConfig( - model=MODEL_NAME, - task="auto", - tokenizer=MODEL_NAME, - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype="half", - revision=None, - limit_mm_per_prompt={"image": limit}, - ) - - mm_registry.init_mm_limits_per_prompt(model_config) - - image = image_assets[0].pil_image - if num_images == 0: - mm_inputs = {} - elif num_images == 1: - mm_inputs = {"image": image} - else: - mm_inputs = {"image": [image] * num_images} - - with nullcontext() if is_valid else pytest.raises(ValueError): - mm_registry.map_input(model_config, mm_inputs) - - -# NOTE: We don't test zero images since the HF processor doesn't support it -@pytest.mark.parametrize("num_images", [1, 2]) -def test_image_mapper_multi(image_assets, mm_registry, num_images): - MODEL_NAME = "llava-hf/llava-v1.6-mistral-7b-hf" - - model_config = ModelConfig( - model=MODEL_NAME, - task="auto", - tokenizer=MODEL_NAME, - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype="half", - revision=None, - limit_mm_per_prompt={"image": num_images}, - ) - - mm_registry.init_mm_limits_per_prompt(model_config) - - image = image_assets[0].pil_image - mm_inputs = {"image": [image] * num_images} - - mapped_inputs = mm_registry.map_input(model_config, mm_inputs) - assert len(mapped_inputs["pixel_values"]) == num_images diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 9573351b4dff1..f99d7556b27f9 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1,5 +1,7 @@ +from contextlib import nullcontext from functools import partial from typing import cast +from unittest.mock import MagicMock import numpy as np import pytest @@ -526,6 +528,100 @@ def _rand_audio( return rng.rand(audio_len), sr +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize( + ("limit", "num_supported", "is_valid"), + [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), + (2, 1, False), (2, 2, True)], +) +def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): + limit_mm_per_prompt = {"image": limit} + + model_config = ModelConfig( + model=model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="half", + revision=None, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + + processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + + processor = processor_factory(ctx, cache=None) + + mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) + processor.get_supported_mm_limits = mock_supported_mm_limits + + if is_valid: + exc_ctx = nullcontext() + else: + exc_ctx = pytest.raises(ValueError, match="this model only supports") + + with exc_ctx: + processor._get_and_validate_dummy_mm_counts() + + +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize( + ("num_images", "limit", "is_valid"), + [(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True), + (2, 1, False), (2, 2, True)], +) +def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): + limit_mm_per_prompt = {"image": limit} + + model_config = ModelConfig( + model=model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="half", + revision=None, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) + + processor_factory = MULTIMODAL_REGISTRY._processor_factories[model_cls] + ctx = InputProcessingContext( + model_config, + tokenizer=cached_get_tokenizer(model_config.tokenizer), + ) + + processor = processor_factory(ctx, cache=None) + + rng = np.random.RandomState(0) + image = _rand_img(rng, min_wh=128, max_wh=256) + if num_images == 0: + mm_data = {} + elif num_images == 1: + mm_data = {"image": image} + else: + mm_data = {"image": [image] * num_images} + + if is_valid: + exc_ctx = nullcontext() + else: + exc_ctx = pytest.raises(ValueError, match=f"passed {num_images} image") + + with exc_ctx: + processor.apply( + "" * num_images, + mm_data=mm_data, + hf_processor_mm_kwargs={}, + ) + + def _test_processing_cache_correctness( model_id: str, modalities: dict[str, bool], @@ -631,6 +727,7 @@ def _test_processing_cache_correctness( ("facebook/chameleon-7b", {"image": False}), ("adept/fuyu-8b", {"image": False}), ("llava-hf/llava-1.5-7b-hf", {"image": True}), + ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), ("mistral-community/pixtral-12b", {"image": True}), ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), diff --git a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py index 0d90635093ac7..06dfebbb95527 100644 --- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py +++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py @@ -3,13 +3,11 @@ import torch from vllm.model_executor.models.llava import (LlavaForConditionalGeneration, - LlavaMultiModalProcessor, - get_max_llava_image_tokens) + LlavaMultiModalProcessor) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens) @MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) class MyLlava(LlavaForConditionalGeneration): diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index a5300dfd986f3..0188452054b8c 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -24,6 +24,8 @@ resolve_visual_encoder_outputs) from vllm.sequence import SequenceData +from .vision import VisionEncoderInfo + def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: assert image_size % patch_size == 0 @@ -149,6 +151,29 @@ def input_processor_for_clip( multi_modal_placeholders={"image": ranges}) +class CLIPEncoderInfo(VisionEncoderInfo[CLIPVisionConfig]): + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + return get_clip_image_feature_size(self.vision_config) + + def get_max_image_tokens(self) -> int: + return get_max_clip_image_tokens(self.vision_config) + + def get_num_patches(self) -> int: + return get_clip_patch_grid_length( + image_size=self.vision_config.image_size, + patch_size=self.vision_config.patch_size, + ) + + def get_image_size(self) -> int: + return self.vision_config.image_size + + # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa class CLIPVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7fb8c5d1ab09c..3680d01725238 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -76,7 +76,7 @@ def _get_image_target_size(self) -> ImageSize: return ImageSize(width=target_size["width"], height=target_size["height"]) - def _get_image_grid_size( + def _get_image_feature_grid_size( self, *, image_width: int, @@ -99,7 +99,7 @@ def _get_image_grid_size( def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: target_width, target_height = self._get_image_target_size() - max_ncols, max_nrows = self._get_image_grid_size( + max_ncols, max_nrows = self._get_image_feature_grid_size( image_width=target_width, image_height=target_height, ) @@ -172,7 +172,7 @@ def get_replacement_fuyu(item_idx: int): images = mm_items.get_items("image", ImageProcessorItems) image_size = images.get_image_size(item_idx) - ncols, nrows = self._get_image_grid_size( + ncols, nrows = self._get_image_feature_grid_size( image_width=image_size.width, image_height=image_size.height, ) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 808e61edb6fb4..78de27cd821c6 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,6 +1,7 @@ +from abc import abstractmethod from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set, - Tuple, TypedDict, Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, Union) import torch import torch.nn as nn @@ -12,7 +13,6 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import InputContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -23,23 +23,23 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors) -from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, - PromptReplacement, + InputProcessingContext, + MultiModalDataItems, ProcessingCache, + ProcessorInputs, PromptReplacement, full_groupby_modality) from vllm.sequence import IntermediateTensors -from .clip import (CLIPVisionModel, dummy_image_for_clip, - get_max_clip_image_tokens) +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .pixtral import (PixtralHFVisionModel, dummy_image_for_pixtral_hf, - get_max_pixtral_hf_image_tokens, - get_pixtral_hf_image_feature_size) -from .siglip import (SiglipVisionModel, dummy_image_for_siglip, - get_max_siglip_image_tokens) +from .pixtral import (PixtralHFVisionModel, + get_pixtral_hf_image_feature_grid_size) +from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) +from .vision import vision_encoder_info class LlavaImagePixelInputs(TypedDict): @@ -94,39 +94,167 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -def get_max_llava_image_tokens(ctx: InputContext): - hf_config = ctx.get_hf_config(LlavaConfig) - vision_config = hf_config.vision_config +class LlavaLikeConfig(Protocol): + vision_config: Final[PretrainedConfig] + vision_feature_select_strategy: Final[str] + vision_feature_layer: Final[Union[int, List[int]]] - if isinstance(vision_config, CLIPVisionConfig): - num_image_tokens = get_max_clip_image_tokens(vision_config) - elif isinstance(vision_config, SiglipVisionConfig): - num_image_tokens = get_max_siglip_image_tokens(vision_config) - elif isinstance(vision_config, PixtralVisionConfig): - num_image_tokens = get_max_pixtral_hf_image_tokens(vision_config) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - strategy = hf_config.vision_feature_select_strategy - if strategy == "default": - return num_image_tokens - 1 - elif strategy == "full": - return num_image_tokens - else: - raise ValueError(f"Unexpected select feature strategy: {strategy}") +class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): + def __init__(self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: + super().__init__(ctx, + cache=cache, + enable_sanity_checks=enable_sanity_checks) + + vision_config = self._get_hf_config().vision_config + self._vision_encoder_info = vision_encoder_info(vision_config) -class LlavaMultiModalProcessor(BaseMultiModalProcessor): + @abstractmethod + def _get_hf_config(self) -> LlavaLikeConfig: + raise NotImplementedError def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} + def _apply_feature_select_strategy( + self, + strategy: str, + encoder_num_image_tokens: int, + ) -> int: + if strategy == "default": + return encoder_num_image_tokens - 1 + if strategy == "full": + return encoder_num_image_tokens + + msg = f"Unexpected feature select strategy: {strategy!r}" + raise NotImplementedError(msg) + + def _get_max_image_tokens(self) -> int: + hf_config = self._get_hf_config() + + return self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + self._vision_encoder_info.get_max_image_tokens(), + ) + def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: - return {"image": get_max_llava_image_tokens(self.ctx)} + return {"image": self._get_max_image_tokens()} + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + + def _get_dummy_image_size(self) -> ImageSize: + image_size = self._vision_encoder_info.get_image_size() + return ImageSize(image_size, image_size) + + @abstractmethod + def _get_image_token(self) -> str: + raise NotImplementedError + + def _get_dummy_mm_inputs( + self, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + + image_token = self._get_image_token() + target_width, target_height = self._get_dummy_image_size() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) + + +class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): + + def _get_hf_config(self) -> LlavaConfig: + return self.ctx.get_hf_config(LlavaConfig) + + def _get_hf_processor(self) -> LlavaProcessor: + return self.ctx.get_hf_processor(LlavaProcessor) + + def _get_image_token(self) -> str: + return self._get_hf_processor().image_token + + def _get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + + return self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + self._vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), + ) + + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self._get_hf_config() + image_token_id = hf_config.image_token_index - def _get_hf_processor(self) -> Union[LlavaProcessor, PixtralProcessor]: - return self.ctx.get_hf_processor((LlavaProcessor, PixtralProcessor)) + def get_replacement(item_idx: int): + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + num_image_tokens = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + num_image_tokens = self._get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=get_replacement, + ), + ] + + +class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor): + + def _get_hf_config(self) -> LlavaConfig: + return self.ctx.get_hf_config(LlavaConfig) + + def _get_hf_processor(self) -> PixtralProcessor: + return self.ctx.get_hf_processor(PixtralProcessor) + + def _get_image_token(self) -> str: + return self._get_hf_processor().image_token def _call_hf_processor( self, @@ -140,119 +268,82 @@ def _call_hf_processor( mm_kwargs=mm_kwargs, ) - # NOTE: pixel_values=None for MLlavaProcessor pixel_values = processed_outputs.get("pixel_values") if pixel_values is not None: images = mm_data["images"] assert isinstance(images, list) - if isinstance(self._get_hf_processor(), PixtralProcessor): - # Original output: (1, num_images, C, H, W) - # New output: (num_images, C, H, W) - assert (isinstance(pixel_values, list) - and len(pixel_values) == 1) - assert (isinstance(pixel_values[0], list) - and len(pixel_values[0]) == len(images)) + # Original output: (1, num_images, C, H, W) + # New output: (num_images, C, H, W) + assert (isinstance(pixel_values, list) and len(pixel_values) == 1) + assert (isinstance(pixel_values[0], list) + and len(pixel_values[0]) == len(images)) - processed_outputs["pixel_values"] = pixel_values[0] + processed_outputs["pixel_values"] = pixel_values[0] return processed_outputs - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - ) - def _get_prompt_replacements( self, mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config(LlavaConfig) + hf_config = self._get_hf_config() image_token_id = hf_config.image_token_index processor = self._get_hf_processor() - if isinstance(processor, PixtralProcessor): - image_token = processor.image_token - image_break_token = processor.image_break_token - image_end_token = processor.image_end_token - - vision_config = hf_config.vision_config - assert isinstance(vision_config, PixtralVisionConfig) + image_token = processor.image_token + image_break_token = processor.image_break_token + image_end_token = processor.image_end_token - def get_replacement_pixtral(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - ( - num_width_tokens, - num_height_tokens, - ) = get_pixtral_hf_image_feature_size( - vision_config, - image_width=image_size.width, - image_height=image_size.height, - ) + vision_config = hf_config.vision_config + assert isinstance(vision_config, PixtralVisionConfig) - tokens = ([image_token] * num_width_tokens + - [image_break_token]) * num_height_tokens - tokens[-1] = image_end_token + def get_replacement(item_idx: int): + images = mm_items.get_items("image", ImageProcessorItems) + image_size = images.get_image_size(item_idx) - return "".join(tokens) + ncols, nrows = get_pixtral_hf_image_feature_grid_size( + vision_config, + image_width=image_size.width, + image_height=image_size.height, + ) - return [ - PromptReplacement( - modality="image", - target=[image_token_id], - replacement=get_replacement_pixtral, - ), - ] + tokens = ([image_token] * ncols + [image_break_token]) * nrows + tokens[-1] = image_end_token - max_image_tokens = get_max_llava_image_tokens(self.ctx) + return "".join(tokens) return [ PromptReplacement( modality="image", target=[image_token_id], - replacement=[image_token_id] * max_image_tokens, - ) + replacement=get_replacement, + ), ] - def _get_dummy_mm_inputs( - self, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_config = self.ctx.get_hf_config(LlavaConfig) - vision_config = hf_config.vision_config - num_images = mm_counts.get("image", 0) - - if isinstance(vision_config, CLIPVisionConfig): - data = dummy_image_for_clip(vision_config, num_images) - elif isinstance(vision_config, SiglipVisionConfig): - data = dummy_image_for_siglip(vision_config, num_images) - elif isinstance(vision_config, PixtralVisionConfig): - data = dummy_image_for_pixtral_hf(vision_config, num_images) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - hf_processor = self._get_hf_processor() - image_token = hf_processor.image_token +def _build_llava_or_pixtral_hf_processor( + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True, +) -> BaseLlavaMultiModalProcessor: + hf_config = ctx.get_hf_config(LlavaConfig) - return ProcessorInputs( - prompt_text=image_token * num_images, - mm_data=data, + if isinstance(hf_config.vision_config, PixtralVisionConfig): + return PixtralHFMultiModalProcessor( + ctx, + cache=cache, + enable_sanity_checks=enable_sanity_checks, ) - -class LlavaLikeConfig(Protocol): - vision_config: PretrainedConfig - vision_feature_layer: Union[int, List[int]] + return LlavaMultiModalProcessor( + ctx, + cache=cache, + enable_sanity_checks=enable_sanity_checks, + ) def _get_num_hidden_layers(hf_config: LlavaLikeConfig) -> int: @@ -330,7 +421,7 @@ def init_vision_tower_for_llava( raise NotImplementedError(msg) -@MULTIMODAL_REGISTRY.register_processor(LlavaMultiModalProcessor) +@MULTIMODAL_REGISTRY.register_processor(_build_llava_or_pixtral_hf_processor) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): # BitandBytes specific attributes bitsandbytes_stacked_params_mapping = { @@ -596,7 +687,12 @@ def apply( ) -> MultiModalInputsV2: hf_config = self.ctx.get_hf_config(LlavaConfig) image_token_id = hf_config.image_token_index - max_image_tokens = get_max_llava_image_tokens(self.ctx) + + # Assume that it doesn't depend on the image size + num_image_tokens = self._get_num_image_tokens( + image_width=-1, + image_height=-1, + ) result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs) @@ -609,14 +705,14 @@ def apply( def get_replacement_mantis(item_idx: int): return "".join([ f"(image {item_idx+1}: ", # 7 tokens - "" * max_image_tokens, + "" * num_image_tokens, ")", # 3 tokens ]) mantis_repls = self._bind_prompt_replacements([ PromptReplacement( modality="image", - target=[image_token_id] * max_image_tokens, + target=[image_token_id] * num_image_tokens, replacement=get_replacement_mantis, ) ]) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 5e70c11363c83..24debd1cbf3fe 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -4,31 +4,25 @@ import torch import torch.nn as nn -from PIL import Image -from transformers import CLIPVisionConfig, LlavaNextConfig, SiglipVisionConfig +from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, unpad_image) from typing_extensions import NotRequired from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors +from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors +from vllm.multimodal.parse import ImageSize from vllm.sequence import IntermediateTensors -from vllm.utils import is_list_of -from .clip import (CLIPVisionModel, dummy_image_for_clip, - dummy_seq_data_for_clip, get_clip_image_feature_size, - get_clip_patch_grid_length, input_processor_for_clip) +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import LlavaMultiModalProjector, init_vision_tower_for_llava -from .siglip import (SiglipVisionModel, dummy_image_for_siglip, - dummy_seq_data_for_siglip, get_siglip_image_feature_size, - get_siglip_patch_grid_length, input_processor_for_siglip) +from .llava import (LlavaMultiModalProcessor, LlavaMultiModalProjector, + init_vision_tower_for_llava) +from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, init_vllm_registered_model, maybe_prefix) @@ -65,218 +59,127 @@ class LlavaNextImageEmbeddingInputs(TypedDict): LlavaNextImageEmbeddingInputs] -# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79 -def _get_llava_next_num_unpadded_features( - original_height: int, - original_width: int, - npatches: int, - num_patch_height: int, - num_patch_width: int, -) -> Tuple[int, int]: - current_height = npatches * num_patch_height - current_width = npatches * num_patch_width - - original_aspect_ratio = original_width / original_height - current_aspect_ratio = current_width / current_height - - if original_aspect_ratio > current_aspect_ratio: - scale_factor = current_width / original_width - new_height = int(original_height * scale_factor) - padding = (current_height - new_height) // 2 - current_height -= 2 * padding - else: - scale_factor = current_height / original_height - new_width = int(original_width * scale_factor) - padding = (current_width - new_width) // 2 - current_width -= 2 * padding - - unpadded_features = current_height * current_width - newline_features = current_height - return (unpadded_features, newline_features) - - -# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 -def get_llava_next_image_feature_size( - hf_config: LlavaNextConfig, - *, - input_height: int, - input_width: int, -) -> int: - vision_config = hf_config.vision_config - - if isinstance(vision_config, CLIPVisionConfig): - num_patches = get_clip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, - ) - base_feature_size = get_clip_image_feature_size(vision_config) - elif isinstance(vision_config, SiglipVisionConfig): - num_patches = get_siglip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, - ) - base_feature_size = get_siglip_image_feature_size(vision_config) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - strategy = hf_config.vision_feature_select_strategy - if strategy == "default": - base_feature_size -= 1 - elif strategy == "full": - pass - else: - raise ValueError(f"Unexpected select feature strategy: {strategy}") +class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): - num_patch_height, num_patch_width = get_anyres_image_grid_shape( - image_size=(input_height, input_width), - grid_pinpoints=hf_config.image_grid_pinpoints, - patch_size=vision_config.image_size, - ) - - ( - unpadded_feature_size, - newline_feature_size, - ) = _get_llava_next_num_unpadded_features(input_height, input_width, - num_patches, num_patch_height, - num_patch_width) - - return unpadded_feature_size + newline_feature_size + base_feature_size - - -def get_max_llava_next_image_tokens(ctx: InputContext): - """Compute the max feature size for all possible image grid pinpoints.""" - return _get_pinpoint_with_largest_features(ctx)[0] - - -def _get_pinpoint_with_largest_features( - ctx: InputContext) -> Tuple[int, Tuple[int, int]]: - """Get the grid pinpoint with the largest features & its feature size.""" - hf_config = ctx.get_hf_config(LlavaNextConfig) - largest_feature_size = 0 - largest_feature_pinpoint = None - for (height, width) in hf_config.image_grid_pinpoints: - feat_size = get_llava_next_image_feature_size( - hf_config, - input_height=height, - input_width=width, - ) - if feat_size > largest_feature_size: - largest_feature_size = feat_size - largest_feature_pinpoint = (height, width) - if not largest_feature_size or largest_feature_pinpoint is None: - raise ValueError("Cannot have a largest feature size of 0!") - return largest_feature_size, largest_feature_pinpoint - - -def dummy_data_for_llava_next(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(LlavaNextConfig) - vision_config = hf_config.vision_config - num_images = mm_counts["image"] - - image_feature_size, pinpoint = _get_pinpoint_with_largest_features(ctx) - max_feat_height, max_feat_width = pinpoint - - if isinstance(vision_config, CLIPVisionConfig): - seq_data, ranges = dummy_seq_data_for_clip( - vision_config, - seq_len, - num_images, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) + def _get_hf_config(self) -> LlavaNextConfig: + return self.ctx.get_hf_config(LlavaNextConfig) + + def _get_hf_processor(self) -> LlavaNextProcessor: + return self.ctx.get_hf_processor(LlavaNextProcessor) - mm_data = dummy_image_for_clip( - vision_config, - num_images, - image_width_override=max_feat_width, - image_height_override=max_feat_height, + def _get_image_token(self) -> str: + return self._get_hf_processor().image_token + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), ) - return DummyData(seq_data, mm_data, ranges) - elif isinstance(vision_config, SiglipVisionConfig): - seq_data, ranges = dummy_seq_data_for_siglip( - vision_config, - seq_len, - num_images, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, + def _get_max_image_tokens(self) -> int: + largest_feature_size, _ = self._get_pinpoint_with_most_features() + return largest_feature_size + + def _get_dummy_image_size(self) -> ImageSize: + _, pinpoint = self._get_pinpoint_with_most_features() + return pinpoint + + # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 + def _get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + + base_feature_size = self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + self._vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), ) + num_patches = self._vision_encoder_info.get_num_patches() - mm_data = dummy_image_for_siglip( - vision_config, - num_images, - image_width_override=max_feat_width, - image_height_override=max_feat_height, + num_patch_height, num_patch_width = get_anyres_image_grid_shape( + image_size=(image_height, image_width), + grid_pinpoints=hf_config.image_grid_pinpoints, + patch_size=self._vision_encoder_info.get_image_size(), ) - return DummyData(seq_data, mm_data, ranges) + ( + unpadded_feature_size, + newline_feature_size, + ) = self._get_num_unpadded_features( + original_height=image_height, + original_width=image_width, + npatches=num_patches, + num_patch_height=num_patch_height, + num_patch_width=num_patch_width, + ) - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + return unpadded_feature_size + newline_feature_size + base_feature_size + # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L79 + def _get_num_unpadded_features( + self, + *, + original_height: int, + original_width: int, + npatches: int, + num_patch_height: int, + num_patch_width: int, + ) -> tuple[int, int]: + current_height = npatches * num_patch_height + current_width = npatches * num_patch_width + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + + if original_aspect_ratio > current_aspect_ratio: + scale_factor = current_width / original_width + new_height = int(original_height * scale_factor) + padding = (current_height - new_height) // 2 + current_height -= 2 * padding + else: + scale_factor = current_height / original_height + new_width = int(original_width * scale_factor) + padding = (current_width - new_width) // 2 + current_width -= 2 * padding -def input_processor_for_llava_next(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs + unpadded_features = current_height * current_width + newline_features = current_height + return (unpadded_features, newline_features) - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaNextConfig) - vision_config = hf_config.vision_config + def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: + """ + Get the grid pinpoint with the most features and + the corresponding feature size. + """ + hf_config = self._get_hf_config() - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - width, height = image_data.size + largest_feature_size, largest_feature_pinpoint = 0, None + for (height, width) in hf_config.image_grid_pinpoints: + feat_size = self._get_num_image_tokens(image_width=width, + image_height=height) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) - image_feature_size = get_llava_next_image_feature_size( - hf_config, - input_height=height, - input_width=width, - ) - elif is_list_of(image_data, Image.Image): - image_feature_size = [ - get_llava_next_image_feature_size(hf_config, - input_height=img.height, - input_width=img.width) - for img in image_data - ] - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - elif is_list_of(image_data, torch.Tensor): - image_feature_size = [item.shape[1] for item in image_data] - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - vision_config = hf_config.vision_config - - if isinstance(vision_config, CLIPVisionConfig): - return input_processor_for_clip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return input_processor_for_siglip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, - ) + if largest_feature_size == 0 or largest_feature_pinpoint is None: + raise ValueError("Cannot have a largest feature size of 0!") - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + return largest_feature_size, largest_feature_pinpoint -@MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_next_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next) +@MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor) class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): @@ -507,7 +410,7 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor, def _process_image_pixels( self, inputs: LlavaNextImagePixelInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, ...]]: assert self.vision_tower is not None pixel_values = inputs["data"] diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index d855e7d2d36f8..f2e49d8e4848d 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -34,7 +34,7 @@ from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageProcessorItems +from vllm.multimodal.parse import ImageEmbeddingItems, ImageProcessorItems from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement, @@ -388,15 +388,19 @@ def _get_prompt_replacements( assert isinstance(bos_token_id, int) def get_replacement_phi3v(item_idx: int): - images = mm_items.get_items("image", ImageProcessorItems) - image_size = images.get_image_size(item_idx) - - num_tokens = self._get_num_image_tokens( - image_width=image_size.width, - image_height=image_size.height, - ) - - return [_IMAGE_TOKEN_ID] * num_tokens + [bos_token_id] + images = mm_items.get_items( + "image", (ImageEmbeddingItems, ImageProcessorItems)) + + if isinstance(images, ImageEmbeddingItems): + num_image_tokens = images.get_feature_size(item_idx) + else: + image_size = images.get_image_size(item_idx) + num_image_tokens = self._get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + ) + + return [_IMAGE_TOKEN_ID] * num_image_tokens + [bos_token_id] num_images = mm_items.get_count("image", strict=False) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 2bce13792a88d..d7233bd6028ed 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -38,6 +38,7 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) +from .vision import VisionEncoderInfo try: from xformers import ops as xops @@ -697,10 +698,18 @@ def get_pixtral_hf_patch_grid_length(*, image_size: int, return image_size // patch_size -def get_pixtral_hf_num_patches(*, image_size: int, patch_size: int) -> int: - grid_length = get_pixtral_hf_patch_grid_length(image_size=image_size, - patch_size=patch_size) - return grid_length * grid_length +def get_pixtral_hf_image_feature_size( + *, + image_size: int, + patch_size: int, +) -> int: + grid_length = get_pixtral_hf_patch_grid_length( + image_size=image_size, + patch_size=patch_size, + ) + + # Consider the image_break_token + return (grid_length + 1) * grid_length def get_max_pixtral_hf_image_tokens(hf_config: PixtralVisionConfig) -> int: @@ -730,13 +739,16 @@ def dummy_image_for_pixtral_hf( return {"image": image if num_images == 1 else [image] * num_images} -def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, - image_width: int, - image_height: int) -> Tuple[int, int]: - # Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501 - # https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 # noqa: E501 - max_width, max_height = hf_config.image_size, hf_config.image_size - patch_width, patch_height = hf_config.patch_size, hf_config.patch_size +# Adapted from transformers.models.pixtral.image_processing_pixtral.get_resize_output_image_size # noqa: E501 +# https://github.com/huggingface/transformers/blob/2bd4d5897dc73e8b172832070a6f9e567a0df017/src/transformers/models/pixtral/image_processing_pixtral.py#L180 +def get_pixtral_hf_image_feature_grid_size( + hf_config: PixtralVisionConfig, + *, + image_width: int, + image_height: int, +) -> tuple[int, int]: + max_width = max_height = hf_config.image_size + patch_width = patch_height = hf_config.patch_size ratio = max(image_width / max_width, image_height / max_height) @@ -744,12 +756,38 @@ def get_pixtral_hf_image_feature_size(hf_config: PixtralVisionConfig, image_width = int(math.ceil(image_width / ratio)) image_height = int(math.ceil(image_height / ratio)) - num_height_tokens, num_width_tokens = _get_pixtral_hf_num_image_tokens( + nrows, ncols = _get_pixtral_hf_num_image_tokens( (image_height, image_width), (patch_height, patch_width), - ) + ) # type: ignore + + return ncols, nrows + + +class PixtralHFEncoderInfo(VisionEncoderInfo[PixtralVisionConfig]): + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + return get_pixtral_hf_image_feature_size( + image_size=self.vision_config.image_size, + patch_size=self.get_image_size(), + ) + + def get_max_image_tokens(self) -> int: + return get_max_pixtral_hf_image_tokens(self.vision_config) + + def get_num_patches(self) -> int: + return get_pixtral_hf_patch_grid_length( + image_size=self.vision_config.image_size, + patch_size=self.vision_config.patch_size, + ) - return num_width_tokens, num_height_tokens + def get_image_size(self) -> int: + return self.vision_config.image_size class PixtralHFMLP(nn.Module): diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 6fb9e2cc4584f..115eaaac900e0 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -28,6 +28,8 @@ resolve_visual_encoder_outputs) from vllm.sequence import SequenceData +from .vision import VisionEncoderInfo + def get_siglip_patch_grid_length(*, image_size: int, patch_size: int) -> int: # Since interpolation is applied, the image size need not be divisible @@ -156,6 +158,29 @@ def input_processor_for_siglip( multi_modal_placeholders={"image": ranges}) +class SiglipEncoderInfo(VisionEncoderInfo[SiglipVisionConfig]): + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + return get_siglip_image_feature_size(self.vision_config) + + def get_max_image_tokens(self) -> int: + return get_max_siglip_image_tokens(self.vision_config) + + def get_num_patches(self) -> int: + return get_siglip_patch_grid_length( + image_size=self.vision_config.image_size, + patch_size=self.vision_config.patch_size, + ) + + def get_image_size(self) -> int: + return self.vision_config.image_size + + # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa class SiglipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 269b66806adf4..31017f16d3c97 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -373,7 +373,7 @@ def embed_multimodal( input_ids: torch.Tensor, multimodal_token_id: int, get_text_embeds: Callable[[torch.Tensor], torch.Tensor], - multimodal_embeds: Union[torch.Tensor, List[torch.Tensor]], + multimodal_embeds: NestedTensors, ) -> torch.Tensor: """ Embed token IDs and multimodal inputs and combine their embeddings. diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py new file mode 100644 index 0000000000000..65a773480d2a1 --- /dev/null +++ b/vllm/model_executor/models/vision.py @@ -0,0 +1,52 @@ +from abc import ABC, abstractmethod +from typing import Generic, TypeVar + +from transformers import PretrainedConfig + +_C = TypeVar("_C", bound=PretrainedConfig) + + +class VisionEncoderInfo(ABC, Generic[_C]): + + def __init__(self, vision_config: _C) -> None: + super().__init__() + + self.vision_config = vision_config + + @abstractmethod + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + raise NotImplementedError + + @abstractmethod + def get_max_image_tokens(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_num_patches(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_image_size(self) -> int: + raise NotImplementedError + + +def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: + # Avoid circular imports + from .clip import CLIPEncoderInfo, CLIPVisionConfig + from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig + from .siglip import SiglipEncoderInfo, SiglipVisionConfig + + if isinstance(vision_config, CLIPVisionConfig): + return CLIPEncoderInfo(vision_config) + if isinstance(vision_config, PixtralVisionConfig): + return PixtralHFEncoderInfo(vision_config) + if isinstance(vision_config, SiglipVisionConfig): + return SiglipEncoderInfo(vision_config) + + msg = f"Unsupported vision config: {type(vision_config)}" + raise NotImplementedError(msg) diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 4e1b78ab2c59d..00acb77435163 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod from collections import UserDict from collections.abc import Callable, Iterator, Mapping, Sequence -from typing import TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar +from typing import (TYPE_CHECKING, Any, Generic, NamedTuple, Optional, TypeVar, + Union) import numpy as np import torch @@ -87,7 +88,7 @@ class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]): def get_count(self) -> int: return len(self.data) - def get(self, index: int) -> object: + def get(self, index: int) -> torch.Tensor: return self.data[index] def get_processor_data(self) -> Mapping[str, object]: @@ -96,6 +97,9 @@ def get_processor_data(self) -> Mapping[str, object]: def get_passthrough_data(self) -> Mapping[str, object]: return {f"{self.modality}_embeds": self.data} + def get_feature_size(self, item_idx: int) -> int: + return len(self.get(item_idx)) + class AudioProcessorItems(ProcessorBatchItems[HfAudioItem]): @@ -182,7 +186,7 @@ def get_all_counts(self) -> Mapping[str, int]: def get_items( self, modality: str, - typ: type[_D], + typ: Union[type[_D], tuple[type[_D], ...]], ) -> _D: """ Get the data items belonging to a modality, @@ -199,7 +203,7 @@ def get_items( f"Expected type: {typ}, but " f"found type: {type(items)}") - return items + return items # type: ignore[return-value] ModalityDataParser: TypeAlias = Callable[[ModalityData[Any]], From 84c35c374a8fd3d10559ef220793fea6c5497cf2 Mon Sep 17 00:00:00 2001 From: Chunyang Wen Date: Fri, 3 Jan 2025 02:14:16 +0800 Subject: [PATCH 051/462] According to vllm.EngineArgs, the name should be distributed_executor_backend (#11689) --- docs/source/serving/distributed_serving.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/distributed_serving.md b/docs/source/serving/distributed_serving.md index 7446b7c84cf46..a1dd0e89e8c79 100644 --- a/docs/source/serving/distributed_serving.md +++ b/docs/source/serving/distributed_serving.md @@ -22,7 +22,7 @@ There is one edge case: if the model fits in a single node with multiple GPUs, b vLLM supports distributed tensor-parallel and pipeline-parallel inference and serving. Currently, we support [Megatron-LM's tensor parallel algorithm](https://arxiv.org/pdf/1909.08053.pdf). We manage the distributed runtime with either [Ray](https://github.com/ray-project/ray) or python native multiprocessing. Multiprocessing can be used when deploying on a single node, multi-node inferencing currently requires Ray. -Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed-executor-backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. +Multiprocessing will be used by default when not running in a Ray placement group and if there are sufficient GPUs available on the same node for the configured {code}`tensor_parallel_size`, otherwise Ray will be used. This default can be overridden via the {code}`LLM` class {code}`distributed_executor_backend` argument or {code}`--distributed-executor-backend` API server argument. Set it to {code}`mp` for multiprocessing or {code}`ray` for Ray. It's not required for Ray to be installed for the multiprocessing case. To run multi-GPU inference with the {code}`LLM` class, set the {code}`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: From 2f385183f35497e030ef22c9820d83b83bc4f6db Mon Sep 17 00:00:00 2001 From: Kathy Yu <143133934+kathyyu-google@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:28:09 -0800 Subject: [PATCH 052/462] [Bugfix] Free cross attention block table for preempted-for-recompute sequence group. (#10013) Signed-off-by: Kathy Yu --- vllm/core/scheduler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index c3bc6becf0995..b3d396f9cedda 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1579,6 +1579,7 @@ def _preempt_by_recompute( seq.status = SequenceStatus.WAITING self.free_seq(seq) seq.reset_state_for_recompute() + self._free_seq_group_cross_attn_blocks(seq_group) def _preempt_by_swap( self, From b55ed6ef8ab0dce7fb0f79ff292dafdb4d22610c Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 3 Jan 2025 04:04:58 +0900 Subject: [PATCH 053/462] [V1][Minor] Optimize token_ids_cpu copy (#11692) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_input_batch.py | 13 ++++++++----- vllm/v1/worker/gpu_model_runner.py | 1 + 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index e79145300fe06..f8a1427c6c26c 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -66,8 +66,9 @@ def __init__( pin_memory=False, ) self.token_ids_cpu = self.token_ids_cpu_tensor.numpy() - self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) + self.num_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) + self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) # Attention-related. self.block_table = torch.zeros( @@ -189,6 +190,7 @@ def add_request( end_idx = start_idx + len(request.output_token_ids) self.token_ids_cpu[req_index, start_idx:end_idx] = request.output_token_ids + self.num_tokens[req_index] = request.num_tokens self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens num_blocks = len(request.block_ids) @@ -290,14 +292,15 @@ def condense(self, empty_req_indices: List[int]) -> None: self.req_ids[last_req_index] = None self.req_id_to_index[req_id] = empty_index - # TODO(woosuk): Optimize the copy of token_ids_cpu and - # block_table_cpu. - self.token_ids_cpu[empty_index] = self.token_ids_cpu[ - last_req_index] + num_tokens = self.num_tokens[last_req_index] + self.token_ids_cpu[empty_index, :num_tokens] = self.token_ids_cpu[ + last_req_index, :num_tokens] + self.num_tokens[empty_index] = num_tokens self.num_prompt_tokens[empty_index] = \ self.num_prompt_tokens[last_req_index] self.num_computed_tokens_cpu[ empty_index] = self.num_computed_tokens_cpu[last_req_index] + # TODO(woosuk): Optimize the copy of block_table_cpu. self.block_table_cpu[empty_index] = self.block_table_cpu[ last_req_index] self.temperature_cpu[empty_index] = self.temperature_cpu[ diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 995de54e8e0a0..75098b0330ac9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -644,6 +644,7 @@ def execute_model( # Append the sampled token to the output token ids. token_id = sampled_token_ids[i] self.input_batch.token_ids_cpu[i, seq_len] = token_id + self.input_batch.num_tokens[i] += 1 req_state.output_token_ids.append(token_id) else: # Ignore the sampled token from the partial request. From 187e32997cdc20bbed5c21d3cef2609ab8ed9080 Mon Sep 17 00:00:00 2001 From: bjmsong Date: Fri, 3 Jan 2025 05:11:39 +0800 Subject: [PATCH 054/462] [Bugfix] Change kv scaling factor by param json on nvidia gpu (#11688) Signed-off-by: bjmsong Co-authored-by: bjmsong --- vllm/model_executor/models/exaone.py | 5 +++-- vllm/model_executor/models/granite.py | 5 +++-- vllm/model_executor/models/llama.py | 5 +++-- vllm/model_executor/models/solar.py | 5 +++-- vllm/worker/model_runner.py | 3 ++- 5 files changed, 14 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 0398f0943a70a..8324a563edd64 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -606,8 +606,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - if hasattr(layer_self_attn, "kv_scale"): - layer_self_attn.attn._kv_scale = scaling_factor + if hasattr(layer_self_attn.attn, "_k_scale"): + layer_self_attn.attn._k_scale = scaling_factor + layer_self_attn.attn._v_scale = scaling_factor else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index f9e0443b9a508..a91ed4158a73f 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -545,8 +545,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - if hasattr(layer_self_attn, "kv_scale"): - layer_self_attn.attn._kv_scale = scaling_factor + if hasattr(layer_self_attn.attn, "_k_scale"): + layer_self_attn.attn._k_scale = scaling_factor + layer_self_attn.attn._v_scale = scaling_factor else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2902e6999c2fd..8623da99574bb 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -452,8 +452,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - if hasattr(layer_self_attn, "kv_scale"): - layer_self_attn.attn._kv_scale = scaling_factor + if hasattr(layer_self_attn.attn, "_k_scale"): + layer_self_attn.attn._k_scale = scaling_factor + layer_self_attn.attn._v_scale = scaling_factor else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index caae0b65d7d10..a7cf65a0e36e4 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -565,8 +565,9 @@ def load_kv_cache_scales(self, quantization_param_path: str) -> None: # which is consistent with the practice of setting # scaling_factor = tensor_amax / FPtype_max scaling_factor *= 2 - if hasattr(layer_self_attn, "kv_scale"): - layer_self_attn.attn._kv_scale = scaling_factor + if hasattr(layer_self_attn.attn, "_k_scale"): + layer_self_attn.attn._k_scale = scaling_factor + layer_self_attn.attn._v_scale = scaling_factor else: raise RuntimeError("Self attention has no KV cache scaling " "factor attribute!") diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2b545d1b28bd2..637fba23611f4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1136,7 +1136,8 @@ def load_model(self) -> None: self.prompt_adapter_manager.create_prompt_adapter_manager( self.model)) - if self.kv_cache_dtype == "fp8" and current_platform.is_rocm(): + if self.kv_cache_dtype == "fp8" and (current_platform.is_rocm() + or current_platform.is_cuda()): # Currently only ROCm accepts kv-cache scaling factors # via quantization_param_path and this will be deprecated # in the future. From 5dba2575065f5e27d468f2776e3d460a21d916e6 Mon Sep 17 00:00:00 2001 From: wchen61 Date: Fri, 3 Jan 2025 06:58:56 +0800 Subject: [PATCH 055/462] Resolve race conditions in Marlin kernel (#11493) Signed-off-by: wchen61 --- csrc/quantization/gptq_marlin/gptq_marlin.cu | 40 ++++++++++---------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu index 0c698ced7713d..04ef842fbdf95 100644 --- a/csrc/quantization/gptq_marlin/gptq_marlin.cu +++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu @@ -834,6 +834,7 @@ __global__ void Marlin( int4* sh_g_idx = sh_b + (stages * b_sh_stage); int4* sh_zp = sh_g_idx + (stages * g_idx_stage); int4* sh_s = sh_zp + (stages * zp_sh_stage); + int4* sh_red = sh_s + (stages * s_sh_stage); // Register storage for double buffer of shared memory reads. FragA frag_a[2][thread_m_blocks]; @@ -932,11 +933,11 @@ __global__ void Marlin( int4* sh_s_stage = sh_s + s_sh_stage * pipe; if constexpr (group_blocks >= thread_k_blocks) { + if (s_sh_wr_pred) { + cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); + } // Only fetch scales if this tile starts a new group - if (pipe % (group_blocks / thread_k_blocks) == 0) { - if (s_sh_wr_pred) { - cp_async4(&sh_s_stage[s_sh_wr], &scales_ptr[s_gl_rd]); - } + if ((pipe + 1) % (group_blocks / thread_k_blocks) == 0) { s_gl_rd += s_gl_rd_delta; } } else { @@ -1038,9 +1039,7 @@ __global__ void Marlin( // No act-order case if constexpr (group_blocks != -1) { if constexpr (group_blocks >= thread_k_blocks) { - int4* sh_s_stage = - sh_s + s_sh_stage * ((group_blocks / thread_k_blocks) * - (pipe / (group_blocks / thread_k_blocks))); + int4* sh_s_stage = sh_s + s_sh_stage * pipe; reinterpret_cast(&frag_s[k % 2])[0] = sh_s_stage[s_sh_rd]; } else { int warp_id = threadIdx.x / 32; @@ -1339,15 +1338,15 @@ __global__ void Marlin( int red_sh_wr = red_sh_delta * j + (red_sh_rd - red_sh_stride * i); if (i < red_off) { - float* c_rd = - reinterpret_cast(&sh[red_sh_delta * j + red_sh_rd]); - float* c_wr = reinterpret_cast(&sh[red_sh_wr]); + float* c_rd = reinterpret_cast( + &sh_red[red_sh_delta * j + red_sh_rd]); + float* c_wr = reinterpret_cast(&sh_red[red_sh_wr]); #pragma unroll for (int k = 0; k < 4; k++) reinterpret_cast(frag_c)[4 * 2 * m_block + j][k] += c_rd[k] + c_wr[k]; } - sh[red_sh_wr] = + sh_red[red_sh_wr] = reinterpret_cast(&frag_c)[4 * 2 * m_block + j]; } } @@ -1357,7 +1356,7 @@ __global__ void Marlin( #pragma unroll for (int i = 0; i < 4 * 2; i++) { float* c_rd = - reinterpret_cast(&sh[red_sh_delta * i + red_sh_rd]); + reinterpret_cast(&sh_red[red_sh_delta * i + red_sh_rd]); #pragma unroll for (int j = 0; j < 4; j++) reinterpret_cast(frag_c)[4 * 2 * m_block + i][j] += @@ -1397,7 +1396,7 @@ __global__ void Marlin( #pragma unroll for (int i = 0; i < thread_m_blocks * 4; i++) { cp_async4_pred( - &sh[c_sh_wr + c_sh_wr_delta * i], + &sh_red[c_sh_wr + c_sh_wr_delta * i], &C[c_gl_wr + c_gl_wr_delta_o * (i / 2) + c_gl_wr_delta_i * (i % 2)], i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m); @@ -1410,7 +1409,7 @@ __global__ void Marlin( for (int i = 0; i < thread_m_blocks * 4; i++) { if (i < (thread_m_blocks - 1) * 4 || 8 * (i / 2) + row < prob_m) { if (!first) { - int4 c_red = sh[c_sh_wr + i * c_sh_wr_delta]; + int4 c_red = sh_red[c_sh_wr + i * c_sh_wr_delta]; #pragma unroll for (int j = 0; j < 2 * 4; j++) { reinterpret_cast( @@ -1461,10 +1460,10 @@ __global__ void Marlin( float* frag_c_ptr = reinterpret_cast(&frag_c); #pragma unroll for (int k = 0; k < th_size; k++) { - sh[threadIdx.x] = + sh_red[threadIdx.x] = C_tmp[c_cur_offset + active_threads * k + threadIdx.x]; - float* sh_c_ptr = reinterpret_cast(&sh[threadIdx.x]); + float* sh_c_ptr = reinterpret_cast(&sh_red[threadIdx.x]); #pragma unroll for (int f = 0; f < 4; f++) { frag_c_ptr[k * 4 + f] += sh_c_ptr[f]; @@ -1515,7 +1514,7 @@ __global__ void Marlin( res = __hmul2(res, s[0]); } - ((scalar_t2*)sh)[idx] = res; + ((scalar_t2*)sh_red)[idx] = res; }; if (threadIdx.x / 32 < thread_n_blocks / 4) { @@ -1543,7 +1542,7 @@ __global__ void Marlin( i < div_ceil(16 * thread_m_blocks, threads / (2 * thread_n_blocks)); i++) { if (c_gl_wr < c_gl_wr_end) { - C[c_gl_wr] = sh[c_sh_rd]; + C[c_gl_wr] = sh_red[c_sh_rd]; c_gl_wr += c_gl_wr_delta; c_sh_rd += c_sh_rd_delta; } @@ -1865,9 +1864,12 @@ bool is_valid_cache_size(thread_config_t const& th_config, int max_m_blocks, float pipe_size = (a_size + b_size) * pipe_stages; + float reduce_size = max(th_config.num_threads * 32 * 4, + (tb_n / 64) * 32 * (tb_max_m / 16) * 4 * 2 * 4 * 2); + TORCH_CHECK(max_shared_mem / 2 > scales_cache_size); // Sanity - return pipe_size < 0.95f * (max_shared_mem - scales_cache_size); + return pipe_size + reduce_size < 0.95f * (max_shared_mem - scales_cache_size); } bool is_valid_config(thread_config_t const& th_config, int max_m_blocks, From 68d37809b9b52f4d012fa0dfbb187f0fe978bdbc Mon Sep 17 00:00:00 2001 From: Nathan Azrak <42650258+nathan-az@users.noreply.github.com> Date: Fri, 3 Jan 2025 10:59:25 +1100 Subject: [PATCH 056/462] [Misc] Minimum requirements for SageMaker compatibility (#11576) --- Dockerfile | 13 +++++- examples/sagemaker-entrypoint.sh | 24 +++++++++++ vllm/entrypoints/openai/api_server.py | 61 ++++++++++++++++++++++++++- 3 files changed, 95 insertions(+), 3 deletions(-) create mode 100644 examples/sagemaker-entrypoint.sh diff --git a/Dockerfile b/Dockerfile index 153bff9cf565f..088314eb38dbe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -234,8 +234,8 @@ RUN mv vllm test_docs/ #################### TEST IMAGE #################### #################### OPENAI API SERVER #################### -# openai api server alternative -FROM vllm-base AS vllm-openai +# base openai image with additional requirements, for any subsequent openai-style images +FROM vllm-base AS vllm-openai-base # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ @@ -247,5 +247,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ENV VLLM_USAGE_SOURCE production-docker-image +# define sagemaker first, so it is not default from `docker build` +FROM vllm-openai-base AS vllm-sagemaker + +COPY examples/sagemaker-entrypoint.sh . +RUN chmod +x sagemaker-entrypoint.sh +ENTRYPOINT ["./sagemaker-entrypoint.sh"] + +FROM vllm-openai-base AS vllm-openai + ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] #################### OPENAI API SERVER #################### diff --git a/examples/sagemaker-entrypoint.sh b/examples/sagemaker-entrypoint.sh new file mode 100644 index 0000000000000..75a99ffc1f155 --- /dev/null +++ b/examples/sagemaker-entrypoint.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Define the prefix for environment variables to look for +PREFIX="SM_VLLM_" +ARG_PREFIX="--" + +# Initialize an array for storing the arguments +# port 8080 required by sagemaker, https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-code-container-response +ARGS=(--port 8080) + +# Loop through all environment variables +while IFS='=' read -r key value; do + # Remove the prefix from the key, convert to lowercase, and replace underscores with dashes + arg_name=$(echo "${key#"${PREFIX}"}" | tr '[:upper:]' '[:lower:]' | tr '_' '-') + + # Add the argument name and value to the ARGS array + ARGS+=("${ARG_PREFIX}${arg_name}") + if [ -n "$value" ]; then + ARGS+=("$value") + fi +done < <(env | grep "^${PREFIX}") + +# Pass the collected arguments to the main entrypoint +exec python3 -m vllm.entrypoints.openai.api_server "${ARGS[@]}" \ No newline at end of file diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 74fe378fdae42..e942b475535ad 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -16,7 +16,7 @@ from typing import AsyncIterator, Optional, Set, Tuple import uvloop -from fastapi import APIRouter, FastAPI, Request +from fastapi import APIRouter, FastAPI, HTTPException, Request from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse, Response, StreamingResponse @@ -44,11 +44,15 @@ CompletionResponse, DetokenizeRequest, DetokenizeResponse, + EmbeddingChatRequest, + EmbeddingCompletionRequest, EmbeddingRequest, EmbeddingResponse, EmbeddingResponseData, ErrorResponse, LoadLoraAdapterRequest, + PoolingChatRequest, + PoolingCompletionRequest, PoolingRequest, PoolingResponse, ScoreRequest, ScoreResponse, TokenizeRequest, @@ -310,6 +314,12 @@ async def health(raw_request: Request) -> Response: return Response(status_code=200) +@router.api_route("/ping", methods=["GET", "POST"]) +async def ping(raw_request: Request) -> Response: + """Ping check. Endpoint required for SageMaker""" + return await health(raw_request) + + @router.post("/tokenize") @with_cancellation async def tokenize(request: TokenizeRequest, raw_request: Request): @@ -483,6 +493,54 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) +TASK_HANDLERS = { + "generate": { + "messages": (ChatCompletionRequest, create_chat_completion), + "default": (CompletionRequest, create_completion), + }, + "embed": { + "messages": (EmbeddingChatRequest, create_embedding), + "default": (EmbeddingCompletionRequest, create_embedding), + }, + "score": { + "default": (ScoreRequest, create_score), + }, + "reward": { + "messages": (PoolingChatRequest, create_pooling), + "default": (PoolingCompletionRequest, create_pooling), + }, + "classify": { + "messages": (PoolingChatRequest, create_pooling), + "default": (PoolingCompletionRequest, create_pooling), + }, +} + + +@router.post("/invocations") +async def invocations(raw_request: Request): + """ + For SageMaker, routes requests to other handlers based on model `task`. + """ + body = await raw_request.json() + task = raw_request.app.state.task + + if task not in TASK_HANDLERS: + raise HTTPException( + status_code=400, + detail=f"Unsupported task: '{task}' for '/invocations'. " + f"Expected one of {set(TASK_HANDLERS.keys())}") + + handler_config = TASK_HANDLERS[task] + if "messages" in body: + request_model, handler = handler_config["messages"] + else: + request_model, handler = handler_config["default"] + + # this is required since we lose the FastAPI automatic casting + request = request_model.model_validate(body) + return await handler(request, raw_request) + + if envs.VLLM_TORCH_PROFILER_DIR: logger.warning( "Torch Profiler is enabled in the API server. This should ONLY be " @@ -687,6 +745,7 @@ def init_app_state( chat_template=resolved_chat_template, chat_template_content_format=args.chat_template_content_format, ) + state.task = model_config.task def create_server_socket(addr: Tuple[str, int]) -> socket.socket: From 2f1e8e8f54032e38998e90427aedf649c0beee39 Mon Sep 17 00:00:00 2001 From: Sachin Varghese Date: Thu, 2 Jan 2025 19:25:53 -0500 Subject: [PATCH 057/462] Update default max_num_batch_tokens for chunked prefill (#11694) --- docs/source/usage/performance.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/source/usage/performance.md b/docs/source/usage/performance.md index f028e28627a9f..2cd3801bfc82d 100644 --- a/docs/source/usage/performance.md +++ b/docs/source/usage/performance.md @@ -32,8 +32,8 @@ You can enable the feature by specifying `--enable-chunked-prefill` in the comma ```python llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True) # Set max_num_batched_tokens to tune performance. -# NOTE: 512 is the default max_num_batched_tokens for chunked prefill. -# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512) +# NOTE: 2048 is the default max_num_batched_tokens for chunked prefill. +# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=2048) ``` By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. @@ -49,13 +49,12 @@ This policy has two benefits: - It improves ITL and generation decode because decode requests are prioritized. - It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch. -You can tune the performance by changing `max_num_batched_tokens`. -By default, it is set to 512, which has the best ITL on A100 in the initial benchmark (llama 70B and mixtral 8x22B). +You can tune the performance by changing `max_num_batched_tokens`. By default, it is set to 2048. Smaller `max_num_batched_tokens` achieves better ITL because there are fewer prefills interrupting decodes. Higher `max_num_batched_tokens` achieves better TTFT as you can put more prefill to the batch. - If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes). -- Note that the default value (512) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler. +- Note that the default value (2048) of `max_num_batched_tokens` is optimized for ITL, and it may have lower throughput than the default scheduler. We recommend you set `max_num_batched_tokens > 2048` for throughput. From 07064cb1d49d2b04ec58d8876bee2cd8281eedf5 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Thu, 2 Jan 2025 16:58:56 -0800 Subject: [PATCH 058/462] [Bugfix] Check chain_speculative_sampling before calling it (#11673) Signed-off-by: Lu Fang --- vllm/model_executor/layers/rejection_sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py index 165e8309fee64..f173cbde03f44 100644 --- a/vllm/model_executor/layers/rejection_sampler.py +++ b/vllm/model_executor/layers/rejection_sampler.py @@ -118,7 +118,7 @@ def forward( # If use Flashinfer chain_speculative_sampling kernel # for rejection sampling - if self.use_flashinfer: + if self.use_flashinfer and chain_speculative_sampling is not None: batch_size, k, _ = draft_probs.shape uniform_samples = self._create_uniform_samples( seeded_seqs, batch_size, k, draft_probs.device) From fd3a62a122fcbc9331d000b325e72687629ef1bd Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Fri, 3 Jan 2025 13:38:37 +0700 Subject: [PATCH 059/462] [perf-benchmark] Fix dependency for steps in benchmark pipeline (#11710) --- .buildkite/nightly-benchmarks/benchmark-pipeline.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml index 868b8e95db01d..679abf1814aa5 100644 --- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -1,5 +1,6 @@ steps: - label: "Wait for container to be ready" + key: wait-for-container-image agents: queue: A100 plugins: @@ -10,12 +11,11 @@ steps: command: - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh - - wait - - label: "A100" # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: A100 + depends_on: wait-for-container-image plugins: - kubernetes: podSpec: @@ -49,6 +49,7 @@ steps: # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: H200 + depends_on: wait-for-container-image plugins: - docker#v5.12.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT @@ -73,7 +74,7 @@ steps: # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing" agents: queue: H100 - depends_on: ~ + depends_on: wait-for-container-image plugins: - docker#v5.12.0: image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT From e1a5c2f0a123835558b1b1c9895181161527c55e Mon Sep 17 00:00:00 2001 From: Aurick Qiao Date: Fri, 3 Jan 2025 03:39:19 -0500 Subject: [PATCH 060/462] [Model] Whisper model implementation (#11280) Co-authored-by: Aurick Qiao --- .buildkite/test-pipeline.yaml | 2 + examples/offline_inference_whisper.py | 59 ++ .../audio_language/__init__.py | 0 .../audio_language/test_whisper.py | 136 ++++ tests/models/registry.py | 1 + vllm/config.py | 2 + vllm/inputs/preprocess.py | 36 +- vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/whisper.py | 737 ++++++++++++++++++ vllm/multimodal/processing.py | 28 +- vllm/sequence.py | 18 +- vllm/transformers_utils/tokenizer.py | 19 + .../tokenizer_group/base_tokenizer_group.py | 6 +- .../tokenizer_group/ray_tokenizer_group.py | 28 +- .../tokenizer_group/tokenizer_group.py | 16 +- vllm/worker/enc_dec_model_runner.py | 11 +- 16 files changed, 1045 insertions(+), 55 deletions(-) create mode 100644 examples/offline_inference_whisper.py create mode 100644 tests/models/encoder_decoder/audio_language/__init__.py create mode 100644 tests/models/encoder_decoder/audio_language/test_whisper.py create mode 100644 vllm/model_executor/models/whisper.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c6f8316412e2f..529daf54faecf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -363,12 +363,14 @@ steps: - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language - tests/models/embedding/vision_language + - tests/models/encoder_decoder/audio_language - tests/models/encoder_decoder/vision_language commands: - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' - pytest -v -s models/embedding/vision_language -m core_model + - pytest -v -s models/encoder_decoder/audio_language -m core_model - pytest -v -s models/encoder_decoder/language -m core_model - pytest -v -s models/encoder_decoder/vision_language -m core_model diff --git a/examples/offline_inference_whisper.py b/examples/offline_inference_whisper.py new file mode 100644 index 0000000000000..087ad4376fb2e --- /dev/null +++ b/examples/offline_inference_whisper.py @@ -0,0 +1,59 @@ +import time + +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset + +# Create a Whisper encoder/decoder model instance +llm = LLM( + model="openai/whisper-large-v3", + max_model_len=448, + max_num_seqs=400, + limit_mm_per_prompt={"audio": 1}, + kv_cache_dtype="fp8", +) + +prompts = [ + { + "prompt": "<|startoftranscript|>", + "multi_modal_data": { + "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, + }, + }, + { # Test explicit encoder/decoder prompt + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": AudioAsset("winning_call").audio_and_sample_rate, + }, + }, + "decoder_prompt": "<|startoftranscript|>", + } +] * 1024 + +# Create a sampling params object. +sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + max_tokens=200, +) + +start = time.time() + +# Generate output tokens from the prompts. The output is a list of +# RequestOutput objects that contain the prompt, generated +# text, and other information. +outputs = llm.generate(prompts, sampling_params) + +# Print the outputs. +for output in outputs: + prompt = output.prompt + encoder_prompt = output.encoder_prompt + generated_text = output.outputs[0].text + print(f"Encoder prompt: {encoder_prompt!r}, " + f"Decoder prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + +duration = time.time() - start + +print("Duration:", duration) +print("RPS:", len(prompts) / duration) diff --git a/tests/models/encoder_decoder/audio_language/__init__.py b/tests/models/encoder_decoder/audio_language/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/models/encoder_decoder/audio_language/test_whisper.py b/tests/models/encoder_decoder/audio_language/test_whisper.py new file mode 100644 index 0000000000000..eb238c5332139 --- /dev/null +++ b/tests/models/encoder_decoder/audio_language/test_whisper.py @@ -0,0 +1,136 @@ +"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling. + +Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`. +""" +from typing import Optional + +import pytest + +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset + +from ....utils import fork_new_process_for_each_test, multi_gpu_test + +PROMPTS = [ + { + "prompt": + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + "multi_modal_data": { + "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, + }, + }, + { # Test explicit encoder/decoder prompt + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": AudioAsset("winning_call").audio_and_sample_rate, + }, + }, + "decoder_prompt": + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + } +] + +EXPECTED = { + "openai/whisper-tiny": [ + " He has birth words I spoke in the original corner of that. And a" + " little piece of black coat poetry. Mary had a little sandwich," + " sweet, with white and snow. And everyone had it very went the last" + " would sure to go.", + " >> And the old one, fit John the way to Edgar Martinez. >> One more" + " to line down the field line for our base camp. Here comes joy. Here" + " is June and the third base. They're going to wave him in. The throw" + " to the plate will be late. The Mariners are going to play for the" + " American League Championship. I don't believe it. It just continues" + " by all five." + ], + "openai/whisper-small": [ + " The first words I spoke in the original pornograph. A little piece" + " of practical poetry. Mary had a little lamb, its fleece was quite a" + " slow, and everywhere that Mary went the lamb was sure to go.", + " And the old one pitch on the way to Edgar Martinez one month. Here" + " comes joy. Here is Junior to third base. They're gonna wave him" + " in. The throw to the plate will be late. The Mariners are going to" + " play for the American League Championship. I don't believe it. It" + " just continues. My, oh my." + ], + "openai/whisper-medium": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its fleece was quite as" + " slow, and everywhere that Mary went the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez swung on the line" + " down the left field line for Obeyshev. Here comes Joy. Here is" + " Jorgen at third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh" + " my." + ], + "openai/whisper-large-v3": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its feet were quite as" + " slow, and everywhere that Mary went, the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line." + " Now the left field line for a base hit. Here comes Joy. Here is" + " Junior to third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh," + " my." + ], + "openai/whisper-large-v3-turbo": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its streets were quite" + " as slow, and everywhere that Mary went the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line" + " down the left field line for a base hit. Here comes Joy. Here is" + " Junior to third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh," + " my." + ] +} + + +def run_test( + model: str, + *, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +) -> None: + prompt_list = PROMPTS * 10 + expected_list = EXPECTED[model] * 10 + + llm = LLM( + model=model, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) + + sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + max_tokens=200, + ) + + outputs = llm.generate(prompt_list, sampling_params) + + for output, expected in zip(outputs, expected_list): + print(output.outputs[0].text) + assert output.outputs[0].text == expected + + +@fork_new_process_for_each_test +@pytest.mark.core_model +@pytest.mark.parametrize( + "model", ["openai/whisper-small", "openai/whisper-large-v3-turbo"]) +def test_models(model) -> None: + run_test(model, tensor_parallel_size=1) + + +@multi_gpu_test(num_gpus=2) +@pytest.mark.core_model +@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) +@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) +def test_models_distributed(model, distributed_executor_backend) -> None: + run_test(model, + tensor_parallel_size=2, + distributed_executor_backend=distributed_executor_backend) diff --git a/tests/models/registry.py b/tests/models/registry.py index e5dfb2822745d..dcb8bfa0f9510 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -204,6 +204,7 @@ class _HfExamplesInfo: "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), # [Encoder-decoder] "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 + "WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501 } _SPECULATIVE_DECODING_EXAMPLE_MODELS = { diff --git a/vllm/config.py b/vllm/config.py index e72c53b6130d0..b51f9783008b2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2312,6 +2312,8 @@ def _get_and_verify_max_len( "seq_length", # Command-R "model_max_length", + # Whisper + "max_target_positions", # Others "max_sequence_length", "max_seq_length", diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index aaa10d278ddb0..b362ee0cac328 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -184,10 +184,16 @@ def _tokenize_prompt( corresponding token IDs. """ tokenizer = self.get_tokenizer_group() - + add_special_tokens = None + if self.model_config.hf_config.model_type == "whisper": + # For Whisper, special tokens should be provided by the user based + # on the task and language of their request. Also needed to avoid + # appending an EOS token to the prompt which disrupts generation. + add_special_tokens = False return tokenizer.encode(request_id=request_id, prompt=prompt, - lora_request=lora_request) + lora_request=lora_request, + add_special_tokens=add_special_tokens) async def _tokenize_prompt_async( self, @@ -197,10 +203,17 @@ async def _tokenize_prompt_async( ) -> List[int]: """Async version of :meth:`_tokenize_prompt`.""" tokenizer = self.get_tokenizer_group() - - return await tokenizer.encode_async(request_id=request_id, - prompt=prompt, - lora_request=lora_request) + add_special_tokens = None + if self.model_config.hf_config.model_type == "whisper": + # For Whisper, special tokens should be provided by the user based + # on the task and language of their request. Also needed to avoid + # appending an EOS token to the prompt which disrupts generation. + add_special_tokens = False + return await tokenizer.encode_async( + request_id=request_id, + prompt=prompt, + lora_request=lora_request, + add_special_tokens=add_special_tokens) def _can_process_multimodal(self) -> bool: model_config = self.model_config @@ -439,8 +452,15 @@ def _build_enc_dec_llm_inputs( assert_never(encoder_inputs) # type: ignore[arg-type] if decoder_inputs is None: - dec_token_ids = self._prepare_decoder_input_ids_for_generation( - None) + if self.model_config.hf_config.model_type == "whisper": + # For Whisper models, the text prompt should go to the decoder. + # If no explicit encoder/decoder inputs, then copy the prompt + # from the encoder to the decoder. The encoder tokens are later + # overridden by the audio features. + dec_token_ids = encoder_inputs["prompt_token_ids"].copy() + else: + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + None) decoder_inputs = token_inputs(dec_token_ids) elif (decoder_inputs["type"] == "token" or decoder_inputs["type"] == "multimodal"): diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 07f4b5a3b3bc8..62840b8c1bcda 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -170,6 +170,7 @@ "UltravoxModel": ("ultravox", "UltravoxModel"), # [Encoder-decoder] "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 + "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 } _SPECULATIVE_DECODING_MODELS = { diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py new file mode 100644 index 0000000000000..cb54b4c3ba663 --- /dev/null +++ b/vllm/model_executor/models/whisper.py @@ -0,0 +1,737 @@ +import math +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) + +import numpy as np +import torch +from torch import nn +from transformers.models.whisper.modeling_whisper import sinusoids + +from vllm.attention import Attention, AttentionMetadata, AttentionType +from vllm.config import CacheConfig, VllmConfig +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.inputs import INPUT_REGISTRY, DummyData, InputContext +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.audio import resample_audio +from vllm.sequence import SequenceData +from vllm.transformers_utils.processor import cached_get_processor + +from .interfaces import SupportsMultiModal +from .utils import AutoWeightsLoader, WeightsMapper, make_layers + +logger = init_logger(__name__) + + +class WhisperAudioInputs(TypedDict): + input_features: NestedTensors + """Shape: `(batch_size, 128, M)`""" + + +class WhisperPositionalEmbedding(nn.Embedding): + + def __init__(self, + num_positions: int, + embedding_dim: int, + padding_idx: Optional[int] = None): + super().__init__(num_positions, embedding_dim) + + def forward(self, position_ids): + return self.weight[position_ids] + + +class WhisperAttention(nn.Module): + + def __init__( + self, + embed_dim: int, + num_heads: int, + bias: bool = True, + attn_type: AttentionType = AttentionType.DECODER, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.embed_dim = embed_dim + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = num_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + if self.total_num_heads >= tp_size: + # Number of heads is greater than TP size, so we partition + # the KV heads across multiple tensor parallel GPUs. + assert self.total_num_heads % tp_size == 0 + else: + # Number of heads is less than TP size, so we replicate + # the KV heads across multiple tensor parallel GPUs. + assert tp_size % self.total_num_heads == 0 + self.num_kv_heads = max(1, self.total_num_heads // tp_size) + self.head_dim = self.embed_dim // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.attn_type = attn_type + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: " + f"{self.embed_dim} and `num_heads`: {num_heads}).") + self.scaling = self.head_dim**-0.5 + + self._init_qkv(embed_dim, bias, quant_config, prefix=prefix) + self.out_proj = RowParallelLinear( + input_size=embed_dim, + output_size=embed_dim, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + ) + + def _init_qkv( + self, + embed_dim: int, + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + self.qkv_proj = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.head_dim, + total_num_heads=self.total_num_heads, + total_num_kv_heads=self.total_num_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ): + qkv, _ = self.qkv_proj(hidden_states) + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + + attn_output = self.attn(q, + k, + v, + kv_cache, + attn_metadata, + attn_type=self.attn_type) + + output, _ = self.out_proj(attn_output) + + return output + + +class WhisperCrossAttention(WhisperAttention): + + def __init__( + self, + embed_dim: int, + num_heads: int, + bias: bool = True, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__( + embed_dim=embed_dim, + num_heads=num_heads, + bias=bias, + cache_config=cache_config, + quant_config=quant_config, + prefix=prefix, + ) + + def _init_qkv( + self, + embed_dim: int, + bias: bool = True, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + self.q_proj = ColumnParallelLinear( + input_size=embed_dim, + output_size=embed_dim, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.q_proj", + ) + self.kv_proj = QKVParallelLinear( + hidden_size=embed_dim, + head_size=self.head_dim, + total_num_heads=0, + total_num_kv_heads=self.total_num_heads, + bias=bias, + quant_config=quant_config, + prefix=f"{prefix}.kv_proj", + ) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor], + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ): + q, _ = self.q_proj(hidden_states) + + # Encoder hidden states are only computed once during prefill phase. + # Afterwards, the keys and values should be available in the kv-cache. + if encoder_hidden_states is not None: + kv, _ = self.kv_proj(encoder_hidden_states) + k, v = kv.split([self.kv_size, self.kv_size], dim=-1) + else: + k = v = None + + attn_output = self.attn(q, + k, + v, + kv_cache, + attn_metadata, + attn_type=AttentionType.ENCODER_DECODER) + + output, _ = self.out_proj(attn_output) + + return output + + +class WhisperMLP(nn.Module): + + def __init__( + self, + embed_dim: int, + ffn_dim: int, + act_fn: str, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + self.activation_fn = get_act_fn(act_fn) + self.fc1 = ColumnParallelLinear( + input_size=embed_dim, + output_size=ffn_dim, + quant_config=quant_config, + prefix=f"{prefix}.fc1", + ) + self.fc2 = RowParallelLinear( + input_size=ffn_dim, + output_size=embed_dim, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + ) + + def forward(self, hidden_states: torch.Tensor): + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + return hidden_states + + +class WhisperEncoderLayer(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.embed_dim = config.d_model + self.self_attn = WhisperAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + attn_type=AttentionType.ENCODER, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.mlp = WhisperMLP( + embed_dim=config.d_model, + ffn_dim=config.encoder_ffn_dim, + act_fn=config.activation_function, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ): + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + hidden_states = residual + hidden_states + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + if hidden_states.isinf().any() or hidden_states.isnan().any(): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, + min=-clamp_value, + max=clamp_value) + + return hidden_states + + +class WhisperDecoderLayer(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.self_attn = WhisperAttention( + embed_dim=config.d_model, + num_heads=config.decoder_attention_heads, + attn_type=AttentionType.DECODER, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + self.self_attn_layer_norm = nn.LayerNorm(config.d_model) + self.encoder_attn = WhisperCrossAttention( + embed_dim=config.d_model, + num_heads=config.decoder_attention_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.encoder_attn", + ) + self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model) + self.mlp = WhisperMLP( + embed_dim=config.d_model, + ffn_dim=config.decoder_ffn_dim, + act_fn=config.activation_function, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + self.final_layer_norm = nn.LayerNorm(config.d_model) + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor], + kv_cache: torch.Tensor, + attn_metadata: AttentionMetadata, + ): + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + hidden_states = self.encoder_attn( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + kv_cache=kv_cache, + attn_metadata=attn_metadata, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + return hidden_states + + +class WhisperEncoder(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + embed_dim = config.d_model + self.num_mel_bins = config.num_mel_bins + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_source_positions + self.embed_scale = (math.sqrt(embed_dim) + if config.scale_embedding else 1.0) + + self.conv1 = nn.Conv1d(self.num_mel_bins, + embed_dim, + kernel_size=3, + padding=1) + self.conv2 = nn.Conv1d(embed_dim, + embed_dim, + kernel_size=3, + stride=2, + padding=1) + self.embed_positions = nn.Embedding(self.max_source_positions, + embed_dim) + self.start_layer, self.end_layer, self.layers = make_layers( + config.encoder_layers, + lambda prefix: WhisperEncoderLayer(vllm_config=vllm_config, + prefix=f"{prefix}.layers"), + prefix=f"{prefix}.layers", + ) + self.layer_norm = nn.LayerNorm(config.d_model) + + with torch.no_grad(): + self.embed_positions.weight.copy_( + sinusoids(*self.embed_positions.weight.shape)) + + def forward( + self, + input_features: Union[torch.Tensor, List[torch.Tensor]], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ): + hidden_states = [] + for features in input_features: + embeds = nn.functional.gelu(self.conv1(features)) + embeds = nn.functional.gelu(self.conv2(embeds)) + embeds = embeds.permute(1, 0) + embeds = embeds + self.embed_positions.weight[:embeds.size(0), :] + hidden_states.append(embeds) + hidden_states = torch.cat(hidden_states) + + for idx, encoder_layer in enumerate(self.layers): + hidden_states = encoder_layer( + hidden_states, + kv_cache=kv_caches[idx], + attn_metadata=attn_metadata, + ) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + +class WhisperDecoder(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_target_positions + self.max_source_positions = config.max_source_positions + self.embed_scale = (math.sqrt(config.d_model) + if config.scale_embedding else 1.0) + + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, + self.padding_idx) + self.embed_positions = WhisperPositionalEmbedding( + self.max_target_positions, config.d_model) + self.start_layer, self.end_layer, self.layers = make_layers( + config.decoder_layers, + lambda prefix: WhisperDecoderLayer(vllm_config=vllm_config, + prefix=f"{prefix}.layers"), + prefix=f"{prefix}.layers", + ) + self.layer_norm = nn.LayerNorm(config.d_model) + + def forward( + self, + input_ids, + positions: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ): + inputs_embeds = self.get_input_embeddings(input_ids) + positions = self.embed_positions(positions) + hidden_states = inputs_embeds + positions + + for idx, decoder_layer in enumerate(self.layers): + hidden_states = decoder_layer( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + kv_cache=kv_caches[idx], + attn_metadata=attn_metadata, + ) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + ) -> torch.Tensor: + return self.embed_tokens(input_ids) + + +class WhisperModel(nn.Module): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.encoder = WhisperEncoder(vllm_config=vllm_config, + prefix=f"{prefix}.encoder") + self.decoder = WhisperDecoder(vllm_config=vllm_config, + prefix=f"{prefix}.decoder") + + def forward( + self, + input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> torch.Tensor: + encoder_outputs = self.get_encoder_outputs( + input_features, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + decoder_outputs = self.decoder( + input_ids=input_ids, + positions=positions, + encoder_hidden_states=encoder_outputs, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + return decoder_outputs + + def get_encoder_outputs( + self, + input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + ) -> Optional[torch.Tensor]: + if input_features is None: + return None + return self.encoder( + input_features, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), + (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), + (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), + (".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"), + (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +def get_max_whisper_audio_tokens(ctx: InputContext) -> int: + return ctx.model_config.hf_config.max_source_positions + + +def dummy_encoder_data_for_whisper(ctx: InputContext, seq_len: int, + mm_counts: Mapping[str, int]): + assert mm_counts["audio"] == 1 + num_tokens = get_max_whisper_audio_tokens(ctx) + processor = cached_get_processor(ctx.model_config.model) + chunk_length = processor.feature_extractor.chunk_length + sampling_rate = processor.feature_extractor.sampling_rate + num_samples = chunk_length * sampling_rate + return DummyData( + SequenceData.from_prompt_token_counts((0, num_tokens)), + {"audio": [(np.zeros(num_samples), sampling_rate)]}, + ) + + +def input_processor_for_whisper(ctx: InputContext, inputs): + multi_modal_data = inputs["encoder"]["multi_modal_data"] + if isinstance(multi_modal_data["audio"], list): + assert len(multi_modal_data["audio"]) == 1 + multi_modal_data["audio"] = multi_modal_data["audio"][0] + # Resample and process audio + audio, orig_sr = multi_modal_data["audio"] + processor = cached_get_processor(ctx.model_config.model) + target_sr = processor.feature_extractor.sampling_rate + audio = resample_audio(audio, orig_sr=orig_sr, target_sr=target_sr) + multi_modal_data["audio"] = (audio, target_sr) + # Pre-allocate placeholder tokens in encoder sequence + num_tokens = get_max_whisper_audio_tokens(ctx) + inputs["encoder"]["prompt_token_ids"] = [0] * num_tokens + return inputs + + +def input_mapper_for_whisper( + ctx: InputContext, + multi_modal_data: Union[np.ndarray, List[np.ndarray]], +) -> MultiModalKwargs: + if not isinstance(multi_modal_data, list): + multi_modal_data = [multi_modal_data] + + assert len(multi_modal_data) == 1 + + if len(multi_modal_data) == 0: + return MultiModalKwargs() + + processor = cached_get_processor(ctx.model_config.model) + sampling_rate = processor.feature_extractor.sampling_rate + + audios = [audio for audio, _ in multi_modal_data] + + kwargs = processor(audios, + sampling_rate=sampling_rate, + return_tensors="pt") + kwargs["input_features"] = kwargs["input_features"].squeeze(0).to( + ctx.model_config.dtype) + + return MultiModalKwargs(kwargs) + + +@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper) +@INPUT_REGISTRY.register_input_processor(input_processor_for_whisper) +@MULTIMODAL_REGISTRY.register_input_mapper("audio", input_mapper_for_whisper) +@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( + "audio", get_max_whisper_audio_tokens) +class WhisperForConditionalGeneration(nn.Module, SupportsMultiModal): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + self.config = config + self.dtype = vllm_config.model_config.dtype + + self.model = WhisperModel(vllm_config=vllm_config, prefix=prefix) + self.unpadded_vocab_size = config.vocab_size + self.proj_out = ParallelLMHead(config.vocab_size, + config.d_model, + quant_config=quant_config) + self.proj_out = self.proj_out.tie_weights( + self.model.decoder.embed_tokens) + logit_scale = getattr(config, "logit_scale", 1.0) + self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, + config.vocab_size, logit_scale) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs, + ) -> torch.Tensor: + audio_input = self._parse_and_validate_audio_input(**kwargs) + decoder_outputs = self.model( + input_features=audio_input["input_features"], + input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + return decoder_outputs + + def get_multimodal_embeddings( + self, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs, + ) -> Optional[NestedTensors]: + # TODO: This method does not obey the interface for SupportsMultiModal. + # Refactor this once encoder/decoder support is implemented in V1. + audio_input = self._parse_and_validate_audio_input(**kwargs) + return self.model.get_encoder_outputs( + audio_input["input_features"], + kv_caches=kv_caches, + attn_metadata=attn_metadata, + ) + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + attn_metadata: Optional[AttentionMetadata] = None, + ) -> torch.Tensor: + # TODO: This method just returns the decoder sequence embeddings since + # Whisper does not have encoder text tokens. Refactor this once + # encoder/decoder support is implemented in V1. + return self.model.decoder.get_input_embeddings(input_ids) + + def _parse_and_validate_audio_input( + self, **kwargs: object) -> WhisperAudioInputs: + input_features = kwargs.pop("input_features", None) + + if input_features is not None: + if not isinstance(input_features, (torch.Tensor, list)): + raise ValueError("Incorrect type of audio features. " + f"Got type: {type(input_features)}") + input_features = [feat.to(self.dtype) for feat in input_features] + + return WhisperAudioInputs(input_features=input_features) + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.proj_out, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: + loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) + loaded_weights = [(name, loaded_weight) + for name, loaded_weight in weights] + mapper = WeightsMapper({".fc1.": ".mlp.fc1.", ".fc2.": ".mlp.fc2."}) + return loader.load_weights(loaded_weights, mapper=mapper) \ No newline at end of file diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 64cdacfb4c574..eb7552176e974 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -16,7 +16,7 @@ from vllm.inputs import DummyData, InputProcessingContext from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.transformers_utils.tokenizer import AnyTokenizer, encode_tokens from vllm.utils import LRUCache, flatten_2d_lists, full_groupby from .inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -57,24 +57,6 @@ def bind(self, tokenizer: AnyTokenizer) -> "_BoundPromptReplacement": ) -def _encode( - tokenizer: AnyTokenizer, - text: str, - *, - add_special_tokens: bool = False, -) -> list[int]: - """ - Backend-agnostic equivalent of HF's - :code:`tokenizer.encode(text, add_special_tokens=...)`. - """ - if isinstance(tokenizer, MistralTokenizer): - return tokenizer.tokenizer.encode(text, - bos=add_special_tokens, - eos=add_special_tokens) - - return tokenizer.encode(text, add_special_tokens=add_special_tokens) - - @lru_cache(maxsize=2048) def _cached_encode( tokenizer: AnyTokenizer, @@ -82,7 +64,9 @@ def _cached_encode( *, add_special_tokens: bool = False, ) -> list[int]: - return _encode(tokenizer, text, add_special_tokens=add_special_tokens) + return encode_tokens(tokenizer, + text, + add_special_tokens=add_special_tokens) def _decode( @@ -983,7 +967,9 @@ def _apply_prompt_replacements( mm_item_counts, ) - token_ids = _encode(tokenizer, text) + token_ids = encode_tokens(tokenizer, + text, + add_special_tokens=False) matched_repls = [match.prompt_repl for match in text_matches] placeholders = self._find_placeholders(matched_repls, token_ids, diff --git a/vllm/sequence.py b/vllm/sequence.py index 034f89c0ddbe9..0157abbd2eed5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -710,15 +710,27 @@ def token_type_ids(self) -> Optional[List[int]]: @property def multi_modal_data(self) -> MultiModalDataDict: - return self.first_seq.multi_modal_data + if self.first_seq.multi_modal_data: + return self.first_seq.multi_modal_data + elif self.encoder_seq is not None: + return self.encoder_seq.multi_modal_data + return {} @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - return self.first_seq.multi_modal_placeholders + if self.first_seq.multi_modal_data: + return self.first_seq.multi_modal_placeholders + elif self.encoder_seq is not None: + return self.encoder_seq.multi_modal_placeholders + return {} @property def mm_processor_kwargs(self) -> Dict[str, Any]: - return self.first_seq.mm_processor_kwargs + if self.first_seq.multi_modal_data: + return self.first_seq.mm_processor_kwargs + elif self.encoder_seq is not None: + return self.encoder_seq.mm_processor_kwargs + return {} @property def lora_int_id(self) -> int: diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index e6701f4c4b835..42b2f095bc543 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -21,6 +21,25 @@ MistralTokenizer] +def encode_tokens( + tokenizer: AnyTokenizer, + text: str, + *, + add_special_tokens: Optional[bool] = None, +) -> list[int]: + """ + Backend-agnostic equivalent of HF's + :code:`tokenizer.encode(text, add_special_tokens=...)`. + """ + if isinstance(tokenizer, MistralTokenizer): + return tokenizer.tokenizer.encode(text, + bos=add_special_tokens, + eos=add_special_tokens) + elif add_special_tokens is not None: + return tokenizer.encode(text, add_special_tokens=add_special_tokens) + return tokenizer.encode(text) + + def get_cached_tokenizer(tokenizer: AnyTokenizer) -> AnyTokenizer: """Get tokenizer with cached properties. diff --git a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py index 8f78ef65bbf1a..e6cc7cd4e2e3a 100644 --- a/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py @@ -32,7 +32,8 @@ def get_max_input_len( def encode(self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group.""" pass @@ -41,7 +42,8 @@ async def encode_async( self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group.""" pass diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py index 9a999a0d6067d..3f7627e11ae5e 100644 --- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py @@ -112,7 +112,8 @@ def _finalize_encode(self, actor: ray.ObjectRef, def encode(self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group. We pick an idle actor and use it to encode the prompt. @@ -132,7 +133,8 @@ def encode(self, ret = ray.get( actor.encode.remote(request_id=request_id, prompt=prompt, - lora_request=lora_request)) + lora_request=lora_request, + add_special_tokens=add_special_tokens)) except ActorDiedError as e: # If the actor is dead, we first try to reinitialize it. logger.warning("%s died with ActorDiedError, reinitializing.", @@ -143,7 +145,8 @@ def encode(self, ret = ray.get( actor.encode.remote(request_id=request_id, prompt=prompt, - lora_request=lora_request)) + lora_request=lora_request, + add_special_tokens=add_special_tokens)) except ActorDiedError as e: logger.error( "%s died for second time in a row, marking " @@ -160,7 +163,8 @@ async def encode_async( self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: """Encode a prompt using the tokenizer group. We pick an idle actor and use it to encode the prompt. @@ -177,9 +181,11 @@ async def encode_async( actor_is_alive = True original_actor = actor try: - ret = await actor.encode.remote(request_id=request_id, - prompt=prompt, - lora_request=lora_request) + ret = await actor.encode.remote( + request_id=request_id, + prompt=prompt, + lora_request=lora_request, + add_special_tokens=add_special_tokens) except ActorDiedError as e: # If the actor is dead, we first try to reinitialize it. logger.warning("%s died with ActorDiedError, reinitializing.", @@ -187,9 +193,11 @@ async def encode_async( exc_info=e) actor = self._init_actor() try: - ret = await actor.encode.remote(request_id=request_id, - prompt=prompt, - lora_request=lora_request) + ret = await actor.encode.remote( + request_id=request_id, + prompt=prompt, + lora_request=lora_request, + add_special_tokens=add_special_tokens) except ActorDiedError as e: logger.error( "%s died for second time in a row, marking " diff --git a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py index 95a8f7098bbac..6dc2f90561873 100644 --- a/vllm/transformers_utils/tokenizer_group/tokenizer_group.py +++ b/vllm/transformers_utils/tokenizer_group/tokenizer_group.py @@ -2,7 +2,7 @@ from vllm.config import TokenizerPoolConfig from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizer import (AnyTokenizer, +from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens, get_lora_tokenizer, get_lora_tokenizer_async, get_tokenizer) @@ -55,9 +55,12 @@ def _raise_if_input_too_long(self, def encode(self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: tokenizer = self.get_lora_tokenizer(lora_request) - ret = tokenizer.encode(prompt) + ret = encode_tokens(tokenizer, + prompt, + add_special_tokens=add_special_tokens) self._raise_if_input_too_long(ret, lora_request) return ret @@ -65,9 +68,12 @@ async def encode_async( self, prompt: str, request_id: Optional[str] = None, - lora_request: Optional[LoRARequest] = None) -> List[int]: + lora_request: Optional[LoRARequest] = None, + add_special_tokens: Optional[bool] = None) -> List[int]: tokenizer = await self.get_lora_tokenizer_async(lora_request) - ret = tokenizer.encode(prompt) + ret = encode_tokens(tokenizer, + prompt, + add_special_tokens=add_special_tokens) self._raise_if_input_too_long(ret, lora_request) return ret diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index bff01320d7927..4d5d918087be8 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -287,12 +287,11 @@ def profile_run(self) -> None: seq_len, self.mm_registry, is_encoder_data=False) - encoder_dummy_data \ - = self.input_registry.dummy_data_for_profiling( - self.model_config, - seq_len, - self.mm_registry, - is_encoder_data=True) + encoder_dummy_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry, + is_encoder_data=True) # Having more tokens is over-conservative but otherwise fine assert len( From 80c751e7f68ade3d4c6391a0f3fce9ce970ddad0 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 3 Jan 2025 12:25:38 -0500 Subject: [PATCH 061/462] [V1] Simplify Shutdown (#11659) --- tests/v1/engine/test_engine_core_client.py | 6 --- vllm/entrypoints/llm.py | 5 --- vllm/v1/engine/async_llm.py | 3 -- vllm/v1/engine/core.py | 1 - vllm/v1/engine/core_client.py | 34 ++++++++-------- vllm/v1/engine/llm_engine.py | 7 ---- vllm/v1/utils.py | 46 +++++++++++----------- 7 files changed, 42 insertions(+), 60 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 729975e4ea8c4..20d4e6f63b339 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -142,9 +142,6 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): client.abort_requests([request.request_id]) - # Shutdown the client. - client.shutdown() - @pytest.mark.asyncio async def test_engine_core_client_asyncio(monkeypatch): @@ -200,6 +197,3 @@ async def test_engine_core_client_asyncio(monkeypatch): else: assert len(outputs[req_id]) == MAX_TOKENS, ( f"{len(outputs[req_id])=}, {MAX_TOKENS=}") - - # Shutdown the client. - client.shutdown() diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index fadf297e9f6aa..7c0de3b3e5481 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -232,11 +232,6 @@ def __init__( self.request_counter = Counter() - def __del__(self): - if hasattr(self, 'llm_engine') and self.llm_engine and hasattr( - self.llm_engine, "shutdown"): - self.llm_engine.shutdown() - @staticmethod def get_engine_class() -> Type[LLMEngine]: if envs.VLLM_USE_V1: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 3f097ca7f439c..ff7a0c28dd91a 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -103,9 +103,6 @@ def sigquit_handler(signum, frame): self.output_handler: Optional[asyncio.Task] = None - def __del__(self): - self.shutdown() - @classmethod def from_engine_args( cls, diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5840541d774ba..13a50a4f855e2 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -203,7 +203,6 @@ def signal_handler(signum, frame): finally: if engine_core is not None: engine_core.shutdown() - engine_core = None def run_busy_loop(self): """Core busy loop of the EngineCore.""" diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 3293205e110af..e009f3448bf69 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,4 +1,6 @@ -from typing import List, Optional, Type +import weakref +from abc import ABC, abstractmethod +from typing import List, Type import msgspec import zmq @@ -18,7 +20,7 @@ logger = init_logger(__name__) -class EngineCoreClient: +class EngineCoreClient(ABC): """ EngineCoreClient: subclasses handle different methods for pushing and pulling from the EngineCore for asyncio / multiprocessing. @@ -52,8 +54,9 @@ def make_client( return InprocClient(vllm_config, executor_class, log_stats) + @abstractmethod def shutdown(self): - pass + ... def get_output(self) -> List[EngineCoreOutput]: raise NotImplementedError @@ -107,9 +110,6 @@ def abort_requests(self, request_ids: List[str]) -> None: def shutdown(self): self.engine_core.shutdown() - def __del__(self): - self.shutdown() - def profile(self, is_start: bool = True) -> None: self.engine_core.profile(is_start) @@ -139,10 +139,14 @@ def __init__( self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) # ZMQ setup. - if asyncio_mode: - self.ctx = zmq.asyncio.Context() - else: - self.ctx = zmq.Context() # type: ignore[attr-defined] + self.ctx = ( + zmq.asyncio.Context() # type: ignore[attr-defined] + if asyncio_mode else zmq.Context()) # type: ignore[attr-defined] + + # Note(rob): shutdown function cannot be a bound method, + # else the gc cannot collect the object. + self._finalizer = weakref.finalize(self, lambda x: x.destroy(linger=0), + self.ctx) # Paths and sockets for IPC. output_path = get_open_zmq_ipc_path() @@ -153,7 +157,6 @@ def __init__( zmq.constants.PUSH) # Start EngineCore in background process. - self.proc_handle: Optional[BackgroundProcHandle] self.proc_handle = BackgroundProcHandle( input_path=input_path, output_path=output_path, @@ -166,12 +169,11 @@ def __init__( }) def shutdown(self): - # Shut down the zmq context. - self.ctx.destroy(linger=0) - - if hasattr(self, "proc_handle") and self.proc_handle: + """Clean up background resources.""" + if hasattr(self, "proc_handle"): self.proc_handle.shutdown() - self.proc_handle = None + + self._finalizer() class SyncMPClient(MPClient): diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index a19109559eabf..1f49de67d7493 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -205,10 +205,3 @@ def get_tokenizer_group( f"found type: {type(tokenizer_group)}") return tokenizer_group - - def __del__(self): - self.shutdown() - - def shutdown(self): - if engine_core := getattr(self, "engine_core", None): - engine_core.shutdown() diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index 19e0dd17237c9..b0a7affbebb7e 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -1,3 +1,4 @@ +import multiprocessing import os import weakref from collections.abc import Sequence @@ -91,8 +92,6 @@ def __init__( target_fn: Callable, process_kwargs: Dict[Any, Any], ): - self._finalizer = weakref.finalize(self, self.shutdown) - context = get_mp_context() reader, writer = context.Pipe(duplex=False) @@ -102,11 +101,11 @@ def __init__( process_kwargs["ready_pipe"] = writer process_kwargs["input_path"] = input_path process_kwargs["output_path"] = output_path - self.input_path = input_path - self.output_path = output_path - # Run Detokenizer busy loop in background process. + # Run busy loop in background process. self.proc = context.Process(target=target_fn, kwargs=process_kwargs) + self._finalizer = weakref.finalize(self, shutdown, self.proc, + input_path, output_path) self.proc.start() # Wait for startup. @@ -114,21 +113,24 @@ def __init__( raise RuntimeError(f"{process_name} initialization failed. " "See root cause above.") - def __del__(self): - self.shutdown() - def shutdown(self): - # Shutdown the process if needed. - if hasattr(self, "proc") and self.proc.is_alive(): - self.proc.terminate() - self.proc.join(5) - - if self.proc.is_alive(): - kill_process_tree(self.proc.pid) - - # Remove zmq ipc socket files - ipc_sockets = [self.output_path, self.input_path] - for ipc_socket in ipc_sockets: - socket_file = ipc_socket.replace("ipc://", "") - if os and os.path.exists(socket_file): - os.remove(socket_file) + self._finalizer() + + +# Note(rob): shutdown function cannot be a bound method, +# else the gc cannot collect the object. +def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str): + # Shutdown the process. + if proc.is_alive(): + proc.terminate() + proc.join(5) + + if proc.is_alive(): + kill_process_tree(proc.pid) + + # Remove zmq ipc socket files. + ipc_sockets = [output_path, input_path] + for ipc_socket in ipc_sockets: + socket_file = ipc_socket.replace("ipc://", "") + if os and os.path.exists(socket_file): + os.remove(socket_file) From 61fed92c7e646d6f2ec5d9de54568a860870e6a4 Mon Sep 17 00:00:00 2001 From: ZincCat <52513999+zinccat@users.noreply.github.com> Date: Fri, 3 Jan 2025 13:02:34 -0800 Subject: [PATCH 062/462] [Bugfix] Fix ColumnParallelLinearWithLoRA slice (#11708) Signed-off-by: ZincCat --- vllm/lora/layers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 85164c2165a3c..102e40d3f448d 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -479,7 +479,7 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: # ColumnParallelLinear. else: tensor_model_parallel_rank = get_tensor_model_parallel_rank() - shard_size = self.output_dim + shard_size = self.output_size start_idx = tensor_model_parallel_rank * shard_size end_idx = (tensor_model_parallel_rank + 1) * shard_size lora_b = lora_b[:, start_idx:end_idx] @@ -490,7 +490,7 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: if bias is None: return bias tensor_model_parallel_rank = get_tensor_model_parallel_rank() - shard_size = self.output_dim + shard_size = self.output_size start_idx = tensor_model_parallel_rank * shard_size end_idx = (tensor_model_parallel_rank + 1) * shard_size bias = bias[start_idx:end_idx] From 1543914c04697fb252e4468b7c9d14be512b050a Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 3 Jan 2025 16:29:11 -0500 Subject: [PATCH 063/462] [V1] Improve TP>1 Error Handling + Stack Trace (#11721) Co-authored-by: Tyler Michael Smith --- vllm/v1/engine/async_llm.py | 16 ---------------- vllm/v1/engine/core.py | 2 +- vllm/v1/engine/core_client.py | 19 ++++++++++++++++++- vllm/v1/executor/multiproc_executor.py | 24 +++++++++++++++++++++--- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index ff7a0c28dd91a..564d8a8343bef 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -1,6 +1,5 @@ import asyncio import os -import signal from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union from vllm.config import ModelConfig, VllmConfig @@ -42,21 +41,6 @@ def __init__( start_engine_loop: bool = True, ) -> None: - # The child processes will send SIGQUIT when unrecoverable - # errors happen. We kill the process tree here so that the - # stack trace is very evident. - # TODO: rather than killing the main process, we should - # figure out how to raise an AsyncEngineDeadError and - # handle at the API server level so we can return a better - # error code to the clients calling VLLM. - def sigquit_handler(signum, frame): - logger.fatal( - "AsyncLLM got SIGQUIT from worker processes, shutting " - "down. See stack trace above for root cause issue.") - kill_process_tree(os.getpid()) - - signal.signal(signal.SIGQUIT, sigquit_handler) - assert start_engine_loop self.log_requests = log_requests diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 13a50a4f855e2..975ce11fe8aff 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -198,7 +198,7 @@ def signal_handler(signum, frame): except Exception: traceback = get_exception_traceback() logger.error("EngineCore hit an exception: %s", traceback) - parent_process.send_signal(signal.SIGQUIT) + parent_process.send_signal(signal.SIGUSR1) finally: if engine_core is not None: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index e009f3448bf69..6a40c961fc1d7 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -1,3 +1,5 @@ +import os +import signal import weakref from abc import ABC, abstractmethod from typing import List, Type @@ -8,7 +10,8 @@ from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.utils import get_open_zmq_ipc_path, make_zmq_socket +from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree, + make_zmq_socket) from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreProfile, EngineCoreRequest, EngineCoreRequestType, EngineCoreRequestUnion) @@ -134,6 +137,20 @@ def __init__( executor_class: Type[Executor], log_stats: bool = False, ): + # The child processes will send SIGUSR1 when unrecoverable + # errors happen. We kill the process tree here so that the + # stack trace is very evident. + # TODO(rob): rather than killing the main process, we should + # figure out how to raise an AsyncEngineDeadError and + # handle at the API server level so we can return a better + # error code to the clients calling VLLM. + def sigusr1_handler(signum, frame): + logger.fatal("Got fatal signal from worker processes, shutting " + "down. See stack trace above for root cause issue.") + kill_process_tree(os.getpid()) + + signal.signal(signal.SIGUSR1, sigusr1_handler) + # Serialization setup. self.encoder = PickleEncoder() self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index ed64e7741390d..114deae980d01 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -9,6 +9,7 @@ from multiprocessing.process import BaseProcess from typing import Any, Dict, List, Optional, Tuple +import psutil import zmq from vllm.config import VllmConfig @@ -38,6 +39,19 @@ def __init__(self, vllm_config: VllmConfig) -> None: # and ensure workers will be terminated. self._finalizer = weakref.finalize(self, self.shutdown) + # The child processes will send SIGUSR1 when unrecoverable + # errors happen. + def sigusr1_handler(signum, frame): + logger.fatal( + "MulitprocExecutor got fatal signal from worker processes, " + "shutting down. See stack trace above for root cause issue.") + # Propagate error up to parent process. + parent_process = psutil.Process().parent() + parent_process.send_signal(signal.SIGUSR1) + self.shutdown() + + signal.signal(signal.SIGUSR1, sigusr1_handler) + self.vllm_config = vllm_config self.parallel_config = vllm_config.parallel_config @@ -335,8 +349,11 @@ def signal_handler(signum, frame): except SystemExit: logger.debug("Worker interrupted.") - except BaseException as e: - logger.exception(e) + except Exception: + # worker_busy_loop sends exceptions exceptons to Executor + # for shutdown, but if there is an error in startup or an + # error with IPC itself, we need to alert the parent. + psutil.Process().parent().send_signal(signal.SIGUSR1) raise finally: @@ -377,9 +394,10 @@ def worker_busy_loop(self): try: output = getattr(self.worker, method)(*args, **kwargs) - except BaseException as e: + except Exception as e: self.worker_response_mq.enqueue( (WorkerProc.ResponseStatus.FAILURE, e)) + logger.exception("WorkerProc hit an exception: %s", exc_info=e) continue self.worker_response_mq.enqueue( From a655eb30252fe266ce16fde2aa9f8f9554ccd46e Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sat, 4 Jan 2025 06:19:02 +0800 Subject: [PATCH 064/462] [Misc]Add BNB quantization for Qwen2VL (#11719) Signed-off-by: Jee Jee Li Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/models/qwen2_vl.py | 69 +++++++++++++++----------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 26b6d768ad4f6..5a8c6e4deb7ac 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -38,7 +38,7 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.distributed import parallel_state +from vllm.distributed import parallel_state, tensor_model_parallel_all_gather from vllm.distributed import utils as dist_utils from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata @@ -239,6 +239,8 @@ def __init__( super().__init__() # Per attention head and per partition values. world_size = parallel_state.get_tensor_model_parallel_world_size() + self.tp_size = world_size + self.tp_rank = parallel_state.get_tensor_model_parallel_rank() self.hidden_size_per_attention_head = dist_utils.divide( projection_size, num_heads) self.num_attention_heads_per_partition = dist_utils.divide( @@ -261,24 +263,41 @@ def __init__( raise RuntimeError( f"Qwen2-VL does not support {self.attn_backend} backend now.") + def split_qkv(self, qkv: torch.Tensor) -> tuple[torch.Tensor, ...]: + # [s, b, 3 * head * head_dim] + seq_len, bs, _ = qkv.shape + if self.tp_size > 1: + qkv = tensor_model_parallel_all_gather(qkv) + + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head * head_dim] + q, k, v = qkv.chunk(3, dim=2) + + # 3 * [s, b, head * head_dim] + if self.tp_size > 1: + splitter = partial(dist_utils.split_tensor_along_last_dim, + num_partitions=self.tp_size) + q = splitter(q)[self.tp_rank] + k = splitter(k)[self.tp_rank] + v = splitter(v)[self.tp_rank] + + # 3 * [s, b, head * head_dim] -> 3 * [s, b, head, head_dim] + new_shape = (seq_len, bs, self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head) + q, k, v = (x.view(*new_shape) for x in (q, k, v)) + return q, k, v + def forward( self, x: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor, ) -> torch.Tensor: - # [s, b, c] --> [s, b, head * 3 * head_dim] - x, _ = self.qkv(x) - # [s, b, head * 3 * head_dim] --> [s, b, head, 3 * head_dim] - new_x_shape = x.size()[:-1] + ( - self.num_attention_heads_per_partition, - 3 * self.hidden_size_per_attention_head, - ) - x = x.view(*new_x_shape) + # [s, b, c] --> [s, b, 3 * head * head_dim] + x, _ = self.qkv(x) - # [s, b, head, 3 * head_dim] --> 3 [s, b, head, head_dim] - q, k, v = dist_utils.split_tensor_along_last_dim(x, 3) + # [s, b, 3 * head * head_dim] -> 3 * [s, b, head, head_dim] + q, k, v = self.split_qkv(x) batch_size = q.shape[1] q, k, v = (rearrange(x, "s b ... -> b s ...").contiguous() @@ -614,24 +633,6 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight, shard_id) break else: - if name.endswith("qkv.weight"): - visual_num_heads = self.num_heads - visual_embed_dim = self.embed_dim - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size, - visual_embed_dim) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1, visual_embed_dim) - elif name.endswith("qkv.bias"): - visual_num_heads = self.num_heads - visual_embed_dim = self.embed_dim - head_size = visual_embed_dim // visual_num_heads - loaded_weight = loaded_weight.view(3, visual_num_heads, - head_size) - loaded_weight = loaded_weight.transpose(0, 1) - loaded_weight = loaded_weight.reshape(-1) - param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) @@ -935,6 +936,16 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, embedding_modules = {} embedding_padding_modules = [] + # BitandBytes specific attributes + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + # To ensure correct weight loading and mapping. hf_to_vllm_mapper = WeightsMapper(orig_to_new_prefix={ "lm_head.": "language_model.lm_head.", From bf0d97d78619b290ed273199ad3800b57b638603 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 3 Jan 2025 17:36:46 -0500 Subject: [PATCH 065/462] Update requirements-tpu.txt to support python 3.9 and 3.11 (#11695) Signed-off-by: mgoin --- requirements-tpu.txt | 4 +++- vllm/worker/tpu_model_runner.py | 6 ++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requirements-tpu.txt b/requirements-tpu.txt index b8f0b15469e77..8ab18b3770ae8 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -18,6 +18,8 @@ ray[default] --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html torch==2.6.0.dev20241126+cpu torchvision==0.20.0.dev20241126+cpu -torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" +torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" jaxlib==0.4.36.dev20241122 jax==0.4.36.dev20241122 diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 9a054eb8a4cf7..7bdb7f0e2d6a9 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -126,8 +126,10 @@ def __init__( logger.warning( "The max_model_len (%d) is too large. This may degrade the " "performance due to the insufficient smem size. Consider " - "setting --max-model-len to a smaller value.", - self.model_config.max_model_len) + "setting --max-model-len to a smaller value, like %d.", + self.model_config.max_model_len, + self.model_config.max_model_len / + (block_table_size / smem_size)) def load_model(self) -> None: self.device = self.device_config.device From ad0d567e1cdc77aff435b20bac918bfd0f55db0a Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Fri, 3 Jan 2025 18:25:02 -0500 Subject: [PATCH 066/462] [V1] Chore: cruft removal (#11724) --- vllm/entrypoints/llm.py | 2 -- vllm/v1/engine/core_client.py | 2 -- vllm/v1/engine/llm_engine.py | 4 ---- vllm/v1/engine/processor.py | 3 --- 4 files changed, 11 deletions(-) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 7c0de3b3e5481..e48fd1a4fa5e9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -225,8 +225,6 @@ def __init__( # Logic to switch between engines is done at runtime instead of import # to avoid import order issues self.engine_class = self.get_engine_class() - - # TODO(rob): enable mp by default (issue with fork vs spawn) self.llm_engine = self.engine_class.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index 6a40c961fc1d7..a4a45ae05ff9e 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -94,8 +94,6 @@ class InprocClient(EngineCoreClient): * pushes EngineCoreRequest directly into the EngineCore * pulls EngineCoreOutputs by stepping the EngineCore - - TODO: support asyncio-mode for debugging. """ def __init__(self, *args, **kwargs): diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 1f49de67d7493..0bd9b52c9be82 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -42,8 +42,6 @@ def __init__( use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: - - # TODO: Can we avoid this? self.model_config = vllm_config.model_config # Tokenizer (+ ensure liveness if running in another process). @@ -179,8 +177,6 @@ def step(self) -> List[RequestOutput]: return request_outputs - # TODO(rob): Can we get rid of these? - def get_model_config(self): return self.model_config diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 905d3d1fc3e1c..c0f6cfab4865c 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -49,9 +49,6 @@ def __init__( cache_config.enable_prefix_caching self.mm_hasher = MMHasher() - # TODO: run in an ThreadpoolExecutor or BackgroundProcess. - # This ideally should releases the GIL, so we should not block the - # asyncio loop while this is running. def process_inputs( self, request_id: str, From e5d7ed0c5374d38e75a8ef0243cc348f0f6f9185 Mon Sep 17 00:00:00 2001 From: WangErXiao <863579016@qq.com> Date: Sat, 4 Jan 2025 08:13:12 +0800 Subject: [PATCH 067/462] [V1] log GPU blocks num for MultiprocExecutor (#11656) --- vllm/v1/executor/multiproc_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py index 114deae980d01..41e6abbd67956 100644 --- a/vllm/v1/executor/multiproc_executor.py +++ b/vllm/v1/executor/multiproc_executor.py @@ -95,6 +95,7 @@ def initialize(self, num_gpu_blocks: int) -> None: Initialize the KV caches and begin the model execution loop of the underlying workers. """ + logger.info("# GPU blocks: %d", num_gpu_blocks) self.collective_rpc("initialize_cache", args=(num_gpu_blocks, )) self.collective_rpc("compile_or_warm_up_model") From 9c93636d84414591ae4d7b9c1174af7e91052fd8 Mon Sep 17 00:00:00 2001 From: Hust_YangXian Date: Sat, 4 Jan 2025 14:16:30 +0800 Subject: [PATCH 068/462] Update tool_calling.md (#11701) --- docs/source/usage/tool_calling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/usage/tool_calling.md b/docs/source/usage/tool_calling.md index 34b26647a959f..062f2021eb62a 100644 --- a/docs/source/usage/tool_calling.md +++ b/docs/source/usage/tool_calling.md @@ -10,7 +10,7 @@ Start the server with tool calling enabled. This example uses Meta's Llama 3.1 8 vllm serve meta-llama/Llama-3.1-8B-Instruct \ --enable-auto-tool-choice \ --tool-call-parser llama3_json \ - --chat-template examples/tool_chat_template_llama3_json.jinja + --chat-template examples/tool_chat_template_llama3.1_json.jinja ``` Next, make a request to the model that should result in it using the available tools: From d1d49397e7f8d1ac472d763dae395b67fdda1ef8 Mon Sep 17 00:00:00 2001 From: Alberto Ferrer Date: Sat, 4 Jan 2025 00:29:02 -0600 Subject: [PATCH 069/462] Update bnb.md with example for OpenAI (#11718) --- docs/source/quantization/bnb.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/quantization/bnb.md b/docs/source/quantization/bnb.md index 8240eca1c7e03..f7f41726f3725 100644 --- a/docs/source/quantization/bnb.md +++ b/docs/source/quantization/bnb.md @@ -37,3 +37,10 @@ model_id = "huggyllama/llama-7b" llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ quantization="bitsandbytes", load_format="bitsandbytes") ``` +## OpenAI Compatible Server + +Append the following to your 4bit model arguments: + +``` +--quantization bitsandbytes --load-format bitsandbytes +``` From fbf25645542fdcfb3f1a27ba05486492e368925c Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sat, 4 Jan 2025 14:41:31 +0800 Subject: [PATCH 070/462] [V1] Add `RayExecutor` support for `AsyncLLM` (api server) (#11712) --- vllm/v1/engine/async_llm.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 564d8a8343bef..0696caf88385d 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -22,6 +22,7 @@ from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor +from vllm.v1.executor.ray_utils import initialize_ray_cluster logger = init_logger(__name__) @@ -131,7 +132,11 @@ def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: executor_class: Type[Executor] distributed_executor_backend = ( vllm_config.parallel_config.distributed_executor_backend) - if distributed_executor_backend == "mp": + if distributed_executor_backend == "ray": + initialize_ray_cluster(vllm_config.parallel_config) + from vllm.v1.executor.ray_executor import RayExecutor + executor_class = RayExecutor + elif distributed_executor_backend == "mp": from vllm.v1.executor.multiproc_executor import MultiprocExecutor executor_class = MultiprocExecutor else: From d91457d529c2df5d66bdfd939b90b7c75a9729b8 Mon Sep 17 00:00:00 2001 From: xcnick Date: Sat, 4 Jan 2025 14:49:46 +0800 Subject: [PATCH 071/462] [V1] Add kv cache utils tests. (#11513) Signed-off-by: xcnick --- tests/v1/core/test_kv_cache_utils.py | 241 +++++++++++++++++++++++++++ 1 file changed, 241 insertions(+) create mode 100644 tests/v1/core/test_kv_cache_utils.py diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py new file mode 100644 index 0000000000000..faa3a91de151f --- /dev/null +++ b/tests/v1/core/test_kv_cache_utils.py @@ -0,0 +1,241 @@ +import pytest + +from vllm.inputs import token_inputs +from vllm.sampling_params import SamplingParams +from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, + KVCacheBlock, + generate_block_hash_extra_keys, + hash_block_tokens, + hash_request_tokens) +from vllm.v1.request import Request + + +def make_request(request_id, + prompt_token_ids, + mm_positions=None, + mm_hashes=None): + return Request( + request_id=request_id, + inputs=token_inputs( + prompt_token_ids=prompt_token_ids, + multi_modal_placeholders={"image": mm_positions} + if mm_positions else None, + multi_modal_hashes=mm_hashes, + ), + sampling_params=SamplingParams(max_tokens=17), + eos_token_id=100, + arrival_time=0, + lora_request=None, + ) + + +def test_kv_cache_block(): + # Test KVCacheBlock initialization + block = KVCacheBlock(block_id=0) + assert block.block_id == 0 + assert block.ref_cnt == 0 + assert block.block_hash is None + + # Test reference count manipulation + block.incr_ref() + assert block.ref_cnt == 1 + block.decr_ref() + assert block.ref_cnt == 0 + + # Test block hash setting and resetting + block_hash = BlockHashType(hash_value=123, token_ids=(1, 2, 3)) + block.block_hash = block_hash + assert block.block_hash == block_hash + + block.reset_hash() + assert block.block_hash is None + + +def test_free_kv_cache_block_queue_initialization(): + # Test with a single block + block = KVCacheBlock(block_id=0) + queue = FreeKVCacheBlockQueue([block]) + assert queue.num_free_blocks == 1 + assert queue.free_list_head == block + assert queue.free_list_tail == block + + +def test_free_kv_cache_block_queue_operations(): + # Create a list of KVCacheBlock objects + blocks = [KVCacheBlock(block_id=i) for i in range(5)] + + # Create a FreeKVCacheBlockQueue with these blocks + queue = FreeKVCacheBlockQueue(blocks) + + # Check initial state + assert queue.num_free_blocks == 5 + assert queue.free_list_head == blocks[0] + assert queue.free_list_tail == blocks[4] + + # Pop the first block + block1 = queue.popleft() + assert block1 == blocks[0] + assert queue.num_free_blocks == 4 + assert queue.free_list_head == blocks[1] + assert queue.free_list_tail == blocks[4] + + # Remove a block from the middle + block_to_remove = blocks[2] + queue.remove(block_to_remove) + assert queue.num_free_blocks == 3 + assert blocks[1].next_free_block == blocks[3] + assert blocks[3].prev_free_block == blocks[1] + + # Append a block back + queue.append(block_to_remove) + assert queue.num_free_blocks == 4 + assert queue.free_list_tail == block_to_remove + assert block_to_remove.prev_free_block == blocks[4] + assert block_to_remove.next_free_block is None + + # Pop blocks until empty + for _ in range(4): + queue.popleft() + assert queue.num_free_blocks == 0 + assert queue.free_list_head is None + assert queue.free_list_tail is None + + # Attempt to pop from an empty queue + with pytest.raises(ValueError) as e: + queue.popleft() + assert str(e.value) == "No free blocks available" + + +def test_free_kv_cache_block_queue_get_all_free_blocks(): + # Create a list of KVCacheBlock objects + blocks = [KVCacheBlock(block_id=i) for i in range(5)] + + # Create a FreeKVCacheBlockQueue with these blocks + queue = FreeKVCacheBlockQueue(blocks) + + # Check all blocks are correctly retrieved + assert queue.get_all_free_blocks() == blocks + + # Pop a block and check again + queue.popleft() + assert queue.get_all_free_blocks() == blocks[1:] + + # Remove a block and check again + block_to_remove = blocks[2] + queue.remove(block_to_remove) + assert queue.get_all_free_blocks() == blocks[1:2] + blocks[3:] + + # Append a block back and check again + queue.append(block_to_remove) + assert queue.get_all_free_blocks() == \ + blocks[1:2] + blocks[3:] + [block_to_remove] + + +def test_generate_block_hash_extra_keys(): + request = make_request( + request_id=0, + prompt_token_ids=[_ for _ in range(20)], + mm_positions=[{ + "offset": 0, + "length": 5 + }, { + "offset": 10, + "length": 5 + }], + mm_hashes=["hash1", "hash2"], + ) + + # Test with no extra keys + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0) + assert extra_keys == (("hash1", 0), ) + assert next_mm_idx == 1 + + # Test with partial overlap + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0) + assert extra_keys == (("hash1", 3), ) + assert next_mm_idx == 1 + + # Test with no overlap + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 6, 10, 0) + assert extra_keys == () + assert next_mm_idx == 1 + + # Test with multiple extra keys + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0) + assert extra_keys == (("hash1", 0), ("hash2", 0)) + assert next_mm_idx == 2 + + +def test_generate_block_hash_extra_keys_no_mm_inputs(): + request = make_request( + request_id=0, + prompt_token_ids=[_ for _ in range(6)], + mm_positions=None, + mm_hashes=None, + ) + + extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0) + assert extra_keys is None + assert next_mm_idx == 0 + + +def test_hash_block_tokens(): + parent_block_hash = 123 + curr_block_token_ids = (1, 2, 3) + extra_keys = ("key1", "key2") + + block_hash = hash_block_tokens(parent_block_hash, curr_block_token_ids, + extra_keys) + assert isinstance(block_hash, BlockHashType) + assert block_hash.hash_value == hash( + (parent_block_hash, *curr_block_token_ids)) + assert block_hash.token_ids == curr_block_token_ids + assert block_hash.extra_keys == extra_keys + + +def test_hash_request_tokens(): + request = make_request( + request_id=0, + prompt_token_ids=[_ for _ in range(6)], + mm_positions=[{ + "offset": 0, + "length": 3 + }, { + "offset": 3, + "length": 3 + }], + mm_hashes=["hash1", "hash2"], + ) + + block_size = 3 + block_hashes = hash_request_tokens(block_size, request) + + assert len(block_hashes) == 2 + assert isinstance(block_hashes[0], BlockHashType) + assert isinstance(block_hashes[1], BlockHashType) + + # Check the first block + assert block_hashes[0].token_ids == (0, 1, 2) + assert block_hashes[0].extra_keys == (("hash1", 0), ) + + # Check the second block + assert block_hashes[1].token_ids == (3, 4, 5) + assert block_hashes[1].extra_keys == (("hash2", 0), ) + + +def test_hash_request_tokens_no_mm_inputs(): + request = make_request( + request_id=0, + prompt_token_ids=[_ for _ in range(6)], + mm_positions=None, + mm_hashes=None, + ) + + block_size = 3 + block_hashes = hash_request_tokens(block_size, request) + + assert len(block_hashes) == 2 + assert block_hashes[0].token_ids == (0, 1, 2) + assert block_hashes[0].extra_keys is None + assert block_hashes[1].token_ids == (3, 4, 5) + assert block_hashes[1].extra_keys is None From 300acb83472512b14ec7ba8cdf45efe07e8c8f68 Mon Sep 17 00:00:00 2001 From: Yan Burman Date: Sat, 4 Jan 2025 08:50:16 +0200 Subject: [PATCH 072/462] [Core][Bugfix] Use correct device to initialize GPU data during CUDA-graph-capture (#11233) Signed-off-by: Yan Burman Signed-off-by: Ido Asraff --- tests/distributed/test_custom_all_reduce.py | 2 +- tests/distributed/test_pynccl.py | 2 +- vllm/distributed/parallel_state.py | 7 +++--- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/worker/model_runner.py | 25 +++++++++++++-------- 5 files changed, 23 insertions(+), 15 deletions(-) diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 86ca1948ef94a..4072616fd30e2 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -50,7 +50,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): for sz in test_sizes: for dtype in [torch.float32, torch.float16, torch.bfloat16]: - with graph_capture() as graph_capture_context: + with graph_capture(device=device) as graph_capture_context: # use integers so result matches NCCL exactly inp1 = torch.randint(1, 16, (sz, ), diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 3e9b0e10a11d8..36cfe42251384 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -107,7 +107,7 @@ def multiple_allreduce_with_vllm_worker_fn(): device = torch.device(f"cuda:{torch.distributed.get_rank()}") ensure_model_parallel_initialized(2, 2) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) - with graph_capture(): + with graph_capture(device=device): # two tp groups can communicate independently if torch.distributed.get_rank() in [0, 1]: tensor = tensor_model_parallel_all_reduce(tensor) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index e6768467f4c27..a0d4235460f3b 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -920,7 +920,7 @@ def get_kv_transfer_group() -> kv_transfer.KVTransferAgent: @contextmanager -def graph_capture(): +def graph_capture(device: torch.device): """ `graph_capture` is a context manager which should surround the code that is capturing the CUDA graph. Its main purpose is to ensure that the @@ -934,8 +934,9 @@ def graph_capture(): in order to explicitly distinguish the kernels to capture from other kernels possibly launched on background in the default stream. """ - with get_tp_group().graph_capture() as context, get_pp_group( - ).graph_capture(context): + context = GraphCaptureContext(torch.cuda.Stream(device=device)) + with get_tp_group().graph_capture(context), get_pp_group().graph_capture( + context): yield context diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 75098b0330ac9..294c76cfb680e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -836,7 +836,7 @@ def capture_model(self) -> None: # Trigger CUDA graph capture for specific shapes. # Capture the large shapes first so that the smaller shapes # can reuse the memory pool allocated for the large shapes. - with graph_capture(): + with graph_capture(device=self.device): for num_tokens in reversed(self.cudagraph_batch_sizes): for _ in range(self.vllm_config.compilation_config. cudagraph_num_of_warmups): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 637fba23611f4..1c6d1bbee78ee 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1426,10 +1426,15 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: # Prepare dummy inputs. These will be reused for all batch sizes. max_batch_size = self.max_batchsize_to_capture - input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda() - input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda() + input_tokens = torch.zeros(max_batch_size, + dtype=torch.long, + device=self.device) + input_positions = torch.zeros(max_batch_size, + dtype=torch.long, + device=self.device) if self.model_config.uses_mrope: - input_positions = torch.tile(input_positions, (3, 1)) + input_positions = torch.tile(input_positions, + (3, 1)).cuda(device=self.device) # Prepare dummy previous_hidden_states only if needed by the model. # This is used by draft models such as EAGLE. previous_hidden_states = None @@ -1448,8 +1453,8 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: dtype=self.model_config.dtype, device=self.device) - with self.attn_state.graph_capture( - max_batch_size), graph_capture() as graph_capture_context: + with self.attn_state.graph_capture(max_batch_size), graph_capture( + self.device) as graph_capture_context: # NOTE: Capturing the largest batch size first may help reduce the # memory usage of CUDA graph. for virtual_engine in range( @@ -1549,10 +1554,12 @@ def _update_inputs_to_capture_for_enc_dec_model(self, """ # During the decode phase encoder_input_ids and encoder_positions are # unset. Do the same thing for graph capture. - capture_inputs["encoder_input_ids"] = torch.tensor( - [], dtype=torch.long).cuda() - capture_inputs["encoder_positions"] = torch.tensor( - [], dtype=torch.long).cuda() + capture_inputs["encoder_input_ids"] = torch.tensor([], + dtype=torch.long, + device=self.device) + capture_inputs["encoder_positions"] = torch.tensor([], + dtype=torch.long, + device=self.device) @property def vocab_size(self) -> int: From eed11ebee93e9d137ac74d8e6e97427354bd3797 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 4 Jan 2025 19:40:53 +0800 Subject: [PATCH 073/462] [VLM] Merged multi-modal processors for LLaVA-NeXT-Video and LLaVA-OneVision (#11717) Signed-off-by: DarkLight1337 --- .../__init__.py | 0 .../test_idefics3.py | 0 .../test_internvl.py | 0 .../processing/test_llava_next.py | 58 ++ .../processing/test_llava_onevision.py | 59 ++ .../test_phi3v.py | 44 +- .../test_qwen.py | 0 .../test_qwen2_vl.py | 39 +- .../vision_language/test_models.py | 9 +- .../vision_language/test_qwen2_vl.py | 127 ----- tests/multimodal/test_processing.py | 170 +++--- vllm/model_executor/models/aria.py | 5 +- vllm/model_executor/models/blip2.py | 5 +- vllm/model_executor/models/chameleon.py | 5 +- vllm/model_executor/models/clip.py | 11 +- vllm/model_executor/models/fuyu.py | 5 +- vllm/model_executor/models/llava.py | 75 ++- vllm/model_executor/models/llava_next.py | 15 +- .../model_executor/models/llava_next_video.py | 273 +++++---- vllm/model_executor/models/llava_onevision.py | 531 ++++++++---------- vllm/model_executor/models/phi3v.py | 26 +- vllm/model_executor/models/pixtral.py | 11 +- vllm/model_executor/models/qwen2_audio.py | 15 +- vllm/model_executor/models/qwen2_vl.py | 199 ++++--- vllm/model_executor/models/siglip.py | 11 +- vllm/model_executor/models/ultravox.py | 11 +- vllm/model_executor/models/vision.py | 37 +- vllm/multimodal/parse.py | 14 + vllm/multimodal/processing.py | 326 +++++++---- vllm/multimodal/registry.py | 3 +- vllm/transformers_utils/tokenizer.py | 13 + 31 files changed, 1114 insertions(+), 983 deletions(-) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/__init__.py (100%) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_idefics3.py (100%) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_internvl.py (100%) create mode 100644 tests/models/decoder_only/vision_language/processing/test_llava_next.py create mode 100644 tests/models/decoder_only/vision_language/processing/test_llava_onevision.py rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_phi3v.py (60%) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_qwen.py (100%) rename tests/models/decoder_only/vision_language/{mm_processor_kwargs => processing}/test_qwen2_vl.py (64%) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py b/tests/models/decoder_only/vision_language/processing/__init__.py similarity index 100% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/__init__.py rename to tests/models/decoder_only/vision_language/processing/__init__.py diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/tests/models/decoder_only/vision_language/processing/test_idefics3.py similarity index 100% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py rename to tests/models/decoder_only/vision_language/processing/test_idefics3.py diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py b/tests/models/decoder_only/vision_language/processing/test_internvl.py similarity index 100% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_internvl.py rename to tests/models/decoder_only/vision_language/processing/test_internvl.py diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py new file mode 100644 index 0000000000000..6772130c9b884 --- /dev/null +++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py @@ -0,0 +1,58 @@ +import pytest +from PIL import Image +from transformers import AutoTokenizer + +from vllm.inputs import InputProcessingContext + +from ....utils import build_model_context + + +# Fixtures lazy import to avoid initializing CUDA during test collection +@pytest.fixture() +def processor_for_llava_next(): + from vllm.model_executor.models.llava_next import ( + LlavaNextMultiModalProcessor) + return LlavaNextMultiModalProcessor + + +# FIXME: image_size [(198, 176), (176, 198)] +@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) +@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), + (488, 183)]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements( + processor_for_llava_next, + model_id: str, + image_size: tuple[int, int], + num_imgs: int, +): + """ + Ensure LlavaNextMultiModalProcessor handles prompt replacement properly. + """ + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + ctx = InputProcessingContext(ctx.model_config, tokenizer) + + # Build the image str / prompt based on the number of images we pass + prompt = "" * num_imgs + mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} + + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processor = processor_for_llava_next(ctx) + processed_inputs = processor.apply(prompt, mm_data, {}) + + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + # NOTE: There is a BOS token + assert first_placeholder["offset"] == 1 + assert first_placeholder["length"] == ( + len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py new file mode 100644 index 0000000000000..71adde6568a17 --- /dev/null +++ b/tests/models/decoder_only/vision_language/processing/test_llava_onevision.py @@ -0,0 +1,59 @@ +import pytest +from PIL import Image +from transformers import AutoTokenizer + +from vllm.inputs import InputProcessingContext + +from ....utils import build_model_context + + +# Fixtures lazy import to avoid initializing CUDA during test collection +@pytest.fixture() +def processor_for_llava_onevision(): + from vllm.model_executor.models.llava_onevision import ( + LlavaOnevisionMultiModalProcessor) + return LlavaOnevisionMultiModalProcessor + + +@pytest.mark.parametrize("model_id", + ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]) +@pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), + (488, 183), (198, 176), (176, 198)]) +@pytest.mark.parametrize("num_imgs", [1, 2]) +def test_processor_prompt_replacements( + processor_for_llava_onevision, + model_id: str, + image_size: tuple[int, int], + num_imgs: int, +): + """ + Ensure LlavaOnevisionMultiModalProcessor handles prompt replacement + properly. + """ + ctx = build_model_context( + model_name=model_id, + tokenizer_name=model_id, + mm_processor_kwargs=None, + limit_mm_per_prompt={"image": num_imgs}, + ) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) + ctx = InputProcessingContext(ctx.model_config, tokenizer) + + # Build the image str / prompt based on the number of images we pass + prompt = "" * num_imgs + mm_data = {"image": [Image.new("RGB", size=image_size)] * num_imgs} + + # The processor will throw an error if there is a mismatch + # in the prompt replacements + processor = processor_for_llava_onevision(ctx) + processed_inputs = processor.apply(prompt, mm_data, {}) + + image_placeholders = processed_inputs["mm_placeholders"]["image"] + assert len(image_placeholders) == num_imgs + + first_placeholder = image_placeholders[0] + + # NOTE: There is a BOS token + assert first_placeholder["offset"] == 0 + assert first_placeholder["length"] == len( + processed_inputs["prompt_token_ids"]) // num_imgs diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/processing/test_phi3v.py similarity index 60% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py rename to tests/models/decoder_only/vision_language/processing/test_phi3v.py index 3edf96d11106d..249045b3c04ce 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/processing/test_phi3v.py @@ -1,6 +1,4 @@ """Tests for phi3v's multimodal preprocessing kwargs.""" -from typing import Optional - import pytest from transformers import AutoTokenizer @@ -10,8 +8,6 @@ from .....conftest import _ImageAssets from ....utils import build_model_context -models = ["microsoft/Phi-3.5-vision-instruct"] - # Wrap lazy imports to avoid initializing CUDA during test collection @pytest.fixture() @@ -20,40 +16,40 @@ def processor_for_phi3v(): return Phi3VMultiModalProcessor -@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"]) +# yapf: disable @pytest.mark.parametrize( - "num_crops,expected_toks_per_img", + ("mm_processor_kwargs", "expected_toks_per_img"), [ - (4, 757), - (16, 1921), + ({"num_crops": 4}, 757), + ({"num_crops": 16}, 1921), # the default num_crops of phi-3.5-vision is 4 - (None, 757), + ({}, 757), ]) +# yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) -def test_processor_override(processor_for_phi3v, image_assets: _ImageAssets, - model: str, num_crops: Optional[int], - expected_toks_per_img: int, num_imgs: int): +def test_processor_override( + processor_for_phi3v, + image_assets: _ImageAssets, + model_id: str, + mm_processor_kwargs: dict[str, int], + expected_toks_per_img: int, + num_imgs: int, +): """Ensure input_processor_for_phi3v handles num_crops properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the custom input processor. ctx = build_model_context( - model_name=model, - tokenizer_name=model, + model_name=model_id, + tokenizer_name=model_id, trust_remote_code=True, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) ctx = InputProcessingContext(ctx.model_config, tokenizer) + # Build the image str / prompt based on the number of images we pass img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)]) prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n" - images = [image_assets[0].pil_image] * num_imgs - - mm_data = {"image": images} - mm_processor_kwargs = {} - if num_crops is not None: - mm_processor_kwargs = {"num_crops": num_crops} + mm_data = {"image": [image_assets[0].pil_image] * num_imgs} processor = processor_for_phi3v(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/processing/test_qwen.py similarity index 100% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py rename to tests/models/decoder_only/vision_language/processing/test_qwen.py diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py similarity index 64% rename from tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py rename to tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py index 1f0b482666723..b9ac887edf90f 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/processing/test_qwen2_vl.py @@ -1,5 +1,3 @@ -from typing import Any, Dict, Tuple - import pytest from transformers import AutoTokenizer @@ -8,56 +6,45 @@ from .....conftest import _ImageAssets from ....utils import build_model_context -MODEL = "Qwen/Qwen2-VL-2B-Instruct" -MIN_PIXELS = "min_pixels" -MAX_PIXELS = "max_pixels" - # Fixtures lazy import to avoid initializing CUDA during test collection -# NOTE: Qwen2VL supports multiple input modalities, so it registers multiple -# input mappers. @pytest.fixture() def processor_for_qwen2_vl(): from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalProcessor return Qwen2VLMultiModalProcessor +@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) +# yapf: disable @pytest.mark.parametrize( - "mm_processor_kwargs, expected_toks_per_img, expected_pixels_shape", [ + ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [ ({}, 1426, (5704, 1176)), - ({ - MIN_PIXELS: 64**2, - MAX_PIXELS: 512**2 - }, 330, (1320, 1176)), + ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)), ]) -@pytest.mark.parametrize("model", [MODEL]) +# yapf: enable @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_override( processor_for_qwen2_vl, image_assets: _ImageAssets, - model: str, - mm_processor_kwargs: Dict[str, Any], + model_id: str, + mm_processor_kwargs: dict[str, object], expected_toks_per_img: int, - expected_pixels_shape: Tuple[int, int], + expected_pixels_shape: tuple[int, int], num_imgs: int, ): """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly.""" - # Same as the previous test - don't initialize mm_processor_kwargs - # in this test and assume that the kwargs will be correctly expanded by - # the partial when calling the custom input processor. ctx = build_model_context( - model_name=model, - tokenizer_name=model, + model_name=model_id, + tokenizer_name=model_id, mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) ctx = InputProcessingContext(ctx.model_config, tokenizer) + # Build the image str / prompt based on the number of images we pass prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs - images = [image_assets[0].pil_image] * num_imgs - - mm_data = {"image": images} + mm_data = {"image": [image_assets[0].pil_image] * num_imgs} processor = processor_for_qwen2_vl(ctx) processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 7db08166826eb..dc0b683c1f1cb 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -274,10 +274,8 @@ ), limit_mm_per_prompt={"image": 4}, )], - # Llava-next tests fixed sizes & the default size factors - image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], ), - "llava_one_vision": VLMTestInfo( + "llava_onevision": VLMTestInfo( models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.CUSTOM_INPUTS, prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 @@ -288,8 +286,6 @@ ), auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, - # Llava-one-vision tests fixed sizes & the default size factors - image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], custom_test_opts=[CustomTestOptions( inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs( formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 @@ -306,7 +302,6 @@ max_model_len=4096, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output, - image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], ), "mantis": VLMTestInfo( models=["TIGER-Lab/Mantis-8B-siglip-llama3"], @@ -431,7 +426,7 @@ ) for inp in custom_inputs.different_patch_input_cases_internvl() ], ), - "llava_one_vision-multiple-images": VLMTestInfo( + "llava_onevision-multiple-images": VLMTestInfo( models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 51fe7d2ad32a8..16e256e040a74 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -427,130 +427,3 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, mm_limit=1, tensor_parallel_size=1, ) - - -def run_chunked_prefill_test( - vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - mm_limit: int, - tensor_parallel_size: int, - distributed_executor_backend: Optional[str] = None, -): - """Compare inference result between - chunked prefill disabled and chunked prefill enabled - """ - - # NOTE: - # max_model_len should be greater than image_feature_size - with vllm_runner(model, - task="generate", - max_model_len=4000, - max_num_seqs=4, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend - ) as vllm_model: - - outputs_per_case = [ - vllm_model.generate_greedy_logprobs(prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images or None, - videos=videos or None) - for prompts, images, videos in inputs - ] - - with vllm_runner( - model, - task="generate", - max_model_len=4000, - max_num_seqs=4, - dtype=dtype, - limit_mm_per_prompt={ - "image": mm_limit, - "video": mm_limit - }, - tensor_parallel_size=tensor_parallel_size, - distributed_executor_backend=distributed_executor_backend, - enable_chunked_prefill=True, - # should be small enough to ensure prefilling is chunked - max_num_batched_tokens=32, - mm_processor_kwargs={ - "max_pixels": 16 * 28 * 28, - }) as vllm_model_chunked: - outputs_per_case_chunked = [ - vllm_model_chunked.generate_greedy_logprobs( - prompts, - max_tokens, - num_logprobs=num_logprobs, - images=images or None, - videos=videos or None) for prompts, images, videos in inputs - ] - - for outputs, \ - outputs_chunked \ - in zip(outputs_per_case, - outputs_per_case_chunked): - check_logprobs_close( - outputs_0_lst=outputs, - outputs_1_lst=outputs_chunked, - name_0="non_chunked", - name_1="chunked", - ) - - -@pytest.mark.core_model -@pytest.mark.parametrize("model", models) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [1]) -@pytest.mark.parametrize("num_logprobs", [10]) -def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts, - model: str, dtype: str, - max_tokens: int, - num_logprobs: int) -> None: - """ - Test Qwen2-VL's chunked prefill with M-RoPE - """ - prompts = [ - qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt) - for prompt in example_prompts[:1] - ] - - # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs, - # so an image is included in the inputs - # 2. however, Qwen2-VL currently won't work properly - # when chunked prefill is enabled and there are some multi-modal inputs, - # here use a hacky way: provide a **zero-length** image to make it happy - # - # and finally we achieved: - # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests - zero_len_image = { - "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)), - "image_grid_thw": torch.tensor([[0, 0, 0]]) - } - images = [zero_len_image] * len(prompts) - - inputs_per_case: List[Tuple[List[str], PromptImageInput, - PromptVideoInput]] = [ - (prompts, images, []), - ] - - run_chunked_prefill_test( - vllm_runner, - inputs_per_case, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - mm_limit=1, - tensor_parallel_size=1, - ) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index f99d7556b27f9..b32faa699ebf2 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -11,8 +11,8 @@ from vllm.inputs import InputProcessingContext from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.processing import (ProcessingCache, PromptReplacement, - _PlaceholderInfo, find_text_matches, - find_token_matches, iter_placeholders, + _PlaceholderInfo, find_mm_placeholders, + find_text_matches, find_token_matches, iter_token_matches, replace_text_matches, replace_token_matches) @@ -314,21 +314,27 @@ def test_find_replace_text( # Should not be used since there is nothing to convert to text mock_tokenizer = cast(AnyTokenizer, object()) - prompt_repls = [ - PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) + mm_prompt_repls = { + key: [ + PromptReplacement(key, target, + repl_by_key[key]).bind(mock_tokenizer) + ] for key, target in target_by_key.items() - ] - matches = find_text_matches(prompt, prompt_repls) + } + mm_matches = { + key: find_text_matches(prompt, prompt_repls) + for key, prompt_repls in mm_prompt_repls.items() + } result = replace_text_matches( prompt, - matches, + mm_matches, {key: mm_count for key in repl_by_key}, ) # Only displayed on error - print("matches:", matches) + print("mm_matches:", mm_matches) print("result:", result) # Manually constructed results @@ -380,21 +386,27 @@ def test_find_replace_tokens( # Should not be used since there is nothing to convert to tokens mock_tokenizer = cast(AnyTokenizer, object()) - prompt_repls = [ - PromptReplacement(key, target, repl_by_key[key]).bind(mock_tokenizer) + mm_prompt_repls = { + key: [ + PromptReplacement(key, target, + repl_by_key[key]).bind(mock_tokenizer) + ] for key, target in target_by_key.items() - ] - matches = find_token_matches(prompt, prompt_repls) + } + mm_matches = { + key: find_token_matches(prompt, prompt_repls) + for key, prompt_repls in mm_prompt_repls.items() + } result = replace_token_matches( prompt, - matches, + mm_matches, {key: mm_count for key in repl_by_key}, ) # Only displayed on error - print("matches:", matches) + print("mm_matches:", mm_matches) print("result:", result) # Manually constructed results @@ -417,58 +429,76 @@ def test_find_replace_tokens( [ ( [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918], - [ - _PlaceholderInfo( - modality="pattern_1", - start_idx=6, - replacement=[32000, 32000], - ), - ], + { + "pattern_1": [ + _PlaceholderInfo( + modality="pattern_1", + item_idx=0, + start_idx=6, + replacement=[32000, 32000], + ), + ], + } + ), ( [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], - [ - _PlaceholderInfo( - modality="pattern_1", - start_idx=1, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_1", - start_idx=5, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_3", - start_idx=7, - replacement=[1550, 918, 1550], - ), - ], + { + "pattern_1": [ + _PlaceholderInfo( + modality="pattern_1", + item_idx=0, + start_idx=1, + replacement=[32000, 32000], + ), + _PlaceholderInfo( + modality="pattern_1", + item_idx=1, + start_idx=5, + replacement=[32000, 32000], + ), + ], + "pattern_3": [ + _PlaceholderInfo( + modality="pattern_3", + item_idx=0, + start_idx=7, + replacement=[1550, 918, 1550], + ), + ], + } ), ( [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550], - [ - _PlaceholderInfo( - modality="pattern_1", - start_idx=1, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_1", - start_idx=3, - replacement=[32000, 32000], - ), - _PlaceholderInfo( - modality="pattern_3", - start_idx=6, - replacement=[1550, 918, 1550], - ), - ], + { + "pattern_1": [ + _PlaceholderInfo( + modality="pattern_1", + item_idx=0, + start_idx=1, + replacement=[32000, 32000], + ), + _PlaceholderInfo( + modality="pattern_1", + item_idx=1, + start_idx=3, + replacement=[32000, 32000], + ), + ], + "pattern_3": [ + _PlaceholderInfo( + modality="pattern_3", + item_idx=0, + start_idx=6, + replacement=[1550, 918, 1550], + ), + ], + } ), ] ) # yapf: enable -def test_iter_placeholders( +def test_find_mm_placeholders( repl_by_key, prompt, expected, @@ -476,19 +506,18 @@ def test_iter_placeholders( # Should not be used since there is nothing to convert to tokens mock_tokenizer = cast(AnyTokenizer, object()) - prompt_repls = [ - PromptReplacement(key, [], repl).bind(mock_tokenizer) + mm_prompt_repls = { + key: [PromptReplacement(key, [], repl).bind(mock_tokenizer)] for key, repl in repl_by_key.items() - ] + } - result = list( - iter_placeholders( - prompt_repls, - prompt, - # Effectively match all occurrences in the prompt - {key: 3 - for key in repl_by_key}, - )) + result = find_mm_placeholders( + mm_prompt_repls, + prompt, + # Effectively match all occurrences in the prompt + {key: 3 + for key in repl_by_key}, + ) # Only displayed on error print("result:", result) @@ -694,7 +723,10 @@ def _test_processing_cache_correctness( } mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = baseline_processor._get_dummy_mm_inputs(mm_counts).prompt_text + prompt = baseline_processor._get_dummy_processor_inputs( + model_config.max_model_len, + mm_counts, + ).prompt_text # Drop unnecessary keys and test single -> multi conversion if rng.rand() < simplify_rate: @@ -728,6 +760,8 @@ def _test_processing_cache_correctness( ("adept/fuyu-8b", {"image": False}), ("llava-hf/llava-1.5-7b-hf", {"image": True}), ("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}), + ("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}), + ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501 ("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}), ("mistral-community/pixtral-12b", {"image": True}), ("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}), diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 4f0d679bd6c28..2fd4262a9d3b9 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -456,7 +456,7 @@ def _get_num_image_tokens(self) -> int: hf_config = self.ctx.get_hf_config() return max(hf_config.projector_patch_to_query_dict.values()) - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} def _get_mm_fields_config( @@ -488,8 +488,9 @@ def _get_prompt_replacements( ) ] - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config() diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 0fe10d8585215..b3ecb2f22dc19 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -405,7 +405,7 @@ def _get_num_image_tokens(self) -> int: hf_config = self.ctx.get_hf_config(Blip2Config) return hf_config.num_query_tokens - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} def _get_hf_processor(self) -> Blip2Processor: @@ -457,8 +457,9 @@ def apply( return result - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: hf_config = self.ctx.get_hf_config(Blip2Config) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 0bd0194243ceb..1ad44678a591d 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -57,7 +57,7 @@ def _get_num_image_tokens(self) -> int: processor = self._get_hf_processor() return processor.image_seq_length - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} def _get_hf_processor(self) -> ChameleonProcessor: @@ -90,8 +90,9 @@ def _get_prompt_replacements( ) ] - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: config = self.ctx.get_hf_config(ChameleonConfig) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 0188452054b8c..1bde45cb140cb 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -164,15 +164,18 @@ def get_num_image_tokens( def get_max_image_tokens(self) -> int: return get_max_clip_image_tokens(self.vision_config) - def get_num_patches(self) -> int: + def get_image_size(self) -> int: + return self.vision_config.image_size + + def get_patch_size(self) -> int: + return self.vision_config.patch_size + + def get_patch_grid_length(self) -> int: return get_clip_patch_grid_length( image_size=self.vision_config.image_size, patch_size=self.vision_config.patch_size, ) - def get_image_size(self) -> int: - return self.vision_config.image_size - # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa class CLIPVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 3680d01725238..7cd58fbc7cf21 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -96,7 +96,7 @@ def _get_image_feature_grid_size( nrows = math.ceil(image_height / 30) return ncols, nrows - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: target_width, target_height = self._get_image_target_size() max_ncols, max_nrows = self._get_image_feature_grid_size( @@ -208,8 +208,9 @@ def apply( return result - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: target_width, target_height = self._get_image_target_size() diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 78de27cd821c6..d522378e0bebb 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -25,11 +25,9 @@ NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize) -from vllm.multimodal.processing import (BaseMultiModalProcessor, - InputProcessingContext, +from vllm.multimodal.processing import (InputProcessingContext, MultiModalDataItems, ProcessingCache, - ProcessorInputs, PromptReplacement, - full_groupby_modality) + ProcessorInputs, PromptReplacement) from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel @@ -39,7 +37,7 @@ from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import vision_encoder_info +from .vision import BaseVisionLanguageMultiModalProcessor class LlavaImagePixelInputs(TypedDict): @@ -100,19 +98,7 @@ class LlavaLikeConfig(Protocol): vision_feature_layer: Final[Union[int, List[int]]] -class BaseLlavaMultiModalProcessor(BaseMultiModalProcessor): - - def __init__(self, - ctx: InputProcessingContext, - *, - cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True) -> None: - super().__init__(ctx, - cache=cache, - enable_sanity_checks=enable_sanity_checks) - - vision_config = self._get_hf_config().vision_config - self._vision_encoder_info = vision_encoder_info(vision_config) +class BaseLlavaMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): @abstractmethod def _get_hf_config(self) -> LlavaLikeConfig: @@ -121,6 +107,19 @@ def _get_hf_config(self) -> LlavaLikeConfig: def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self._get_max_image_tokens()} + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + def _apply_feature_select_strategy( self, strategy: str, @@ -142,19 +141,6 @@ def _get_max_image_tokens(self) -> int: self._vision_encoder_info.get_max_image_tokens(), ) - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: - return {"image": self._get_max_image_tokens()} - - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - ) - def _get_dummy_image_size(self) -> ImageSize: image_size = self._vision_encoder_info.get_image_size() return ImageSize(image_size, image_size) @@ -163,8 +149,9 @@ def _get_dummy_image_size(self) -> ImageSize: def _get_image_token(self) -> str: raise NotImplementedError - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) @@ -709,7 +696,7 @@ def get_replacement_mantis(item_idx: int): ")", # 3 tokens ]) - mantis_repls = self._bind_prompt_replacements([ + mantis_mm_repls = self._bind_and_group_repls([ PromptReplacement( modality="image", target=[image_token_id] * num_image_tokens, @@ -719,7 +706,7 @@ def get_replacement_mantis(item_idx: int): prompt_ids, prompt_text, _ = self._apply_prompt_replacements( result["prompt_token_ids"], - mantis_repls, + mantis_mm_repls, mm_item_counts, ) @@ -728,15 +715,19 @@ def get_replacement_mantis(item_idx: int): hf_processor_mm_kwargs, mm_kwargs, ) - orig_repls = self._bind_prompt_replacements(unbound_orig_repls) + orig_repls = self._bind_and_group_repls(unbound_orig_repls) + + mm_placeholders = self._find_mm_placeholders( + orig_repls, + prompt_ids, + mm_item_counts, + ) - all_placeholders = self._find_placeholders(orig_repls, prompt_ids, - mm_item_counts) - assert len(all_placeholders) == mm_item_counts.get("image", 0) + self._validate_mm_placeholders(mm_placeholders, mm_item_counts) - mm_placeholders = { - modality: [item.to_range() for item in items] - for modality, items in full_groupby_modality(all_placeholders) + mm_placeholder_ranges = { + modality: [item.to_range() for item in placeholders] + for modality, placeholders in mm_placeholders.items() } return MultiModalInputsV2( @@ -744,7 +735,7 @@ def get_replacement_mantis(item_idx: int): prompt=prompt_text, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_placeholders=mm_placeholders, + mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 24debd1cbf3fe..3769f04f94a92 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -67,9 +67,6 @@ def _get_hf_config(self) -> LlavaNextConfig: def _get_hf_processor(self) -> LlavaNextProcessor: return self.ctx.get_hf_processor(LlavaNextProcessor) - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token - def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -81,6 +78,9 @@ def _get_mm_fields_config( image_embeds=MultiModalFieldConfig.batched("image"), ) + def _get_image_token(self) -> str: + return self._get_hf_processor().image_token + def _get_max_image_tokens(self) -> int: largest_feature_size, _ = self._get_pinpoint_with_most_features() return largest_feature_size @@ -97,20 +97,20 @@ def _get_num_image_tokens( image_height: int, ) -> int: hf_config = self._get_hf_config() + vision_encoder_info = self._vision_encoder_info base_feature_size = self._apply_feature_select_strategy( hf_config.vision_feature_select_strategy, - self._vision_encoder_info.get_num_image_tokens( + vision_encoder_info.get_num_image_tokens( image_width=image_width, image_height=image_height, ), ) - num_patches = self._vision_encoder_info.get_num_patches() num_patch_height, num_patch_width = get_anyres_image_grid_shape( image_size=(image_height, image_width), grid_pinpoints=hf_config.image_grid_pinpoints, - patch_size=self._vision_encoder_info.get_image_size(), + patch_size=vision_encoder_info.get_image_size(), ) ( @@ -119,7 +119,7 @@ def _get_num_image_tokens( ) = self._get_num_unpadded_features( original_height=image_height, original_width=image_width, - npatches=num_patches, + npatches=vision_encoder_info.get_patch_grid_length(), num_patch_height=num_patch_height, num_patch_width=num_patch_width, ) @@ -155,6 +155,7 @@ def _get_num_unpadded_features( unpadded_features = current_height * current_width newline_features = current_height + return (unpadded_features, newline_features) def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 0de9d8c5ea572..ee6b89f0d4498 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -3,38 +3,32 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn -from transformers import (CLIPVisionConfig, LlavaNextVideoConfig, - SiglipVisionConfig) +from transformers import (BatchFeature, LlavaNextVideoConfig, + LlavaNextVideoProcessor) from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) +from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs, + PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP from .llava import init_vision_tower_for_llava -from .siglip import (SiglipVisionModel, dummy_image_for_siglip, - dummy_seq_data_for_siglip) +from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) - -# For profile run -_MAX_FRAMES_PER_VIDEO = 32 -_MAX_NUM_VIDEOS = 1 +from .vision import BaseVisionLanguageMultiModalProcessor class LlavaNextVideoPixelInputs(TypedDict): @@ -50,143 +44,148 @@ class LlavaNextVideoPixelInputs(TypedDict): """ -def get_llava_next_video_frame_feature_size( - hf_config: LlavaNextVideoConfig) -> int: - # Support both CLIPVisionConfig and SiglipVisionConfig - image_size = hf_config.vision_config.image_size - patch_size = hf_config.vision_config.patch_size - spatial_pool_stride = hf_config.spatial_pool_stride +class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): - return int((image_size / patch_size / spatial_pool_stride)**2) + def _get_hf_config(self) -> LlavaNextVideoConfig: + return self.ctx.get_hf_config(LlavaNextVideoConfig) + def _get_hf_processor(self) -> LlavaNextVideoProcessor: + return self.ctx.get_hf_processor(LlavaNextVideoProcessor) -def _get_max_llm_tokens(ctx: InputContext) -> int: - """ - Calculated from the maximum video frames under the context length - constraints of the language model. - """ - hf_text_config = ctx.model_config.hf_text_config - model_config = ctx.model_config - max_tokens = model_config.max_model_len - rope_scaling = model_config.rope_scaling - - if rope_scaling: - rope_scaling_factor = hf_text_config.rope_scaling["factor"] - else: - rope_scaling_factor = 1 - - max_tokens *= rope_scaling_factor - - return max_tokens - - -def get_max_llava_next_video_tokens(ctx: InputContext) -> int: - # Currently set to 32 frames - # TODO: max_tokens = _get_max_llm_tokens(ctx) - hf_config = ctx.get_hf_config(LlavaNextVideoConfig) - tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config) - return _MAX_FRAMES_PER_VIDEO * tokens_per_frame - - -def dummy_data_for_llava_next_video(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(LlavaNextVideoConfig) - vision_config = hf_config.vision_config - - # TODO: support multiple videos - num_videos = mm_counts["video"] - if num_videos != _MAX_NUM_VIDEOS: - raise NotImplementedError( - f"Only {_MAX_NUM_VIDEOS} videos are supported") - - # TODO: support configuring the number of frames - frames_per_video = _MAX_FRAMES_PER_VIDEO - # num_images = num_videos * frames_per_video - - # fills the sequence with as longer video data as possible - tokens_per_frame = get_llava_next_video_frame_feature_size(hf_config) - video_feature_size = frames_per_video * tokens_per_frame - - if isinstance(vision_config, CLIPVisionConfig): - seq_data, ranges = dummy_seq_data_for_clip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video", - ) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"video": 1} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + num_frames = self._get_dummy_num_frames(seq_len) + max_video_tokens = self._get_max_video_tokens(num_frames) + + return {"video": max_video_tokens} + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values_videos=MultiModalFieldConfig.batched("video")) + + def _get_num_frame_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + spatial_pool_stride = hf_config.spatial_pool_stride - pil_frame = dummy_image_for_clip(vision_config, num_images=1) - np_frame = np.array(pil_frame["image"]) - mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) - mm_data = {"video": mm_data_per_video} - return DummyData(seq_data, mm_data, ranges) - elif isinstance(vision_config, SiglipVisionConfig): - seq_data, ranges = dummy_seq_data_for_siglip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video", + patch_grid_length = self._vision_encoder_info.get_patch_grid_length() + pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) + + return pooled_grid_length * pooled_grid_length + + def _get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + ) -> int: + num_frame_tokens = self._get_num_frame_tokens( + image_width=image_width, + image_height=image_height, ) - pil_frame = dummy_image_for_siglip(vision_config, num_images=1) - np_frame = np.array(pil_frame["image"]) - mm_data_per_video = np.repeat([np_frame], frames_per_video, axis=0) - mm_data = {"video": mm_data_per_video} - return DummyData(seq_data, mm_data, ranges) + return num_frame_tokens * num_frames - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def _get_max_video_tokens(self, num_frames: int) -> int: + return self._get_num_video_tokens(image_width=999999, + image_height=999999, + num_frames=num_frames) + def _get_max_video_frames(self, max_tokens: int) -> int: + num_frames = 0 -def input_processor_for_llava_next_video(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "video" not in multi_modal_data: - return inputs + while True: + next_num_frames = num_frames + 1 - if "multi_modal_placeholders" in inputs and "video" in inputs[ - "multi_modal_placeholders"]: - # The inputs already have placeholders. - return inputs + if self._get_max_video_tokens(next_num_frames) > max_tokens: + break - video_data = multi_modal_data["video"] + num_frames = next_num_frames - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaNextVideoConfig) - vision_config = hf_config.vision_config + return num_frames - if isinstance(video_data, np.ndarray): - # Supports both CLIP and Siglip - num_frames = video_data.shape[0] - frame_feature_size = \ - get_llava_next_video_frame_feature_size(hf_config) - video_feature_size = num_frames * frame_feature_size + def _get_dummy_num_frames(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_videos = mm_config.limit_per_prompt.get("video", 1) - tokenizer = cached_get_tokenizer(model_config.tokenizer) + max_total_frames = self._get_max_video_frames(seq_len) - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=hf_config.video_token_index, - repeat_count=video_feature_size, - ) + return max(max_total_frames // max(max_videos, 1), 1) - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"video": ranges}) + def _get_dummy_image_size(self) -> ImageSize: + image_size = self._vision_encoder_info.get_image_size() + return ImageSize(image_size, image_size) - elif is_list_of(video_data, np.ndarray): - raise NotImplementedError( - "Processing multiple videos is not supported") + def _get_video_token(self) -> str: + return self._get_hf_processor().video_token - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + hf_config = self._get_hf_config() + video_token_id = hf_config.video_token_index + + def get_replacement(item_idx: int): + videos = mm_items.get_items( + "video", (VideoEmbeddingItems, VideoProcessorItems)) + + if isinstance(videos, VideoEmbeddingItems): + num_video_tokens = videos.get_feature_size(item_idx) + else: + image_size = videos.get_frame_size(item_idx) + num_video_tokens = self._get_num_video_tokens( + image_width=image_size.width, + image_height=image_size.height, + num_frames=videos.get_num_frames(item_idx), + ) + + return [video_token_id] * num_video_tokens + + return [ + PromptReplacement( + modality="video", + target=[video_token_id], + replacement=get_replacement, + ), + ] + + def _get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_videos = mm_counts.get("video", 0) + + video_token = self._get_video_token() + target_width, target_height = self._get_dummy_image_size() + + mm_data = { + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) + } + + return ProcessorInputs( + prompt_text=video_token * num_videos, + mm_data=mm_data, + ) # adopted from transformers modeling_llava_next_video.py @@ -246,11 +245,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -@MULTIMODAL_REGISTRY.register_input_mapper("video") -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "video", get_max_llava_next_video_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next_video) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next_video) +@MULTIMODAL_REGISTRY.register_processor(LlavaNextVideoMultiModalProcessor) class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 0bebc1c745e2b..1e51e09a24c18 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -3,47 +3,36 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.nn as nn -from PIL import Image -from transformers import (CLIPVisionConfig, LlavaOnevisionConfig, - SiglipVisionConfig) +from transformers import (BatchFeature, LlavaOnevisionConfig, + LlavaOnevisionProcessor) from transformers.models.llava_onevision.modeling_llava_onevision import ( get_anyres_image_grid_shape, unpad_image) from typing_extensions import NotRequired from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, - InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import NestedTensors -from vllm.multimodal.utils import (cached_get_tokenizer, - repeat_and_pad_placeholder_tokens) +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors +from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, + VideoProcessorItems) +from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs, + PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .clip import (CLIPVisionModel, dummy_seq_data_for_clip, - dummy_video_for_clip, get_clip_image_feature_size, - get_clip_patch_grid_length, input_processor_for_clip) +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .llava import init_vision_tower_for_llava -from .siglip import (SiglipVisionModel, dummy_seq_data_for_siglip, - dummy_video_for_siglip, get_siglip_image_feature_size, - get_siglip_patch_grid_length, input_processor_for_siglip) +from .llava_next import LlavaNextMultiModalProcessor +from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -# Result in the max possible feature size (2x2 grid of 336x336px tiles) -MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448 - -# For profile run -_MAX_FRAMES_PER_VIDEO = 16 - class LlavaOnevisionVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] @@ -92,286 +81,251 @@ class LlavaOnevisionImageEmbeddingInputs(TypedDict): LlavaOnevisionVideoPixelInputs] -def _get_llava_onevision_image_unppaded_feature_size(height, width, patches, - scale_height, - scale_width): - current_height = patches * scale_height - current_width = patches * scale_width - - original_aspect_ratio = width / height - current_aspect_ratio = current_width / current_height - if original_aspect_ratio > current_aspect_ratio: - new_height = int(height * (current_width / width)) - padding = (current_height - new_height) // 2 - current_height -= padding * 2 - else: - new_width = int(width * (current_height / height)) - padding = (current_width - new_width) // 2 - current_width -= padding * 2 - - unpadded_features = current_height * current_width - newline_features = current_height - - ratio = math.sqrt(current_height * current_width / (9 * patches**2)) - if ratio > 1.1: - unpadded_features = int(current_height // ratio) * int( - current_width // ratio) - newline_features = int(current_height // ratio) - - return (unpadded_features, newline_features) - - -def get_llava_onevision_image_feature_size( - hf_config: LlavaOnevisionConfig, - *, - input_height: int, - input_width: int, -) -> int: - vision_config = hf_config.vision_config - - if isinstance(vision_config, CLIPVisionConfig): - num_patches = get_clip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, - ) - base_feature_size = get_clip_image_feature_size(vision_config) - elif isinstance(vision_config, SiglipVisionConfig): - num_patches = get_siglip_patch_grid_length( - image_size=vision_config.image_size, - patch_size=vision_config.patch_size, - ) - base_feature_size = get_siglip_image_feature_size(vision_config) - else: - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - strategy = hf_config.vision_feature_select_strategy - if strategy == "default": - base_feature_size -= 1 - elif strategy == "full": - pass - else: - raise ValueError(f"Unexpected select feature strategy: {strategy}") +class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): - num_patch_height, num_patch_width = get_anyres_image_grid_shape( - image_size=(input_height, input_width), - grid_pinpoints=hf_config.image_grid_pinpoints, - patch_size=vision_config.image_size, - ) + def _get_hf_config(self) -> LlavaOnevisionConfig: + return self.ctx.get_hf_config(LlavaOnevisionConfig) - ( - unpadded_feature_size, - newline_feature_size, - ) = _get_llava_onevision_image_unppaded_feature_size( - input_height, input_width, num_patches, num_patch_height, - num_patch_width) - - return unpadded_feature_size + newline_feature_size + base_feature_size - - -def get_max_llava_onevision_image_tokens(ctx: InputContext): - return get_llava_onevision_image_feature_size( - ctx.get_hf_config(LlavaOnevisionConfig), - input_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - input_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - ) - - -def get_llava_onevision_video_frame_feature_size( - hf_config: LlavaOnevisionConfig) -> int: - # Support both CLIPVisionConfig and SiglipVisionConfig - image_size = hf_config.vision_config.image_size - patch_size = hf_config.vision_config.patch_size - spatial_pool_stride = hf_config.spatial_pool_stride if hasattr( - hf_config, "spatial_pool_stride") else 2 - - height = width = image_size // patch_size - return math.ceil(height / spatial_pool_stride) * math.ceil( - width / spatial_pool_stride) - - -def get_llava_onevision_video_tokens(ctx: InputContext, - num_frames: int) -> int: - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) - - # TODO: support configuring (not supported by HF right now) - num_token_image_newline = 1 - tokens_per_frame = get_llava_onevision_video_frame_feature_size(hf_config) - video_feature_size = num_frames * tokens_per_frame + num_token_image_newline - - return video_feature_size - - -def get_max_llava_onevision_video_tokens(ctx: InputContext) -> int: - return get_llava_onevision_video_tokens(ctx, _MAX_FRAMES_PER_VIDEO) - - -def dummy_data_for_llava_onevision(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]): - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) - vision_config = hf_config.vision_config - - num_videos = mm_counts["video"] - - # TODO: support configuring the number of frames - num_frames = _MAX_FRAMES_PER_VIDEO - video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) - - if isinstance(vision_config, CLIPVisionConfig): - seq_data, ranges = dummy_seq_data_for_clip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video") - - mm_data = dummy_video_for_clip(vision_config, - num_frames=num_frames, - num_videos=num_videos) - return DummyData(seq_data, mm_data, ranges) - elif isinstance(vision_config, SiglipVisionConfig): - seq_data, ranges = dummy_seq_data_for_siglip( - vision_config, - seq_len, - num_videos, - image_token_id=hf_config.video_token_index, - image_feature_size_override=video_feature_size, - mm_key="video") - - mm_data = dummy_video_for_siglip(vision_config, - num_frames=num_frames, - num_videos=num_videos) - return DummyData(seq_data, mm_data, ranges) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - -def input_processor_when_multimodal_input_image(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return inputs - - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) - vision_config = hf_config.vision_config - - image_data = multi_modal_data["image"] - if isinstance(image_data, Image.Image): - width, height = image_data.size - - image_feature_size = get_llava_onevision_image_feature_size( - hf_config, - input_height=height, - input_width=width, - ) - elif is_list_of(image_data, Image.Image): - image_feature_size = [ - get_llava_onevision_image_feature_size(hf_config, - input_height=img.height, - input_width=img.width) - for img in image_data - ] - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - elif is_list_of(image_data, torch.Tensor): - image_feature_size = [item.shape[1] for item in image_data] - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - vision_config = hf_config.vision_config - - if isinstance(vision_config, CLIPVisionConfig): - return input_processor_for_clip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, + def _get_hf_processor(self) -> LlavaOnevisionProcessor: + return self.ctx.get_hf_processor(LlavaOnevisionProcessor) + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + max_image_tokens = self._get_max_image_tokens() + + num_frames = self._get_dummy_num_frames(seq_len) + max_video_tokens = self._get_max_video_tokens(num_frames) + + return { + "image": max_image_tokens, + "video": max_video_tokens, + } + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.batched("video"), ) - elif isinstance(vision_config, SiglipVisionConfig): - return input_processor_for_siglip( - model_config, - vision_config, - inputs, - image_token_id=hf_config.image_token_index, - image_feature_size_override=image_feature_size, + + def _get_num_unpadded_features( + self, + *, + original_height: int, + original_width: int, + npatches: int, + num_patch_height: int, + num_patch_width: int, + ) -> tuple[int, int]: + current_height = npatches * num_patch_height + current_width = npatches * num_patch_width + + original_aspect_ratio = original_width / original_height + current_aspect_ratio = current_width / current_height + if original_aspect_ratio > current_aspect_ratio: + new_height = int(original_height * + (current_width / original_width)) + padding = (current_height - new_height) // 2 + current_height -= padding * 2 + else: + new_width = int(original_width * + (current_height / original_height)) + padding = (current_width - new_width) // 2 + current_width -= padding * 2 + + unpadded_features = current_height * current_width + newline_features = current_height + + ratio = math.sqrt(current_height * current_width / (9 * npatches**2)) + if ratio > 1.1: + unpadded_features = int(current_height // ratio) * int( + current_width // ratio) + newline_features = int(current_height // ratio) + + return (unpadded_features, newline_features) + + def _get_num_frame_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2) + + patch_grid_length = self._vision_encoder_info.get_patch_grid_length() + pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) + + return pooled_grid_length * pooled_grid_length + + def _get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + ) -> int: + num_frame_tokens = self._get_num_frame_tokens( + image_width=image_width, + image_height=image_height, ) - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) + return num_frame_tokens * num_frames + 1 # Newline token + + def _get_max_video_tokens(self, num_frames: int) -> int: + return self._get_num_video_tokens(image_width=999999, + image_height=999999, + num_frames=num_frames) + def _get_max_video_frames(self, max_tokens: int) -> int: + num_frames = 0 -def input_processor_when_multimodal_input_video(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or "video" not in multi_modal_data: - return inputs - video_data = multi_modal_data["video"] + while True: + next_num_frames = num_frames + 1 - model_config = ctx.model_config - hf_config = ctx.get_hf_config(LlavaOnevisionConfig) + if self._get_max_video_tokens(next_num_frames) > max_tokens: + break - if isinstance(video_data, np.ndarray): - # Supports both CLIP and Siglip - num_frames = video_data.shape[0] - video_feature_size = get_llava_onevision_video_tokens(ctx, num_frames) - tokenizer = cached_get_tokenizer(model_config.tokenizer) + num_frames = next_num_frames - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=hf_config.video_token_index, - repeat_count=video_feature_size, + return num_frames + + def _get_dummy_num_frames(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) + + max_image_tokens = self._get_max_image_tokens() * max_images + max_total_frames = self._get_max_video_frames(seq_len - + max_image_tokens) + + return max(max_total_frames // max(max_videos, 1), 1) + + def _get_video_token(self) -> str: + return self._get_hf_processor().video_token + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + ) -> BatchFeature: + mm_data = dict(mm_data) + videos = mm_data.pop("videos", []) + assert isinstance(videos, list) + + if not videos: + return super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + video_token = self._get_video_token() + + # LLaVA-OneVision processor doesn't support multiple videos + # with different sizes when converting back to tensors + text_image_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=mm_data, + mm_kwargs=mm_kwargs, + ) + + pixel_values_videos = [] + for video in videos: + item_processor_data = dict(prompt=video_token, videos=video) + + item_outputs = super()._call_hf_processor( + prompt=prompt, + mm_data=item_processor_data, + mm_kwargs=mm_kwargs, + ) + + pixel_values_videos.append( + item_outputs.pop("pixel_values_videos")[0]) + + combined_outputs = dict( + **text_image_outputs, + pixel_values_videos=pixel_values_videos, ) + return BatchFeature(combined_outputs) - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"video": ranges}) - - elif is_list_of(video_data, np.ndarray): - video_feature_size = [] - for video in video_data: - num_frames = video.shape[0] - video_feature_size.append( - get_llava_onevision_video_tokens(ctx, num_frames)) - - tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( - tokenizer, - inputs.get("prompt"), - inputs["prompt_token_ids"], - placeholder_token_id=hf_config.video_token_index, - repeat_count=video_feature_size, + def _get_prompt_replacements( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> list[PromptReplacement]: + image_repls = super()._get_prompt_replacements( + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + out_mm_kwargs=out_mm_kwargs, ) - return token_inputs(prompt_token_ids=new_token_ids, - prompt=new_prompt, - multi_modal_data=multi_modal_data, - multi_modal_placeholders={"video": ranges}) - else: - raise TypeError(f"Invalid video type: {type(video_data)}") - msg = f"Unsupported video type: {type(video_data)}" - raise NotImplementedError(msg) + hf_config = self._get_hf_config() + video_token_id = hf_config.video_token_index + def get_video_replacement(item_idx: int): + videos = mm_items.get_items( + "video", (VideoEmbeddingItems, VideoProcessorItems)) -def input_processor_for_llava_onevision(ctx: InputContext, - inputs: DecoderOnlyInputs): - multi_modal_data = inputs.get("multi_modal_data") - if multi_modal_data is None or ("video" not in multi_modal_data - and "image" not in multi_modal_data): - return inputs - if "image" in multi_modal_data: - return input_processor_when_multimodal_input_image(ctx, inputs) - if "video" in multi_modal_data: - return input_processor_when_multimodal_input_video(ctx, inputs) + if isinstance(videos, VideoEmbeddingItems): + num_video_tokens = videos.get_feature_size(item_idx) + else: + image_size = videos.get_frame_size(item_idx) + num_video_tokens = self._get_num_video_tokens( + image_width=image_size.width, + image_height=image_size.height, + num_frames=videos.get_num_frames(item_idx), + ) + + return [video_token_id] * num_video_tokens - msg = "Unsupported multi data type" - raise NotImplementedError(msg) + return image_repls + [ + PromptReplacement( + modality="video", + target=[video_token_id], + replacement=get_video_replacement, + ), + ] + + def _get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + image_token = self._get_image_token() + video_token = self._get_video_token() + target_width, target_height = self._get_dummy_image_size() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) + } + + return ProcessorInputs( + prompt_text=image_token * num_images + video_token * num_videos, + mm_data=mm_data, + ) class LlavaOnevisionMultiModalProjector(nn.Module): @@ -394,14 +348,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: return hidden_states -@MULTIMODAL_REGISTRY.register_image_input_mapper() -@MULTIMODAL_REGISTRY.register_input_mapper("video") -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "image", get_max_llava_onevision_image_tokens) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens( - "video", get_max_llava_onevision_video_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_onevision) -@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_onevision) +@MULTIMODAL_REGISTRY.register_processor(LlavaOnevisionMultiModalProcessor) class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index f2e49d8e4848d..7aa9d58d1d348 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -323,7 +323,7 @@ def _get_num_image_tokens( height=image_height, ) - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: max_image_tokens = self._get_num_image_tokens( image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, @@ -415,12 +415,12 @@ def get_replacement_phi3v(item_idx: int): def _apply_prompt_replacements( self, token_ids: list[int], - prompt_repls: Sequence[_BoundPromptReplacement], + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, list[_PlaceholderInfo]]: + ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: token_ids, text, placeholders = super()._apply_prompt_replacements( token_ids=token_ids, - prompt_repls=prompt_repls, + mm_prompt_repls=mm_prompt_repls, mm_item_counts=mm_item_counts, ) @@ -428,15 +428,23 @@ def _apply_prompt_replacements( if text.startswith(" <|image|>"): text = text.replace(" <|image|>", "<|image|>", 1) token_ids = [token_ids[0], *token_ids[2:]] - placeholders = [ - _PlaceholderInfo(p.modality, p.start_idx - 1, p.replacement) - for p in placeholders - ] + placeholders = { + modality: [ + _PlaceholderInfo( + modality=p.modality, + item_idx=p.item_idx, + start_idx=p.start_idx - 1, + replacement=p.replacement, + ) for p in ps + ] + for modality, ps in placeholders.items() + } return token_ids, text, placeholders - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index d7233bd6028ed..9e1d38512c0b4 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -780,15 +780,18 @@ def get_num_image_tokens( def get_max_image_tokens(self) -> int: return get_max_pixtral_hf_image_tokens(self.vision_config) - def get_num_patches(self) -> int: + def get_image_size(self) -> int: + return self.vision_config.image_size + + def get_patch_size(self) -> int: + return self.vision_config.patch_size + + def get_patch_grid_length(self) -> int: return get_pixtral_hf_patch_grid_length( image_size=self.vision_config.image_size, patch_size=self.vision_config.patch_size, ) - def get_image_size(self) -> int: - return self.vision_config.image_size - class PixtralHFMLP(nn.Module): diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index d050fd060353a..bc3bb1f79b407 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -84,7 +84,7 @@ class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) max_source_positions = hf_config.audio_config.max_source_positions max_output_lengths = (max_source_positions - 2) // 2 + 1 @@ -184,15 +184,16 @@ def get_replacement_qwen2_audio(item_idx: int): ] def _always_apply_prompt_replacements(self) -> bool: - # HF never applies prompt replacements, so we have to do it ourselves - # _find_placeholders may incorrectly think that HF has already performed - # processing for multi-audio input when the input audios are short - # (the corresponding placeholders may take up fewer tokens than - # the number of audio items) + # HF never applies prompt replacements, so we have to do it ourselves. + # NOTE: `_find_placeholders_by_modality` may incorrectly think that HF + # has already performed processing for multi-audio input when the input + # audios are short (the corresponding placeholders may take up fewer + # tokens than the number of audio items) return True - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 5a8c6e4deb7ac..abca85e0e2024 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -56,7 +56,8 @@ from vllm.multimodal.inputs import (ImageItem, ModalityData, MultiModalFieldConfig, MultiModalKwargs, NestedTensors, VideoItem) -from vllm.multimodal.parse import ModalityDataItems, MultiModalDataParser +from vllm.multimodal.parse import (ImageSize, ModalityDataItems, + MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessorInputs, PromptReplacement) @@ -641,58 +642,6 @@ def load_weights(self, weights: Iterable[Tuple[str, return loaded_params -# === Vision input helpers === # - - -def _get_vision_info( - vision_config: Qwen2VLVisionConfig, - height: int, - width: int, - min_pixels: int, - max_pixels: int, - *, - do_resize: bool = True, - modality: str = "image", - mm_count: int = 1, -): - """Get information (resized height / width and number of vision tokens) - of input image / video frame.""" - patch_size = vision_config.patch_size - merge_size = vision_config.spatial_merge_size - temporal_patch_size = vision_config.temporal_patch_size - - if do_resize: - resized_height, resized_width = smart_resize( - height=height, - width=width, - factor=patch_size * merge_size, - min_pixels=min_pixels, - max_pixels=max_pixels, - ) - else: - resized_height, resized_width = height, width - - if modality == "image": - grid_t = mm_count - elif modality == "video": - grid_t = max(mm_count // temporal_patch_size, 1) - else: - raise ValueError(f"Modality {modality} is not supported") - - grid_h = resized_height // patch_size - grid_w = resized_width // patch_size - vision_tokens = grid_t * grid_h * grid_w - llm_num_vision_tokens = vision_tokens // (merge_size**2) - - return resized_height, resized_width, llm_num_vision_tokens - - -def _get_image_processor(hf_processor: Qwen2VLProcessor): - image_processor = hf_processor.image_processor # type: ignore - assert isinstance(image_processor, Qwen2VLImageProcessor) - return image_processor - - class Qwen2EmbeddingItems(ModalityDataItems[dict[str, torch.Tensor], dict[str, torch.Tensor]]): @@ -764,32 +713,111 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} - def _get_max_mm_tokens(self, modality: str) -> int: + def _get_vision_info( + self, + *, + image_width: int, + image_height: int, + num_frames: int = 1, + do_resize: bool = True, + ) -> tuple[ImageSize, int]: hf_config = self.ctx.get_hf_config(Qwen2VLConfig) vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + merge_size = vision_config.spatial_merge_size + temporal_patch_size = vision_config.temporal_patch_size hf_processor = self._get_hf_processor() - image_processor = _get_image_processor(hf_processor) - - _, _, max_llm_image_tokens = _get_vision_info( - vision_config, - height=9999999, - width=9999999, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, - modality=modality, + image_processor = self._get_image_processor(hf_processor) + + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * merge_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize(width=resized_width, + height=resized_height) + else: + preprocessed_size = ImageSize(width=image_width, + height=image_height) + + grid_t = max(num_frames // temporal_patch_size, 1) + grid_h = preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_patches = grid_t * grid_h * grid_w + num_vision_tokens = num_patches // (merge_size**2) + + return preprocessed_size, num_vision_tokens + + def _get_dummy_image_size(self) -> ImageSize: + max_image_size, _ = self._get_vision_info( + image_width=9999999, + image_height=9999999, + ) + return max_image_size + + def _get_max_image_tokens(self) -> int: + _, max_image_tokens = self._get_vision_info( + image_width=9999999, + image_height=9999999, + ) + return max_image_tokens + + def _get_max_video_tokens(self, num_frames: int) -> int: + _, max_video_tokens = self._get_vision_info( + image_width=9999999, + image_height=9999999, + num_frames=num_frames, ) - return max_llm_image_tokens + return max_video_tokens + + def _get_max_video_frames(self, max_tokens: int) -> int: + num_frames = 0 + + while True: + next_num_frames = num_frames + 1 + + if self._get_max_video_tokens(next_num_frames) > max_tokens: + break + + num_frames = next_num_frames + + return num_frames + + def _get_dummy_num_frames(self, seq_len: int) -> int: + mm_config = self.ctx.get_mm_config() + max_images = mm_config.limit_per_prompt.get("image", 1) + max_videos = mm_config.limit_per_prompt.get("video", 1) + + max_image_tokens = self._get_max_image_tokens() * max_images + max_total_frames = self._get_max_video_frames(seq_len - + max_image_tokens) + + return max(max_total_frames // max(max_videos, 1), 1) + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + max_image_tokens = self._get_max_image_tokens() + + num_frames = self._get_dummy_num_frames(seq_len) + max_video_tokens = self._get_max_video_tokens(num_frames) - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: return { - "image": self._get_max_mm_tokens("image"), - "video": self._get_max_mm_tokens("video"), + "image": max_image_tokens, + "video": max_video_tokens, } def _get_data_parser(self) -> MultiModalDataParser: return Qwen2MultiModalDataParser() + def _get_image_processor(self, hf_processor: Qwen2VLProcessor): + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2VLImageProcessor) + return image_processor + def _get_hf_processor( self, *, @@ -797,7 +825,7 @@ def _get_hf_processor( max_pixels: Optional[int] = None, ) -> Qwen2VLProcessor: hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor) - image_processor = _get_image_processor(hf_processor) + image_processor = self._get_image_processor(hf_processor) if min_pixels: image_processor.min_pixels = min_pixels @@ -818,7 +846,7 @@ def _get_prompt_replacements( out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: hf_processor = self._get_hf_processor() - image_processor = _get_image_processor(hf_processor) + image_processor = self._get_image_processor(hf_processor) # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered @@ -873,32 +901,35 @@ def _get_mm_fields_config( video_grid_thw=MultiModalFieldConfig.batched("video"), ) - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: - hf_processor = self._get_hf_processor() - image_processor = _get_image_processor(hf_processor) + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + hf_processor = self._get_hf_processor() image_token: str = hf_processor.image_token - resized_height, resized_width = smart_resize( - height=9999999, - width=9999999, - factor=image_processor.patch_size * image_processor.merge_size, - min_pixels=image_processor.min_pixels, - max_pixels=image_processor.max_pixels, - ) - num_images = mm_counts.get("image", 0) + video_token: str = hf_processor.video_token + target_width, target_height = self._get_dummy_image_size() mm_data = { "image": - self._get_dummy_images(width=resized_width, - height=resized_height, - num_images=num_images) + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) } return ProcessorInputs( - prompt_text=image_token * num_images, + prompt_text=image_token * num_images + video_token * num_videos, mm_data=mm_data, ) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 115eaaac900e0..7ea177e94afc0 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -171,15 +171,18 @@ def get_num_image_tokens( def get_max_image_tokens(self) -> int: return get_max_siglip_image_tokens(self.vision_config) - def get_num_patches(self) -> int: + def get_image_size(self) -> int: + return self.vision_config.image_size + + def get_patch_size(self) -> int: + return self.vision_config.patch_size + + def get_patch_grid_length(self) -> int: return get_siglip_patch_grid_length( image_size=self.vision_config.image_size, patch_size=self.vision_config.patch_size, ) - def get_image_size(self) -> int: - return self.vision_config.image_size - # Adapted from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L249 # noqa class SiglipVisionEmbeddings(nn.Module): diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 0b83684c9bac5..6ad4661e3bb8d 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -6,7 +6,6 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) -import numpy as np import torch import torch.utils.checkpoint from torch import nn @@ -31,7 +30,6 @@ PromptReplacement) from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig -from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, @@ -62,7 +60,7 @@ class UltravoxMultiModalProcessor(BaseMultiModalProcessor): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: feature_extractor = self._get_feature_extractor() max_audio_tokens = math.ceil(feature_extractor.chunk_length * _AUDIO_TOKENS_PER_SECOND) @@ -103,6 +101,7 @@ def _call_hf_processor( mm_data = dict(mm_data) audios = mm_data.pop("audios", []) + assert isinstance(audios, list) if not audios: return super()._call_hf_processor( @@ -117,9 +116,6 @@ def _call_hf_processor( sampling_rate=feature_extractor.sampling_rate, ) - # Already resampled by _get_hf_mm_data - assert is_list_of(audios, np.ndarray) - # Ultravox processor doesn't support multiple inputs, # therefore we need to input text and audio one by one audio_features, audio_token_len = [], [] @@ -177,8 +173,9 @@ def get_replacement_ultravox(item_idx: int): ) ] - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: feature_extractor = self._get_feature_extractor() diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 65a773480d2a1..014f02ee10a1b 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,8 +1,12 @@ from abc import ABC, abstractmethod -from typing import Generic, TypeVar +from typing import Final, Generic, Optional, Protocol, TypeVar from transformers import PretrainedConfig +from vllm.multimodal.processing import (BaseMultiModalProcessor, + InputProcessingContext, + ProcessingCache) + _C = TypeVar("_C", bound=PretrainedConfig) @@ -27,11 +31,15 @@ def get_max_image_tokens(self) -> int: raise NotImplementedError @abstractmethod - def get_num_patches(self) -> int: + def get_image_size(self) -> int: raise NotImplementedError @abstractmethod - def get_image_size(self) -> int: + def get_patch_size(self) -> int: + raise NotImplementedError + + @abstractmethod + def get_patch_grid_length(self) -> int: raise NotImplementedError @@ -50,3 +58,26 @@ def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) + + +class VisionLanguageConfig(Protocol): + vision_config: Final[PretrainedConfig] + + +class BaseVisionLanguageMultiModalProcessor(BaseMultiModalProcessor): + + def __init__(self, + ctx: InputProcessingContext, + *, + cache: Optional[ProcessingCache] = None, + enable_sanity_checks: bool = True) -> None: + super().__init__(ctx, + cache=cache, + enable_sanity_checks=enable_sanity_checks) + + vision_config = self._get_hf_config().vision_config + self._vision_encoder_info = vision_encoder_info(vision_config) + + @abstractmethod + def _get_hf_config(self) -> VisionLanguageConfig: + raise NotImplementedError diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index 00acb77435163..6be046ba77ca7 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -146,6 +146,20 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem]): def __init__(self, data: Sequence[HfVideoItem]) -> None: super().__init__(data, "video") + def get_num_frames(self, item_idx: int) -> int: + return len(self.get(item_idx)) + + def get_frame_size(self, item_idx: int) -> ImageSize: + image = self.get(item_idx)[0] # Assume that the video isn't empty + + if isinstance(image, Image): + return ImageSize(*image.size) + if isinstance(image, (np.ndarray, torch.Tensor)): + _, h, w = image.shape + return ImageSize(w, h) + + assert_never(image) + class VideoEmbeddingItems(EmbeddingItems): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index eb7552176e974..ebc16b817684a 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -16,7 +16,8 @@ from vllm.inputs import DummyData, InputProcessingContext from vllm.logger import init_logger -from vllm.transformers_utils.tokenizer import AnyTokenizer, encode_tokens +from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens, + encode_tokens) from vllm.utils import LRUCache, flatten_2d_lists, full_groupby from .inputs import (MultiModalDataDict, MultiModalFieldConfig, @@ -69,19 +70,6 @@ def _cached_encode( add_special_tokens=add_special_tokens) -def _decode( - tokenizer: AnyTokenizer, - token_ids: list[int], - *, - skip_special_tokens: bool = False, -) -> str: - """ - Backend-agnostic equivalent of HF's - :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`. - """ - return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) - - @lru_cache(maxsize=2048) def _cached_decode( tokenizer: AnyTokenizer, @@ -89,9 +77,9 @@ def _cached_decode( *, skip_special_tokens: bool = False, ) -> str: - return _decode(tokenizer, - list(token_ids), - skip_special_tokens=skip_special_tokens) + return decode_tokens(tokenizer, + list(token_ids), + skip_special_tokens=skip_special_tokens) class _HasModalityAttr(Protocol): @@ -269,8 +257,10 @@ def end_idx(self) -> int: return self.match.end() -class _PlaceholderInfo(NamedTuple): +@dataclass +class _PlaceholderInfo: modality: str + item_idx: int start_idx: int replacement: list[int] @@ -311,12 +301,14 @@ def find_text_matches( def _resolve_matches( prompt: _PromptSeq, - matches: Sequence[_PromptReplacementMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], ) -> list[_PromptReplacementMatch]: """ - Resolve :code:`matches` to ensure that there are no overlapping matches, + Resolve :code:`mm_matches` to ensure that there are no overlapping matches, and sort them such that earlier matches take priority over later ones. """ + matches = [m for matches in mm_matches.values() for m in matches] + seen_matches: list[Optional[_PromptReplacementMatch]] = [None ] * len(prompt) @@ -334,14 +326,15 @@ def _resolve_matches( def _replace_matches( prompt: _S, - matches: Sequence[_PromptReplacementMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementMatch]], mm_item_counts: Mapping[str, int], ) -> list[_S]: + """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" out_seqs = list[_S]() prev_end_idx = 0 next_idx_by_modality = defaultdict[str, int](lambda: 0) - for match in _resolve_matches(prompt, matches): + for match in _resolve_matches(prompt, mm_matches): modality = match.modality item_idx = next_idx_by_modality[modality] @@ -371,28 +364,28 @@ def _replace_matches( def replace_token_matches( prompt: list[int], - matches: Sequence[_PromptReplacementTokenMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementTokenMatch]], mm_item_counts: Mapping[str, int], ) -> list[int]: - """Apply :code:`prompt_repls` to :code:`prompt`.""" - if not matches: + """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" + if not mm_matches: return prompt - token_id_seqs = _replace_matches(prompt, matches, mm_item_counts) + token_id_seqs = _replace_matches(prompt, mm_matches, mm_item_counts) return flatten_2d_lists(token_id_seqs) def replace_text_matches( prompt: str, - matches: Sequence[_PromptReplacementTextMatch], + mm_matches: Mapping[str, Sequence[_PromptReplacementTextMatch]], mm_item_counts: Mapping[str, int], ) -> str: - """Apply :code:`prompt_repls` to :code:`prompt`.""" - if not matches: + """Apply the replacements in :code:`mm_matches` to :code:`prompt`.""" + if not mm_matches: return prompt - texts = _replace_matches(prompt, matches, mm_item_counts) + texts = _replace_matches(prompt, mm_matches, mm_item_counts) return "".join(texts) @@ -407,14 +400,14 @@ def _iter_modality_placeholders( return prompt_len = len(prompt) - item_index = 0 + item_idx = 0 start_idx = 0 while start_idx < prompt_len: found = False for repl_info in modality_repls: - replacement = repl_info.get_replacement(item_index) + replacement = repl_info.get_replacement(item_idx) repl_tokens = replacement.token_ids repl_len = len(repl_tokens) end_idx = start_idx + repl_len @@ -425,12 +418,13 @@ def _iter_modality_placeholders( if prompt[start_idx:end_idx] == repl_tokens: yield _PlaceholderInfo( modality=modality, + item_idx=item_idx, start_idx=start_idx, replacement=repl_tokens, ) - item_index += 1 - if item_index >= modal_item_count: + item_idx += 1 + if item_idx >= modal_item_count: return # Exclude overlapping matches @@ -442,28 +436,36 @@ def _iter_modality_placeholders( start_idx += 1 -def iter_placeholders( - prompt_repls: Sequence[_BoundPromptReplacement], +def _iter_placeholders( + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], prompt: list[int], mm_item_counts: Mapping[str, int], ) -> Iterable[_PlaceholderInfo]: """ - Yield each set of placeholder tokens found in :code:`prompt`. + For each modality, yield each set of placeholder tokens found in + :code:`prompt`. Note that empty matches are ignored. """ - repls_by_modality = dict(full_groupby_modality(prompt_repls)) - for modality, modal_item_count in mm_item_counts.items(): - if modality in repls_by_modality: + if modality in mm_prompt_repls: yield from _iter_modality_placeholders( prompt, modality, - repls_by_modality[modality], + mm_prompt_repls[modality], modal_item_count, ) +def find_mm_placeholders( + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], + prompt: list[int], + mm_item_counts: Mapping[str, int], +) -> Mapping[str, list[_PlaceholderInfo]]: + it = _iter_placeholders(mm_prompt_repls, prompt, mm_item_counts) + return dict(full_groupby_modality(it)) + + @dataclass class ProcessorInputs: """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" @@ -620,7 +622,7 @@ def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: raise NotImplementedError @abstractmethod - def get_mm_max_tokens_per_item(self) -> Mapping[str, int]: + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: """ Get the maximum possible number of tokens per data item for each modality. @@ -703,14 +705,14 @@ def _get_prompt_replacements( """ raise NotImplementedError - def _find_placeholders( + def _find_mm_placeholders( self, - all_prompt_repls: Sequence[_BoundPromptReplacement], + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], new_token_ids: list[int], mm_item_counts: Mapping[str, int], - ) -> list[_PlaceholderInfo]: - return list( - iter_placeholders(all_prompt_repls, new_token_ids, mm_item_counts)) + ) -> Mapping[str, list[_PlaceholderInfo]]: + return find_mm_placeholders(mm_prompt_repls, new_token_ids, + mm_item_counts) def _get_hf_mm_data( self, @@ -797,7 +799,10 @@ def _apply_hf_processor_missing( # Some HF processors (e.g. Qwen2-VL) expect corresponding # multi-modal tokens to be in the prompt text - dummy_inputs = self._get_dummy_mm_inputs(mm_missing_counts) + dummy_inputs = self._get_dummy_processor_inputs( + self.ctx.model_config.max_model_len, + mm_missing_counts, + ) _, mm_missing_kwargs = self._apply_hf_processor( prompt_text=dummy_inputs.prompt_text, @@ -889,50 +894,44 @@ def _cached_apply_hf_processor( mm_kwargs = MultiModalKwargs.from_items(merged_kw_items) - if self.enable_sanity_checks: - mm_item_counts = mm_data_items.get_all_counts() - - for modality, item_count in mm_item_counts.items(): - for item_idx in range(item_count): - try: - mm_kwargs.get_item(modality, item_idx) - except Exception as e: - # Make it easy to set a breakpoint in the debugger - raise e - return prompt_ids, mm_kwargs - def _bind_prompt_replacements( + def _bind_and_group_repls( self, prompt_repls: list[PromptReplacement], - ) -> list[_BoundPromptReplacement]: + ) -> dict[str, list[_BoundPromptReplacement]]: tokenizer = self._get_tokenizer() - return [prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls] + it = (prompt_repl.bind(tokenizer) for prompt_repl in prompt_repls) + return dict(full_groupby_modality(it)) def _always_apply_prompt_replacements(self) -> bool: """ A flag which can be overridden so that :meth:`_apply_prompt_replacements` is always called even if we - detect that HF has performed processing via :meth:`_find_placeholders`. + detect that HF has performed processing via + :meth:`_find_placeholders_by_modality`. - This is useful in cases where :meth:`_find_placeholders` cannot be - reliably used to detect whether HF has performed processing or not. + This is useful in cases where :meth:`_find_placeholders_by_modality` + cannot be reliably used to detect whether HF has performed processing. """ return False def _apply_prompt_replacements( self, token_ids: list[int], - prompt_repls: Sequence[_BoundPromptReplacement], + mm_prompt_repls: Mapping[str, Sequence[_BoundPromptReplacement]], mm_item_counts: Mapping[str, int], - ) -> tuple[list[int], str, list[_PlaceholderInfo]]: + ) -> tuple[list[int], str, Mapping[str, list[_PlaceholderInfo]]]: tokenizer = self._get_tokenizer() - token_matches = find_token_matches(token_ids, prompt_repls) + mm_token_matches = { + modality: find_token_matches(token_ids, prompt_repls) + for modality, prompt_repls in mm_prompt_repls.items() + } mm_match_counts = { modality: len(matches) - for modality, matches in full_groupby_modality(token_matches) + for modality, matches in mm_token_matches.items() } # If the search text does not represent a special token, @@ -951,32 +950,92 @@ def _apply_prompt_replacements( ): # yapf: disable token_ids = replace_token_matches( token_ids, - token_matches, + mm_token_matches, mm_item_counts, ) - text = _decode(tokenizer, token_ids) - matched_repls = [match.prompt_repl for match in token_matches] + text = decode_tokens(tokenizer, token_ids) + matched_repls = { + modality: [match.prompt_repl for match in token_matches] + for modality, token_matches in mm_token_matches.items() + } else: - text = _decode(tokenizer, token_ids) + text = decode_tokens(tokenizer, token_ids) - text_matches = find_text_matches(text, prompt_repls) + mm_text_matches = { + modality: find_text_matches(text, prompt_repls) + for modality, prompt_repls in mm_prompt_repls.items() + } text = replace_text_matches( text, - text_matches, + mm_text_matches, mm_item_counts, ) token_ids = encode_tokens(tokenizer, text, add_special_tokens=False) - matched_repls = [match.prompt_repl for match in text_matches] - - placeholders = self._find_placeholders(matched_repls, token_ids, - mm_item_counts) + matched_repls = { + modality: [match.prompt_repl for match in token_matches] + for modality, token_matches in mm_text_matches.items() + } + + placeholders = self._find_mm_placeholders( + matched_repls, + token_ids, + mm_item_counts, + ) return token_ids, text, placeholders + def _validate_mm_kwargs( + self, + mm_kwargs: MultiModalKwargs, + mm_item_counts: Mapping[str, int], + ) -> None: + for modality, item_count in mm_item_counts.items(): + if modality in mm_kwargs.modalities: + items = mm_kwargs.get_items(modality) + else: + items = [] + + if len(items) != item_count: + raise RuntimeError( + f"Expected there to be {item_count} {modality} items in " + f"keyword arguments corresponding to {item_count} " + f"{modality} data items, but only found {len(items)}! " + "There is likely a problem with your " + "implementation of merged multi-modal processor for this " + "model (usually arising from an inconsistency between " + "`_call_hf_processor` and `_get_mm_fields_config`).") + + def _validate_mm_placeholders( + self, + mm_placeholders: Mapping[str, list[_PlaceholderInfo]], + mm_item_counts: Mapping[str, int], + *, + allow_missing: bool = False, + ) -> Mapping[str, int]: + missing_repl_counts = dict[str, int]() + + for modality, item_count in mm_item_counts.items(): + placeholders = mm_placeholders.get(modality, []) + + if len(placeholders) != item_count and not allow_missing: + raise RuntimeError( + f"Expected there to be {item_count} prompt replacements " + f"corresponding to {item_count} {modality} items, but only " + f"found {len(placeholders)} prompt replacements! Either " + "the prompt text has missing/incorrect tokens for " + "multi-modal inputs, or there is a problem with your " + "implementation of merged multi-modal processor for this " + "model (usually arising from an inconsistency between " + "`_call_hf_processor` and `_get_prompt_replacements`).") + + missing_repl_counts[modality] = item_count - len(placeholders) + + return missing_repl_counts + def apply( self, prompt_text: str, @@ -1009,56 +1068,69 @@ def apply( hf_processor_mm_kwargs, mm_kwargs, ) - prompt_repls = self._bind_prompt_replacements(unbound_prompt_repls) + mm_prompt_repls = self._bind_and_group_repls(unbound_prompt_repls) - # If HF processor already inserts placeholder tokens, - # there is no need for us to insert them mm_item_counts = mm_items.get_all_counts() - all_placeholders = self._find_placeholders(prompt_repls, prompt_ids, - mm_item_counts) + self._validate_mm_kwargs(mm_kwargs, mm_item_counts) + + hf_mm_placeholders = self._find_mm_placeholders( + mm_prompt_repls, + prompt_ids, + mm_item_counts, + ) + + if self._always_apply_prompt_replacements(): + mm_missing_repl_counts = mm_item_counts + mm_missing_repls = dict(mm_prompt_repls) + else: + mm_missing_repl_counts = self._validate_mm_placeholders( + hf_mm_placeholders, + mm_item_counts, + allow_missing=True, + ) + + mm_missing_repls = dict[str, list[_BoundPromptReplacement]]() + for modality, missing_repl_count in mm_missing_repl_counts.items(): + if missing_repl_count == 0: + mm_missing_repls[modality] = [] + elif missing_repl_count == mm_item_counts.get(modality, 0): + mm_missing_repls[modality] = mm_prompt_repls[modality] + else: + raise ValueError("Partial prompt replacement within " + f"{modality=} is not supported") - if all_placeholders and not self._always_apply_prompt_replacements(): + # If HF processor already inserts placeholder tokens, + # there is no need for us to insert them + if all(len(repls) == 0 for repls in mm_missing_repls.items()): tokenizer = self._get_tokenizer() - prompt_text = _decode(tokenizer, prompt_ids) + prompt_text = decode_tokens(tokenizer, prompt_ids) + mm_placeholders = hf_mm_placeholders else: ( prompt_ids, prompt_text, - all_placeholders, + missing_mm_placeholders, ) = self._apply_prompt_replacements( prompt_ids, - prompt_repls, - mm_item_counts, + mm_missing_repls, + mm_missing_repl_counts, ) - mm_placeholders = dict[str, list[PlaceholderRange]]() - err_suffix = ("This suggests a problem with your implementation of " - "the merged multi-modal processor for this model, " - "particularly in the `_get_prompt_replacements` method.") - - for modality, placeholders in full_groupby_modality(all_placeholders): - if modality not in mm_items: - raise AssertionError( - f"Expected no placeholders for {modality=}, " - f"but found {placeholders=}. Input items: {mm_items}" - f"\n{err_suffix}") - - if len(placeholders) != len(mm_items[modality]): - raise AssertionError( - f"Expected length of {placeholders=} for {modality=} " - f"to equal that of input items: {mm_items[modality]}" - f"\n{err_suffix}") - - mm_placeholders[modality] = [ - item.to_range() for item in placeholders - ] + mm_placeholders = {**hf_mm_placeholders, **missing_mm_placeholders} + + self._validate_mm_placeholders(mm_placeholders, mm_item_counts) + + mm_placeholder_ranges = { + modality: [item.to_range() for item in placeholders] + for modality, placeholders in mm_placeholders.items() + } return MultiModalInputsV2( type="multimodal", prompt=prompt_text, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_placeholders=mm_placeholders, + mm_placeholders=mm_placeholder_ranges, ) def _get_dummy_audios( @@ -1092,8 +1164,9 @@ def _get_dummy_videos( return [video] * num_videos @abstractmethod - def _get_dummy_mm_inputs( + def _get_dummy_processor_inputs( self, + seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: """ @@ -1121,12 +1194,25 @@ def _get_and_validate_dummy_mm_counts(self) -> Mapping[str, int]: return mm_limits + def _get_dummy_mm_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalInputsV2: + processor_inputs = self._get_dummy_processor_inputs(seq_len, mm_counts) + + return self.apply( + prompt_text=processor_inputs.prompt_text, + mm_data=processor_inputs.mm_data, + hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, + ) + def get_dummy_data(self, seq_len: int) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData mm_counts = self._get_and_validate_dummy_mm_counts() - mm_max_tokens_per_item = self.get_mm_max_tokens_per_item() + mm_max_tokens_per_item = self.get_mm_max_tokens_per_item(seq_len) if mm_counts.keys() != mm_max_tokens_per_item.keys(): raise AssertionError( "The keys returned by `get_supported_mm_limits`" @@ -1134,13 +1220,7 @@ def get_dummy_data(self, seq_len: int) -> DummyData: "returned by `get_mm_max_tokens_per_item` " f"({set(mm_max_tokens_per_item.keys())})") - processor_inputs = self._get_dummy_mm_inputs(mm_counts) - mm_inputs = self.apply( - prompt_text=processor_inputs.prompt_text, - mm_data=processor_inputs.mm_data, - hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs, - ) - + mm_inputs = self._get_dummy_mm_inputs(seq_len, mm_counts) prompt_token_ids = mm_inputs["prompt_token_ids"] placeholders_by_modality = mm_inputs["mm_placeholders"] @@ -1171,6 +1251,12 @@ def get_dummy_data(self, seq_len: int) -> DummyData: "reduce `max_num_seqs`, and/or reduce `mm_counts`.", seq_len, total_len, total_placeholders_by_modality) + return DummyData( + seq_data=SequenceData.from_prompt_token_counts((0, seq_len)), + multi_modal_data=None, + multi_modal_placeholders=None, + ) + prompt_token_ids.extend([0] * (seq_len - len(prompt_token_ids))) return DummyData( diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 073d49d7d2009..fb4389dc4df42 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -223,7 +223,8 @@ def get_max_tokens_per_item_by_modality( if self.has_processor(model_config): tokenizer = cached_get_tokenizer(model_config.tokenizer) processor = self.create_processor(model_config, tokenizer) - return processor.get_mm_max_tokens_per_item() + seq_len = model_config.max_model_len + return processor.get_mm_max_tokens_per_item(seq_len) return { key: plugin.get_max_multimodal_tokens(model_config) diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 42b2f095bc543..97920f42ec52f 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -21,6 +21,19 @@ MistralTokenizer] +def decode_tokens( + tokenizer: AnyTokenizer, + token_ids: list[int], + *, + skip_special_tokens: bool = False, +) -> str: + """ + Backend-agnostic equivalent of HF's + :code:`tokenizer.decode(token_ids, skip_special_tokens=...)`. + """ + return tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + + def encode_tokens( tokenizer: AnyTokenizer, text: str, From ba214dffbeec070051b61c1985ce6342c947f598 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 4 Jan 2025 23:45:57 +0800 Subject: [PATCH 074/462] [Bugfix] Fix precision error in LLaVA-NeXT (#11735) Signed-off-by: DarkLight1337 --- .../processing/test_llava_next.py | 3 +-- vllm/model_executor/models/llava_next.py | 14 +++++++---- vllm/model_executor/models/llava_onevision.py | 23 ++++++++++++------- 3 files changed, 26 insertions(+), 14 deletions(-) diff --git a/tests/models/decoder_only/vision_language/processing/test_llava_next.py b/tests/models/decoder_only/vision_language/processing/test_llava_next.py index 6772130c9b884..6c8d300717de4 100644 --- a/tests/models/decoder_only/vision_language/processing/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/processing/test_llava_next.py @@ -15,10 +15,9 @@ def processor_for_llava_next(): return LlavaNextMultiModalProcessor -# FIXME: image_size [(198, 176), (176, 198)] @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @pytest.mark.parametrize("image_size", [(1669, 2560), (2560, 1669), (183, 488), - (488, 183)]) + (488, 183), (198, 176), (176, 198)]) @pytest.mark.parametrize("num_imgs", [1, 2]) def test_processor_prompt_replacements( processor_for_llava_next, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 3769f04f94a92..f79021596f915 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -2,6 +2,7 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) +import numpy as np import torch import torch.nn as nn from transformers import BatchFeature, LlavaNextConfig, LlavaNextProcessor @@ -139,16 +140,21 @@ def _get_num_unpadded_features( current_height = npatches * num_patch_height current_width = npatches * num_patch_width - original_aspect_ratio = original_width / original_height - current_aspect_ratio = current_width / current_height + # NOTE: HF resizes based on float32 + original_aspect_ratio = np.array(original_width / original_height, + dtype=np.float32) + current_aspect_ratio = np.array(current_width / current_height, + dtype=np.float32) if original_aspect_ratio > current_aspect_ratio: - scale_factor = current_width / original_width + scale_factor = np.array(current_width / original_width, + dtype=np.float32) new_height = int(original_height * scale_factor) padding = (current_height - new_height) // 2 current_height -= 2 * padding else: - scale_factor = current_height / original_height + scale_factor = np.array(current_height / original_height, + dtype=np.float32) new_width = int(original_width * scale_factor) padding = (current_width - new_width) // 2 current_width -= 2 * padding diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 1e51e09a24c18..5a3cdadc47cac 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -3,6 +3,7 @@ from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) +import numpy as np import torch import torch.nn as nn from transformers import (BatchFeature, LlavaOnevisionConfig, @@ -127,18 +128,24 @@ def _get_num_unpadded_features( current_height = npatches * num_patch_height current_width = npatches * num_patch_width - original_aspect_ratio = original_width / original_height - current_aspect_ratio = current_width / current_height + # NOTE: HF resizes based on float32 + original_aspect_ratio = np.array(original_width / original_height, + dtype=np.float32) + current_aspect_ratio = np.array(current_width / current_height, + dtype=np.float32) + if original_aspect_ratio > current_aspect_ratio: - new_height = int(original_height * - (current_width / original_width)) + scale_factor = np.array(current_width / original_width, + dtype=np.float32) + new_height = int(original_height * scale_factor) padding = (current_height - new_height) // 2 - current_height -= padding * 2 + current_height -= 2 * padding else: - new_width = int(original_width * - (current_height / original_height)) + scale_factor = np.array(current_height / original_height, + dtype=np.float32) + new_width = int(original_width * scale_factor) padding = (current_width - new_width) // 2 - current_width -= padding * 2 + current_width -= 2 * padding unpadded_features = current_height * current_width newline_features = current_height From 65c08928c2db934b18f7c6f5eeb02617826fae8e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 4 Jan 2025 23:46:21 +0800 Subject: [PATCH 075/462] [Model] Remove unnecessary weight initialization logic (#11736) Signed-off-by: DarkLight1337 Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/layers/resampler.py | 20 ++++---------------- vllm/model_executor/models/aria.py | 5 +---- vllm/model_executor/models/minicpmv.py | 2 -- 3 files changed, 5 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py index aae806f6af323..a67713c320b86 100644 --- a/vllm/model_executor/layers/resampler.py +++ b/vllm/model_executor/layers/resampler.py @@ -27,7 +27,7 @@ Shared resampler perceiver network used in multimodal models and related helpers for sincos positional embeddings. -Example models: Qwen (Qwen-VL), Minicpmv2.0 +Example models: Qwen (Qwen-VL), MiniCPM-V 2.0 """ import math from functools import partial @@ -37,7 +37,6 @@ import torch import torch.nn.functional as F from torch import nn -from torch.nn.init import trunc_normal_ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.quantization import QuantizationConfig @@ -169,8 +168,8 @@ def __init__(self, self.embed_dim = embed_dim self.num_heads = num_heads - self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) - trunc_normal_(self.query, std=0.02) + self.query = nn.Parameter(torch.empty(self.num_queries, embed_dim)) + if kv_dim is not None and kv_dim != embed_dim: self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, @@ -190,16 +189,7 @@ def __init__(self, self.ln_post = norm_layer(embed_dim) if do_post_projection else None self.proj = nn.Parameter( (embed_dim**-0.5) * - torch.randn(embed_dim, embed_dim)) if do_post_projection else None - - def _init_weights(self, m: nn.Module) -> None: - if isinstance(m, nn.Linear): - trunc_normal_(m.weight, std=0.02) - if isinstance(m, nn.Linear) and m.bias is not None: - nn.init.constant_(m.bias, 0) - elif isinstance(m, nn.LayerNorm): - nn.init.constant_(m.bias, 0) - nn.init.constant_(m.weight, 1.0) + torch.empty(embed_dim, embed_dim)) if do_post_projection else None def _repeat(self, query, N: int): return query.unsqueeze(1).repeat(1, N, 1) @@ -240,8 +230,6 @@ def __init__(self, self.pos_embed = nn.Parameter( torch.from_numpy(pos_embed_arr).requires_grad_(False)) - self.apply(self._init_weights) - def forward( self, x: torch.Tensor, diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 2fd4262a9d3b9..8f5fd64a90c87 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -3,7 +3,6 @@ import torch import torch.nn as nn -from torch.nn.init import trunc_normal_ from transformers import BatchFeature, PretrainedConfig from vllm.attention import AttentionMetadata @@ -216,9 +215,7 @@ def __init__( self.num_heads = num_heads self.query = nn.Parameter( - torch.zeros(max(patch_to_query_dict.values()), self.embed_dim)) - - trunc_normal_(self.query, std=0.02) + torch.empty(max(patch_to_query_dict.values()), self.embed_dim)) self.cross_attn = CrossAttention(kv_dim, embed_dim, num_heads) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 712022502539b..8f36437d47d9e 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -141,8 +141,6 @@ def __init__(self, self.max_size = max_size self._set_2d_pos_cache(self.max_size) - self.apply(self._init_weights) - def _set_2d_pos_cache(self, max_size: Tuple[int, int], device: torch.types.Device = "cpu") -> None: From 47831430cc943cd470d38d27f8c69a5782795ec3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Sun, 5 Jan 2025 00:07:59 +0800 Subject: [PATCH 076/462] [Bugfix][V1] Fix test_kv_cache_utils.py (#11738) Signed-off-by: Jee Jee Li --- tests/v1/core/test_kv_cache_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index faa3a91de151f..2ed70b42991b5 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -147,12 +147,12 @@ def test_generate_block_hash_extra_keys(): # Test with no extra keys extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 5, 0) - assert extra_keys == (("hash1", 0), ) + assert extra_keys == ("hash1", ) assert next_mm_idx == 1 # Test with partial overlap extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 3, 8, 0) - assert extra_keys == (("hash1", 3), ) + assert extra_keys == ("hash1", ) assert next_mm_idx == 1 # Test with no overlap @@ -162,7 +162,7 @@ def test_generate_block_hash_extra_keys(): # Test with multiple extra keys extra_keys, next_mm_idx = generate_block_hash_extra_keys(request, 0, 15, 0) - assert extra_keys == (("hash1", 0), ("hash2", 0)) + assert extra_keys == ('hash1', 'hash2') assert next_mm_idx == 2 @@ -216,11 +216,11 @@ def test_hash_request_tokens(): # Check the first block assert block_hashes[0].token_ids == (0, 1, 2) - assert block_hashes[0].extra_keys == (("hash1", 0), ) + assert block_hashes[0].extra_keys == ("hash1", ) # Check the second block assert block_hashes[1].token_ids == (3, 4, 5) - assert block_hashes[1].extra_keys == (("hash2", 0), ) + assert block_hashes[1].extra_keys == ("hash2", ) def test_hash_request_tokens_no_mm_inputs(): From 4068f4b5b5dc5e2d1114be0cbb126bc44fb4e906 Mon Sep 17 00:00:00 2001 From: Lu Fang <30275821+houseroad@users.noreply.github.com> Date: Sat, 4 Jan 2025 17:20:34 -0800 Subject: [PATCH 077/462] [MISC] Replace c10::optional with std::optional (#11730) Signed-off-by: Lu Fang --- csrc/attention/paged_attention_v1.cu | 4 +- csrc/attention/paged_attention_v2.cu | 4 +- csrc/cpu/attention.cpp | 8 ++-- csrc/cpu/quant.cpp | 10 ++-- csrc/cpu/torch_bindings.cpp | 6 +-- .../epilogue/scaled_mm_epilogues_c2x.hpp | 6 +-- .../epilogue/scaled_mm_epilogues_c3x.hpp | 6 +-- csrc/cutlass_extensions/torch_utils.hpp | 2 +- csrc/mamba/causal_conv1d/causal_conv1d.cu | 24 +++++----- csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 22 ++++----- csrc/ops.h | 46 +++++++++---------- .../compressed_tensors/int8_quant_kernels.cu | 4 +- .../cutlass_w8a8/scaled_mm_c2x.cu | 18 ++++---- .../cutlass_w8a8/scaled_mm_c3x.cu | 6 +-- .../cutlass_w8a8/scaled_mm_entry.cu | 30 ++++++------ csrc/quantization/machete/generate.py | 2 +- .../machete/machete_mm_kernel.cuh | 10 ++-- .../machete/machete_mm_launcher.cuh | 24 +++++----- .../machete/machete_prepack_launcher.cuh | 2 +- csrc/quantization/machete/machete_pytorch.cu | 26 +++++------ csrc/rocm/attention.cu | 4 +- csrc/rocm/ops.h | 2 +- csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu | 2 +- csrc/sparse/cutlass/sparse_scaled_mm_entry.cu | 4 +- 24 files changed, 136 insertions(+), 136 deletions(-) diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index cb1a069942069..27321148f6dda 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -53,7 +53,7 @@ void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, + const std::optional& alibi_slopes, float k_scale, float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { @@ -176,7 +176,7 @@ void paged_attention_v1( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index c457bdb89008e..a453b2243e48c 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -54,7 +54,7 @@ void paged_attention_v2_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, + const std::optional& alibi_slopes, float k_scale, float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { @@ -187,7 +187,7 @@ void paged_attention_v2( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e21832ba7582f..ef5b14088c63b 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes) { + const std::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -459,7 +459,7 @@ void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, @@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const c10::optional& alibi_slopes) { + int max_seq_len, const std::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -781,7 +781,7 @@ void paged_attention_v2( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index d9aed657a3113..33b1637832888 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major const torch::Tensor& b, // [IC, OC], column-major const torch::Tensor& a_scales, // [1] or [M] const torch::Tensor& b_scales, // [1] or [OC] - const c10::optional& bias // [OC] + const std::optional& bias // [OC] ) { CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) // Checks for conformality @@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major const torch::Tensor& a_scales, // [1] or [M] const torch::Tensor& b_scales, // [1] or [OC] const torch::Tensor& azp_adj, // [OC] - const c10::optional& azp, // [1] or [M] - const c10::optional& bias // [OC] + const std::optional& azp, // [1] or [M] + const std::optional& bias // [OC] ) { CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) // Checks for conformality @@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] const torch::Tensor& input, // [..., hidden_size] const torch::Tensor& scale, - c10::optional const& azp) { + std::optional const& azp) { CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); @@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant( torch::Tensor& out, // [..., hidden_size] const torch::Tensor& input, // [..., hidden_size] torch::Tensor& scale, // [..., 1] - c10::optional const& azp) { + std::optional const& azp) { CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 03beefbc6de7d..74e4d8189d403 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& a_scales, const torch::Tensor& b_scales, - const c10::optional& bias); + const std::optional& bias); void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& a_scales, const torch::Tensor& b_scales, const torch::Tensor& azp_adj, - const c10::optional& azp, - const c10::optional& bias); + const std::optional& azp, + const std::optional& bias); TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp index 26f7423fd7455..ef413e6dd75c5 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp @@ -68,7 +68,7 @@ struct ScaledEpilogueBase { // This overload handles the case where there might not be a tensor, in which // case a nullptr is passed and a constant (0) is used. template - static auto args_from_tensor(c10::optional const& tensor) { + static auto args_from_tensor(std::optional const& tensor) { static_assert(std::is_same_v>); using Arguments = typename Descriptor::Arguments; auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; @@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp static ArgumentType prepare_args(torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); @@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken torch::Tensor const& b_scales, torch::Tensor const& azp_adj, torch::Tensor const& azp, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index c723adf126422..c590c66a66652 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -67,7 +67,7 @@ struct ScaledEpilogueBase { // This overload handles the case where there might not be a tensor, in which // case a nullptr is passed and a constant (0) is used. template - static auto args_from_tensor(c10::optional const& tensor) { + static auto args_from_tensor(std::optional const& tensor) { using Arguments = typename Descriptor::Arguments; auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; static_assert(std::is_same_v> || @@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp static ArgumentType prepare_args(torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); @@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken torch::Tensor const& b_scales, torch::Tensor const& azp_adj, torch::Tensor const& azp, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); diff --git a/csrc/cutlass_extensions/torch_utils.hpp b/csrc/cutlass_extensions/torch_utils.hpp index 2c78572521eec..a1ff933cce63f 100644 --- a/csrc/cutlass_extensions/torch_utils.hpp +++ b/csrc/cutlass_extensions/torch_utils.hpp @@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor, template static inline auto maybe_make_cute_layout( - c10::optional const& tensor, + std::optional const& tensor, std::string_view name = "tensor") { using Layout = decltype(make_cute_layout(*tensor)); diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index dd1e6de2e0180..f0e5533bcae60 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -53,12 +53,12 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, const at::Tensor x, const at::Tensor weight, const at::Tensor out, - const c10::optional& bias, + const std::optional& bias, bool silu_activation, int64_t pad_slot_id, - const c10::optional& query_start_loc = std::nullopt, - const c10::optional& cache_indices = std::nullopt, - const c10::optional& has_initial_state = std::nullopt) { + const std::optional& query_start_loc = std::nullopt, + const std::optional& cache_indices = std::nullopt, + const std::optional& has_initial_state = std::nullopt) { // Reset the parameters memset(¶ms, 0, sizeof(params)); @@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, - const c10::optional &bias_, - const c10::optional &conv_states, - const c10::optional &query_start_loc, - const c10::optional &cache_indices, - const c10::optional &has_initial_state, + const std::optional &bias_, + const std::optional &conv_states, + const std::optional &query_start_loc, + const std::optional &cache_indices, + const std::optional &has_initial_state, bool silu_activation, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early @@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, void causal_conv1d_update(const at::Tensor &x, const at::Tensor &conv_state, const at::Tensor &weight, - const c10::optional &bias_, + const std::optional &bias_, bool silu_activation, - const c10::optional &cache_seqlens_, - const c10::optional &conv_state_indices_, + const std::optional &cache_seqlens_, + const std::optional &conv_state_indices_, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early int64_t pad_slot_id) { diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index 71624696338d0..bd0a34119c82b 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, const torch::Tensor out, const torch::Tensor z, const torch::Tensor out_z, - const c10::optional& D, - const c10::optional& delta_bias, + const std::optional& D, + const std::optional& delta_bias, const torch::Tensor ssm_states, bool has_z, bool delta_softplus, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, bool varlen, int64_t pad_slot_id) { @@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C, - const c10::optional &D_, - const c10::optional &z_, - const c10::optional &delta_bias_, + const std::optional &D_, + const std::optional &z_, + const std::optional &delta_bias_, bool delta_softplus, - const c10::optional &query_start_loc, - const c10::optional &cache_indices, - const c10::optional &has_initial_state, + const std::optional &query_start_loc, + const std::optional &cache_indices, + const std::optional &has_initial_state, const torch::Tensor &ssm_states, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early diff --git a/csrc/ops.h b/csrc/ops.h index 347c502845d8f..9efd9b0c24700 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -33,7 +33,7 @@ void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, @@ -44,7 +44,7 @@ void paged_attention_v2( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, @@ -153,15 +153,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability); @@ -169,7 +169,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& e, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, torch::Tensor& e, torch::Tensor const& a); @@ -177,11 +177,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor const& scale, - c10::optional const& azp); + std::optional const& azp); void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scales, - c10::optional const& azp); + std::optional const& azp); torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, torch::Tensor b_gptq_qzeros, @@ -198,34 +198,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, void dynamic_per_token_scaled_fp8_quant( torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, - c10::optional const& scale_ub); + std::optional const& scale_ub); void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A, const torch::Tensor& B, const torch::Tensor& C, - const c10::optional& D_, - const c10::optional& z_, - const c10::optional& delta_bias_, + const std::optional& D_, + const std::optional& z_, + const std::optional& delta_bias_, bool delta_softplus, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, const torch::Tensor& ssm_states, int64_t pad_slot_id); void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight, - const c10::optional& bias_, + const std::optional& bias_, bool silu_activation, - const c10::optional& cache_seqlens_, - const c10::optional& conv_state_indices_, + const std::optional& cache_seqlens_, + const std::optional& conv_state_indices_, int64_t pad_slot_id); void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, - const c10::optional& bias_, - const c10::optional& conv_states, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& bias_, + const std::optional& conv_states, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, bool silu_activation, int64_t pad_slot_id); #ifndef USE_ROCM diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index e9987535bd3ea..e79785827189d 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel( void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] torch::Tensor const& input, // [..., hidden_size] torch::Tensor const& scale, - c10::optional const& azp) { + std::optional const& azp) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scale.numel() == 1); @@ -257,7 +257,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] void dynamic_scaled_int8_quant( torch::Tensor& out, // [..., hidden_size] torch::Tensor const& input, // [..., hidden_size] - torch::Tensor& scales, c10::optional const& azp) { + torch::Tensor& scales, std::optional const& azp) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scales.is_contiguous()); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index dbb72e8bbd3f5..865fef5aeea11 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); @@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); @@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 123f4359c0d1a..e18d7d79e5b77 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 4f7b6588ef3f7..3f2b52624f366 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #endif void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, @@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); #if defined CUDA_VERSION && CUDA_VERSION >= 12000 void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, @@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); #endif bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { @@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && @@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 2df4d181902f8..a9b5ddf4cbdd2 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -63,7 +63,7 @@ static inline std::optional maybe_scalartype( - c10::optional const& t) { + std::optional const& t) { if (!t) { return std::nullopt; } else { diff --git a/csrc/quantization/machete/machete_mm_kernel.cuh b/csrc/quantization/machete/machete_mm_kernel.cuh index d4d19ae5deec7..e4af067915e0a 100644 --- a/csrc/quantization/machete/machete_mm_kernel.cuh +++ b/csrc/quantization/machete/machete_mm_kernel.cuh @@ -183,11 +183,11 @@ struct MacheteKernelTemplate { torch::Tensor const& A, // MxK matrix torch::Tensor const& B, // KxN prepacked matrix torch::Tensor& D, // MxN matrix - c10::optional const& maybe_g_scales, // scale_KxN matrix - c10::optional const& maybe_g_zeros, // scale_KxN matrix - c10::optional maybe_group_size, - c10::optional const& maybe_ch_scales, // len N vector - c10::optional const& maybe_tok_scales) // len M vector + std::optional const& maybe_g_scales, // scale_KxN matrix + std::optional const& maybe_g_zeros, // scale_KxN matrix + std::optional maybe_group_size, + std::optional const& maybe_ch_scales, // len N vector + std::optional const& maybe_tok_scales) // len M vector { static_assert(!with_group_zeropoints || with_group_scales); diff --git a/csrc/quantization/machete/machete_mm_launcher.cuh b/csrc/quantization/machete/machete_mm_launcher.cuh index 4b0da5b303e0c..cabe0af46f069 100644 --- a/csrc/quantization/machete/machete_mm_launcher.cuh +++ b/csrc/quantization/machete/machete_mm_launcher.cuh @@ -13,23 +13,23 @@ struct MMArgs { torch::Tensor const& A; torch::Tensor const& B; vllm::ScalarType const& b_type; - c10::optional const& maybe_out_type; - c10::optional const& maybe_group_scales; - c10::optional const& maybe_group_zeros; - c10::optional maybe_group_size; - c10::optional const& maybe_channel_scales; - c10::optional const& maybe_token_scales; - c10::optional maybe_schedule; + std::optional const& maybe_out_type; + std::optional const& maybe_group_scales; + std::optional const& maybe_group_zeros; + std::optional maybe_group_size; + std::optional const& maybe_channel_scales; + std::optional const& maybe_token_scales; + std::optional maybe_schedule; }; struct SupportedSchedulesArgs { at::ScalarType a_type; vllm::ScalarType b_type; - c10::optional maybe_group_scales_type; - c10::optional maybe_group_zeros_type; - c10::optional maybe_channel_scales_type; - c10::optional maybe_token_scales_type; - c10::optional maybe_out_type; + std::optional maybe_group_scales_type; + std::optional maybe_group_zeros_type; + std::optional maybe_channel_scales_type; + std::optional maybe_token_scales_type; + std::optional maybe_out_type; }; torch::Tensor mm_dispatch(MMArgs args); diff --git a/csrc/quantization/machete/machete_prepack_launcher.cuh b/csrc/quantization/machete/machete_prepack_launcher.cuh index 3486d28be2126..634b651a4d107 100644 --- a/csrc/quantization/machete/machete_prepack_launcher.cuh +++ b/csrc/quantization/machete/machete_prepack_launcher.cuh @@ -10,7 +10,7 @@ struct PrepackBArgs { torch::Tensor const& B; at::ScalarType a_type; vllm::ScalarType b_type; - c10::optional maybe_group_scales_type; + std::optional maybe_group_scales_type; }; template diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index da2c2fb0d3e77..05a51ee21ddb7 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -10,11 +10,11 @@ using namespace vllm; std::vector supported_schedules( at::ScalarType a_type, int64_t b_type_id, - c10::optional maybe_group_scales_type, - c10::optional maybe_group_zeros_type, - c10::optional maybe_channel_scales_type, - c10::optional maybe_token_scales_type, - c10::optional maybe_out_type) { + std::optional maybe_group_scales_type, + std::optional maybe_group_zeros_type, + std::optional maybe_channel_scales_type, + std::optional maybe_token_scales_type, + std::optional maybe_out_type) { ScalarType const b_type = ScalarType::from_id(b_type_id); return supported_schedules_dispatch({ .a_type = a_type, @@ -29,13 +29,13 @@ std::vector supported_schedules( torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, int64_t b_type_id, - c10::optional const& maybe_out_type, - c10::optional const& maybe_group_scales, - c10::optional const& maybe_group_zeros, - c10::optional maybe_group_size, - c10::optional const& maybe_channel_scales, - c10::optional const& maybe_token_scales, - c10::optional maybe_schedule) { + std::optional const& maybe_out_type, + std::optional const& maybe_group_scales, + std::optional const& maybe_group_zeros, + std::optional maybe_group_size, + std::optional const& maybe_channel_scales, + std::optional const& maybe_token_scales, + std::optional maybe_schedule) { ScalarType const b_type = ScalarType::from_id(b_type_id); return mm_dispatch({.A = A, .B = B, @@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, torch::Tensor prepack_B( torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id, - c10::optional const& maybe_group_scales_type) { + std::optional const& maybe_group_scales_type) { ScalarType const b_type = ScalarType::from_id(b_type_id); return prepack_B_dispatch( {.B = B, diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index b48348a515c8d..0fec9624c457e 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -928,7 +928,7 @@ void paged_attention_custom_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, const int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& context_lens, - int max_context_len, const c10::optional& alibi_slopes, + int max_context_len, const std::optional& alibi_slopes, float k_scale, float v_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); @@ -1086,7 +1086,7 @@ void paged_attention( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& context_lens, // [num_seqs] int64_t block_size, int64_t max_context_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale) { const int head_size = query.size(2); if (kv_cache_dtype == "auto") { diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index 9f085115a3956..34b2f9ce8a4c4 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, double scale, torch::Tensor& block_tables, torch::Tensor& context_lens, int64_t block_size, int64_t max_context_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale); diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu index 6223dc8cca704..5a1879787c328 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& bt_meta, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu index d464b045b895f..371de0950bc99 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu @@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& e, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #endif void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, @@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& bt_meta, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) && From 635b897246da121238454ed4b2bbc87cb4d4166b Mon Sep 17 00:00:00 2001 From: cennn <61925104+cennn@users.noreply.github.com> Date: Sun, 5 Jan 2025 23:09:11 +0800 Subject: [PATCH 078/462] [distributed] remove pynccl's redundant stream (#11744) --- tests/distributed/test_pynccl.py | 5 ++-- .../device_communicators/pynccl.py | 28 ++++++------------- vllm/distributed/parallel_state.py | 3 +- 3 files changed, 12 insertions(+), 24 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index 36cfe42251384..a77b48d5e49f3 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -137,9 +137,8 @@ def worker_fn_with_cudagraph(): # run something in the default stream to initialize torch engine a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}') torch.cuda.synchronize() - with torch.cuda.graph( - graph, stream=pynccl_comm.stream), pynccl_comm.change_state( - enable=True): + with torch.cuda.graph(graph), \ + pynccl_comm.change_state(enable=True): a_out = pynccl_comm.all_reduce(a) torch.cuda.synchronize() graph.replay() diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index a6800f93f167b..93d96fd8f5686 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -51,7 +51,6 @@ def __init__( if self.world_size == 1: self.available = False self.disabled = True - self.stream = None return try: self.nccl = NCCLLibrary(library_path) @@ -60,7 +59,6 @@ def __init__( # e.g. in a non-GPU environment self.available = False self.disabled = True - self.stream = None return self.available = True @@ -98,12 +96,12 @@ def __init__( with torch.cuda.device(device): self.comm: ncclComm_t = self.nccl.ncclCommInitRank( self.world_size, self.unique_id, self.rank) - self.stream = torch.cuda.Stream() + stream = torch.cuda.current_stream() # A small all_reduce for warmup. data = torch.zeros(1, device=device) self.all_reduce(data) - self.stream.synchronize() + stream.synchronize() del data def all_reduce(self, @@ -122,7 +120,7 @@ def all_reduce(self, out_tensor = torch.empty_like(in_tensor) if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclAllReduce(buffer_type(in_tensor.data_ptr()), buffer_type(out_tensor.data_ptr()), in_tensor.numel(), @@ -144,7 +142,7 @@ def all_gather(self, f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {input_tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclAllGather( buffer_type(input_tensor.data_ptr()), buffer_type(output_tensor.data_ptr()), input_tensor.numel(), @@ -165,7 +163,7 @@ def reduce_scatter(self, f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {input_tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclReduceScatter( buffer_type(input_tensor.data_ptr()), buffer_type(output_tensor.data_ptr()), output_tensor.numel(), @@ -180,7 +178,7 @@ def send(self, tensor: torch.Tensor, dst: int, stream=None): f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, self.comm, cudaStream_t(stream.cuda_stream)) @@ -192,7 +190,7 @@ def recv(self, tensor: torch.Tensor, src: int, stream=None): f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) @@ -204,7 +202,7 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None): f"this nccl communicator is created to work on {self.device}, " f"but the input tensor is on {tensor.device}") if stream is None: - stream = self.stream + stream = torch.cuda.current_stream() if src == self.rank: sendbuff = buffer_type(tensor.data_ptr()) # NCCL requires the sender also to have a receive buffer @@ -217,9 +215,7 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None): self.comm, cudaStream_t(stream.cuda_stream)) @contextmanager - def change_state(self, - enable: Optional[bool] = None, - stream: Optional[torch.cuda.Stream] = None): + def change_state(self, enable: Optional[bool] = None): """ A context manager to change the state of the communicator. """ @@ -227,15 +223,9 @@ def change_state(self, # guess a default value when not specified enable = self.available - if stream is None: - stream = self.stream - old_disable = self.disabled - old_stream = self.stream - self.stream = stream self.disabled = not enable yield self.disabled = old_disable - self.stream = old_stream diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a0d4235460f3b..dccd3addbcb35 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -310,8 +310,7 @@ def graph_capture( if not pynccl_comm: maybe_pynccl_context = nullcontext() else: - maybe_pynccl_context = pynccl_comm.change_state( - stream=torch.cuda.current_stream()) + maybe_pynccl_context = pynccl_comm.change_state() with maybe_pynccl_context: yield graph_capture_context From eba17173d34548a39989eae2530dce53496a1f3d Mon Sep 17 00:00:00 2001 From: Lancer <402430575@qq.com> Date: Mon, 6 Jan 2025 00:48:16 +0800 Subject: [PATCH 079/462] fix: [doc] fix typo (#11751) Co-authored-by: Lancer --- vllm/core/block/block_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index dca0b3fe8d304..90c1438efbd08 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -23,7 +23,7 @@ class BlockTable: blocks to initialize the BlockTable with. If not provided, an empty BlockTable is created. max_block_sliding_window (Optional[int], optional): The number of - blocks to keep around for each sequance. If None, all blocks + blocks to keep around for each sequence. If None, all blocks are kept (eg., when sliding window is not used). It should at least fit the sliding window size of the model. From 33fc1e2e86ce5d60940463f8f71daaa61728d3b7 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Sun, 5 Jan 2025 16:35:01 -0500 Subject: [PATCH 080/462] [Frontend] Improve `StreamingResponse` Exception Handling (#11752) --- vllm/entrypoints/openai/serving_chat.py | 4 ++-- vllm/entrypoints/openai/serving_completion.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9ba5eeb7709c9..89a119ac65695 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -301,7 +301,7 @@ async def chat_completion_stream_generator( ] * num_choices else: tool_parsers = [None] * num_choices - except RuntimeError as e: + except Exception as e: logger.exception("Error in tool parser creation.") data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" @@ -591,7 +591,7 @@ async def chat_completion_stream_generator( completion_tokens=num_completion_tokens, total_tokens=num_prompt_tokens + num_completion_tokens) - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error logger.exception("Error in chat completion stream generator.") data = self.create_streaming_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 17197dce8da23..2c9c20caf8119 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -371,7 +371,7 @@ async def completion_stream_generator( # report to FastAPI middleware aggregate usage across all choices request_metadata.final_usage_info = final_usage_info - except ValueError as e: + except Exception as e: # TODO: Use a vllm-specific Validation Error data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" From 9e764e7b105a483ebc702cad33922ba8d8c210e1 Mon Sep 17 00:00:00 2001 From: cennn <61925104+cennn@users.noreply.github.com> Date: Mon, 6 Jan 2025 09:05:48 +0800 Subject: [PATCH 081/462] [distributed] remove pynccl's redundant change_state (#11749) --- tests/distributed/test_pynccl.py | 64 ++++++++----------- .../device_communicators/pynccl.py | 17 ----- vllm/distributed/parallel_state.py | 9 +-- 3 files changed, 28 insertions(+), 62 deletions(-) diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py index a77b48d5e49f3..a8571a1157892 100644 --- a/tests/distributed/test_pynccl.py +++ b/tests/distributed/test_pynccl.py @@ -59,8 +59,7 @@ def worker_fn(): device=get_world_group().device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) - with pynccl_comm.change_state(enable=True): - tensor = pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) torch.cuda.synchronize() assert torch.all(tensor == pynccl_comm.world_size).cpu().item() @@ -81,17 +80,16 @@ def multiple_allreduce_worker_fn(): group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1] pynccl_comm = PyNcclCommunicator(group=group, device=device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) - with pynccl_comm.change_state(enable=True): - # two groups can communicate independently - if torch.distributed.get_rank() in [0, 1]: - tensor = pynccl_comm.all_reduce(tensor) - tensor = pynccl_comm.all_reduce(tensor) - torch.cuda.synchronize() - assert torch.all(tensor == 4).cpu().item() - else: - tensor = pynccl_comm.all_reduce(tensor) - torch.cuda.synchronize() - assert torch.all(tensor == 2).cpu().item() + # two groups can communicate independently + if torch.distributed.get_rank() in [0, 1]: + tensor = pynccl_comm.all_reduce(tensor) + tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() + assert torch.all(tensor == 4).cpu().item() + else: + tensor = pynccl_comm.all_reduce(tensor) + torch.cuda.synchronize() + assert torch.all(tensor == 2).cpu().item() @pytest.mark.skipif(torch.cuda.device_count() < 4, @@ -137,8 +135,7 @@ def worker_fn_with_cudagraph(): # run something in the default stream to initialize torch engine a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}') torch.cuda.synchronize() - with torch.cuda.graph(graph), \ - pynccl_comm.change_state(enable=True): + with torch.cuda.graph(graph): a_out = pynccl_comm.all_reduce(a) torch.cuda.synchronize() graph.replay() @@ -167,8 +164,7 @@ def all_gather_worker_fn(): for r in range(world_size) ]).to(device) - with pynccl_comm.change_state(enable=True): - pynccl_comm.all_gather(result, tensor) + pynccl_comm.all_gather(result, tensor) torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -205,8 +201,7 @@ def reduce_scatter_worker_fn(): expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size] for tensor in all_tensors).to(device) - with pynccl_comm.change_state(enable=True): - pynccl_comm.reduce_scatter(result, tensor) + pynccl_comm.reduce_scatter(result, tensor) torch.cuda.synchronize() torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) @@ -233,15 +228,13 @@ def send_recv_worker_fn(): else: tensor = torch.empty(16, 1024, 1024, dtype=torch.float32).cuda(pynccl_comm.rank) - with pynccl_comm.change_state(enable=True): - if pynccl_comm.rank == 0: - pynccl_comm.send(tensor, - dst=(pynccl_comm.rank + 1) % - pynccl_comm.world_size) - else: - pynccl_comm.recv(tensor, - src=(pynccl_comm.rank - 1) % - pynccl_comm.world_size) + + if pynccl_comm.rank == 0: + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) + else: + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() assert torch.all(tensor == 1).cpu().item() @@ -272,15 +265,12 @@ def multiple_send_recv_worker_fn(): 1024, dtype=torch.float32, device=device) - with pynccl_comm.change_state(enable=True): - if torch.distributed.get_rank() in [0, 1]: - pynccl_comm.send(tensor, - dst=(pynccl_comm.rank + 1) % - pynccl_comm.world_size) - else: - pynccl_comm.recv(tensor, - src=(pynccl_comm.rank - 1) % - pynccl_comm.world_size) + if torch.distributed.get_rank() in [0, 1]: + pynccl_comm.send(tensor, + dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size) + else: + pynccl_comm.recv(tensor, + src=(pynccl_comm.rank - 1) % pynccl_comm.world_size) torch.cuda.synchronize() if torch.distributed.get_rank() in [0, 2]: assert torch.all(tensor == 1).cpu().item() diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 93d96fd8f5686..fda4d007ceb5b 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -1,4 +1,3 @@ -from contextlib import contextmanager from typing import Optional, Union # ===================== import region ===================== @@ -213,19 +212,3 @@ def broadcast(self, tensor: torch.Tensor, src: int, stream=None): self.nccl.ncclBroadcast(sendbuff, recvbuff, tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, self.comm, cudaStream_t(stream.cuda_stream)) - - @contextmanager - def change_state(self, enable: Optional[bool] = None): - """ - A context manager to change the state of the communicator. - """ - if enable is None: - # guess a default value when not specified - enable = self.available - - old_disable = self.disabled - - self.disabled = not enable - yield - - self.disabled = old_disable diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index dccd3addbcb35..a837c1dc5953b 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -305,14 +305,7 @@ def graph_capture( stream.wait_stream(curr_stream) with torch.cuda.stream(stream), maybe_ca_context: - pynccl_comm = self.pynccl_comm - maybe_pynccl_context: Any - if not pynccl_comm: - maybe_pynccl_context = nullcontext() - else: - maybe_pynccl_context = pynccl_comm.change_state() - with maybe_pynccl_context: - yield graph_capture_context + yield graph_capture_context def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: """ From 402d37836059463c7ec8b1e25d40c29138f1dd40 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 6 Jan 2025 10:18:33 +0800 Subject: [PATCH 082/462] [Doc] [1/N] Reorganize Getting Started section (#11645) Signed-off-by: DarkLight1337 --- docs/source/design/arch_overview.md | 3 +-- docs/source/design/multiprocessing.md | 2 +- docs/source/{usage => getting_started}/faq.md | 0 .../cpu-arm.md} | 2 +- .../cpu-x86.md} | 6 +++--- .../gpu-cuda.md} | 4 ++-- .../gpu-rocm.md} | 2 +- .../hpu-gaudi.md} | 4 +++- .../getting_started/installation/index.md | 19 +++++++++++++++++++ .../neuron.md} | 2 +- .../openvino.md} | 4 ++-- .../tpu.md} | 2 +- .../xpu.md} | 2 +- docs/source/getting_started/quickstart.md | 2 +- .../{debugging.md => troubleshooting.md} | 11 ++++++----- docs/source/index.md | 16 ++++------------ docs/source/models/generative_models.md | 2 +- docs/source/models/pooling_models.md | 2 +- docs/source/serving/distributed_serving.md | 2 +- docs/source/usage/spec_decode.md | 4 ++-- docs/source/usage/structured_outputs.md | 2 +- vllm/utils.py | 2 +- 22 files changed, 54 insertions(+), 41 deletions(-) rename docs/source/{usage => getting_started}/faq.md (100%) rename docs/source/getting_started/{arm-installation.md => installation/cpu-arm.md} (92%) rename docs/source/getting_started/{cpu-installation.md => installation/cpu-x86.md} (95%) rename docs/source/getting_started/{installation.md => installation/gpu-cuda.md} (99%) rename docs/source/getting_started/{amd-installation.md => installation/gpu-rocm.md} (99%) rename docs/source/getting_started/{gaudi-installation.md => installation/hpu-gaudi.md} (99%) create mode 100644 docs/source/getting_started/installation/index.md rename docs/source/getting_started/{neuron-installation.md => installation/neuron.md} (99%) rename docs/source/getting_started/{openvino-installation.md => installation/openvino.md} (90%) rename docs/source/getting_started/{tpu-installation.md => installation/tpu.md} (99%) rename docs/source/getting_started/{xpu-installation.md => installation/xpu.md} (98%) rename docs/source/getting_started/{debugging.md => troubleshooting.md} (94%) diff --git a/docs/source/design/arch_overview.md b/docs/source/design/arch_overview.md index 475a3e5fa9ddc..2f1280c047672 100644 --- a/docs/source/design/arch_overview.md +++ b/docs/source/design/arch_overview.md @@ -77,8 +77,7 @@ python -m vllm.entrypoints.openai.api_server --model That code can be found in . -More details on the API server can be found in the {doc}`OpenAI Compatible -Server ` document. +More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document. ## LLM Engine diff --git a/docs/source/design/multiprocessing.md b/docs/source/design/multiprocessing.md index 34564413b34f6..da87638e5b743 100644 --- a/docs/source/design/multiprocessing.md +++ b/docs/source/design/multiprocessing.md @@ -2,7 +2,7 @@ ## Debugging -Please see the [Debugging Tips](#debugging-python-multiprocessing) +Please see the [Troubleshooting](#troubleshooting-python-multiprocessing) page for information on known issues and how to solve them. ## Introduction diff --git a/docs/source/usage/faq.md b/docs/source/getting_started/faq.md similarity index 100% rename from docs/source/usage/faq.md rename to docs/source/getting_started/faq.md diff --git a/docs/source/getting_started/arm-installation.md b/docs/source/getting_started/installation/cpu-arm.md similarity index 92% rename from docs/source/getting_started/arm-installation.md rename to docs/source/getting_started/installation/cpu-arm.md index 799b597b3ad5d..a46e2c010600d 100644 --- a/docs/source/getting_started/arm-installation.md +++ b/docs/source/getting_started/installation/cpu-arm.md @@ -2,7 +2,7 @@ # Installation for ARM CPUs -vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the x86 platform documentation covering: +vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. This guide provides installation instructions specific to ARM. For additional details on supported features, refer to the [x86 CPU documentation](#installation-x86) covering: - CPU backend inference capabilities - Relevant runtime environment variables diff --git a/docs/source/getting_started/cpu-installation.md b/docs/source/getting_started/installation/cpu-x86.md similarity index 95% rename from docs/source/getting_started/cpu-installation.md rename to docs/source/getting_started/installation/cpu-x86.md index c3d3f715ed804..bbb2d1872ef39 100644 --- a/docs/source/getting_started/cpu-installation.md +++ b/docs/source/getting_started/installation/cpu-x86.md @@ -1,6 +1,6 @@ -(installation-cpu)= +(installation-x86)= -# Installation with CPU +# Installation for x86 CPUs vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: @@ -151,4 +151,4 @@ $ python examples/offline_inference.py $ VLLM_CPU_KVCACHE_SPACE=40 VLLM_CPU_OMP_THREADS_BIND="0-31|32-63" vllm serve meta-llama/Llama-2-7b-chat-hf -tp=2 --distributed-executor-backend mp ``` - - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](../serving/deploying_with_nginx.md) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). + - Using Data Parallel for maximum throughput: to launch an LLM serving endpoint on each NUMA node along with one additional load balancer to dispatch the requests to those endpoints. Common solutions like [Nginx](#nginxloadbalancer) or HAProxy are recommended. Anyscale Ray project provides the feature on LLM [serving](https://docs.ray.io/en/latest/serve/index.html). Here is the example to setup a scalable LLM serving with [Ray Serve](https://github.com/intel/llm-on-ray/blob/main/docs/setup.md). diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation/gpu-cuda.md similarity index 99% rename from docs/source/getting_started/installation.md rename to docs/source/getting_started/installation/gpu-cuda.md index 996fb346f43d4..7ea10bb8b59ff 100644 --- a/docs/source/getting_started/installation.md +++ b/docs/source/getting_started/installation/gpu-cuda.md @@ -1,6 +1,6 @@ -(installation)= +(installation-cuda)= -# Installation +# Installation for CUDA vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. diff --git a/docs/source/getting_started/amd-installation.md b/docs/source/getting_started/installation/gpu-rocm.md similarity index 99% rename from docs/source/getting_started/amd-installation.md rename to docs/source/getting_started/installation/gpu-rocm.md index 6d01efbbf8828..796911d7305a6 100644 --- a/docs/source/getting_started/amd-installation.md +++ b/docs/source/getting_started/installation/gpu-rocm.md @@ -1,6 +1,6 @@ (installation-rocm)= -# Installation with ROCm +# Installation for ROCm vLLM supports AMD GPUs with ROCm 6.2. diff --git a/docs/source/getting_started/gaudi-installation.md b/docs/source/getting_started/installation/hpu-gaudi.md similarity index 99% rename from docs/source/getting_started/gaudi-installation.md rename to docs/source/getting_started/installation/hpu-gaudi.md index 1f2ee62860dec..94de169f51a73 100644 --- a/docs/source/getting_started/gaudi-installation.md +++ b/docs/source/getting_started/installation/hpu-gaudi.md @@ -1,4 +1,6 @@ -# Installation with Intel® Gaudi® AI Accelerators +(installation-gaudi)= + +# Installation for Intel® Gaudi® This README provides instructions on running vLLM with Intel Gaudi devices. diff --git a/docs/source/getting_started/installation/index.md b/docs/source/getting_started/installation/index.md new file mode 100644 index 0000000000000..83de1aff409b2 --- /dev/null +++ b/docs/source/getting_started/installation/index.md @@ -0,0 +1,19 @@ +(installation-index)= + +# Installation + +vLLM supports the following hardware platforms: + +```{toctree} +:maxdepth: 1 + +gpu-cuda +gpu-rocm +cpu-x86 +cpu-arm +hpu-gaudi +tpu +xpu +openvino +neuron +``` diff --git a/docs/source/getting_started/neuron-installation.md b/docs/source/getting_started/installation/neuron.md similarity index 99% rename from docs/source/getting_started/neuron-installation.md rename to docs/source/getting_started/installation/neuron.md index baaeeb9f53a10..431f90537f543 100644 --- a/docs/source/getting_started/neuron-installation.md +++ b/docs/source/getting_started/installation/neuron.md @@ -1,6 +1,6 @@ (installation-neuron)= -# Installation with Neuron +# Installation for Neuron vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching. Paged Attention and Chunked Prefill are currently in development and will be available soon. diff --git a/docs/source/getting_started/openvino-installation.md b/docs/source/getting_started/installation/openvino.md similarity index 90% rename from docs/source/getting_started/openvino-installation.md rename to docs/source/getting_started/installation/openvino.md index 8b43c0a90447f..60f95fd1c4250 100644 --- a/docs/source/getting_started/openvino-installation.md +++ b/docs/source/getting_started/installation/openvino.md @@ -1,8 +1,8 @@ (installation-openvino)= -# Installation with OpenVINO +# Installation for OpenVINO -vLLM powered by OpenVINO supports all LLM models from {doc}`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: +vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)). OpenVINO vLLM backend supports the following advanced vLLM features: - Prefix caching (`--enable-prefix-caching`) - Chunked prefill (`--enable-chunked-prefill`) diff --git a/docs/source/getting_started/tpu-installation.md b/docs/source/getting_started/installation/tpu.md similarity index 99% rename from docs/source/getting_started/tpu-installation.md rename to docs/source/getting_started/installation/tpu.md index 4d3ac541c90ce..bc93c44fead30 100644 --- a/docs/source/getting_started/tpu-installation.md +++ b/docs/source/getting_started/installation/tpu.md @@ -1,6 +1,6 @@ (installation-tpu)= -# Installation with TPU +# Installation for TPUs Tensor Processing Units (TPUs) are Google's custom-developed application-specific integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs diff --git a/docs/source/getting_started/xpu-installation.md b/docs/source/getting_started/installation/xpu.md similarity index 98% rename from docs/source/getting_started/xpu-installation.md rename to docs/source/getting_started/installation/xpu.md index 9554ae4b7fb44..be4e3b9bd1bc5 100644 --- a/docs/source/getting_started/xpu-installation.md +++ b/docs/source/getting_started/installation/xpu.md @@ -1,6 +1,6 @@ (installation-xpu)= -# Installation with XPU +# Installation for XPUs vLLM initially supports basic model inferencing and serving on Intel GPU platform. diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md index 9c8b7e4f592c9..ff216f8af30f9 100644 --- a/docs/source/getting_started/quickstart.md +++ b/docs/source/getting_started/quickstart.md @@ -23,7 +23,7 @@ $ conda activate myenv $ pip install vllm ``` -Please refer to the {ref}`installation documentation ` for more details on installing vLLM. +Please refer to the [installation documentation](#installation-index) for more details on installing vLLM. (offline-batched-inference)= diff --git a/docs/source/getting_started/debugging.md b/docs/source/getting_started/troubleshooting.md similarity index 94% rename from docs/source/getting_started/debugging.md rename to docs/source/getting_started/troubleshooting.md index 19eb699572a08..5a0310da0f2cb 100644 --- a/docs/source/getting_started/debugging.md +++ b/docs/source/getting_started/troubleshooting.md @@ -1,8 +1,8 @@ -(debugging)= +(troubleshooting)= -# Debugging Tips +# Troubleshooting -This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. +This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. ```{note} Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated. @@ -47,6 +47,7 @@ You might also need to set `export NCCL_SOCKET_IFNAME=` If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph. To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error. +(troubleshooting-incorrect-hardware-driver)= ## Incorrect hardware/driver If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly. @@ -139,7 +140,7 @@ A multi-node environment is more complicated than a single-node one. If you see Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes. ``` -(debugging-python-multiprocessing)= +(troubleshooting-python-multiprocessing)= ## Python multiprocessing ### `RuntimeError` Exception @@ -150,7 +151,7 @@ If you have seen a warning in your logs like this: WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously initialized. We must use the `spawn` multiprocessing start method. Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See - https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing + https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing for more information. ``` diff --git a/docs/source/index.md b/docs/source/index.md index 34f9c4caebe6f..f390474978790 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -50,7 +50,7 @@ For more information, check out the following: - [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention) - [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023) - [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al. -- {ref}`vLLM Meetups `. +- [vLLM Meetups](#meetups) ## Documentation @@ -58,18 +58,11 @@ For more information, check out the following: :caption: Getting Started :maxdepth: 1 -getting_started/installation -getting_started/amd-installation -getting_started/openvino-installation -getting_started/cpu-installation -getting_started/gaudi-installation -getting_started/arm-installation -getting_started/neuron-installation -getting_started/tpu-installation -getting_started/xpu-installation +getting_started/installation/index getting_started/quickstart -getting_started/debugging getting_started/examples/examples_index +getting_started/troubleshooting +getting_started/faq ``` ```{toctree} @@ -110,7 +103,6 @@ usage/structured_outputs usage/spec_decode usage/compatibility_matrix usage/performance -usage/faq usage/engine_args usage/env_vars usage/usage_stats diff --git a/docs/source/models/generative_models.md b/docs/source/models/generative_models.md index 35e0302b86619..383299d61b5dd 100644 --- a/docs/source/models/generative_models.md +++ b/docs/source/models/generative_models.md @@ -120,7 +120,7 @@ outputs = llm.chat(conversation, chat_template=custom_template) ## Online Inference -Our [OpenAI Compatible Server](../serving/openai_compatible_server.md) provides endpoints that correspond to the offline APIs: +Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs: - [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text. - [Chat API](#chat-api) is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template. diff --git a/docs/source/models/pooling_models.md b/docs/source/models/pooling_models.md index 76c96c9edcc5d..12ded68eb30b5 100644 --- a/docs/source/models/pooling_models.md +++ b/docs/source/models/pooling_models.md @@ -106,7 +106,7 @@ A code example can be found here: for more information. +After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See for more information. ``` ```{warning} diff --git a/docs/source/usage/spec_decode.md b/docs/source/usage/spec_decode.md index 8302da81b6173..8c52c97a41e48 100644 --- a/docs/source/usage/spec_decode.md +++ b/docs/source/usage/spec_decode.md @@ -182,7 +182,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. **vLLM Logprob Stability** \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. + titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). **Conclusion** @@ -195,7 +195,7 @@ can occur due to following factors: **Mitigation Strategies** -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the {ref}`FAQs `. +For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq). ## Resources for vLLM contributors diff --git a/docs/source/usage/structured_outputs.md b/docs/source/usage/structured_outputs.md index 7292012e36a26..26c09bb0d8a0c 100644 --- a/docs/source/usage/structured_outputs.md +++ b/docs/source/usage/structured_outputs.md @@ -18,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters: - `guided_whitespace_pattern`: used to override the default whitespace pattern for guided json decoding. - `guided_decoding_backend`: used to select the guided decoding backend to use. -You can see the complete list of supported parameters on the [OpenAI Compatible Server](../serving/openai_compatible_server.md) page. +You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server)page. Now let´s see an example for each of the cases, starting with the `guided_choice`, as it´s the easiest one: diff --git a/vllm/utils.py b/vllm/utils.py index 8ef07d2c326a3..aadeddabf8b55 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1938,7 +1938,7 @@ def _check_multiproc_method(): "the `spawn` multiprocessing start method. Setting " "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. " "See https://docs.vllm.ai/en/latest/getting_started/" - "debugging.html#python-multiprocessing " + "troubleshooting.html#python-multiprocessing " "for more information.") os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" From 408e5600158bfa34306cfbd034a3779e488752fa Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Sun, 5 Jan 2025 20:49:55 -0800 Subject: [PATCH 083/462] [Bugfix] Remove block size constraint (#11723) --- vllm/config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index b51f9783008b2..b0ed88cb7f42b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1015,11 +1015,6 @@ def _verify_args(self) -> None: raise ValueError( "GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") - from vllm.platforms import current_platform - if (current_platform.is_cuda() and self.block_size is not None - and self.block_size > 32): - raise ValueError("CUDA Paged Attention kernel only supports " - f"block sizes up to 32. Got {self.block_size}.") def _verify_cache_dtype(self) -> None: if self.cache_dtype == "auto": From 06bfb51963953d6ae31b87965bfb91b6eca4fd24 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 6 Jan 2025 14:24:42 +0900 Subject: [PATCH 084/462] [V1] Add BlockTable class (#11693) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/block_table.py | 78 ++++++++++++++++++++++++++++++ vllm/v1/worker/gpu_input_batch.py | 25 ++++------ vllm/v1/worker/gpu_model_runner.py | 16 +++--- 3 files changed, 94 insertions(+), 25 deletions(-) create mode 100644 vllm/v1/worker/block_table.py diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py new file mode 100644 index 0000000000000..26a2084b131fa --- /dev/null +++ b/vllm/v1/worker/block_table.py @@ -0,0 +1,78 @@ +from typing import List + +import numpy as np +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class BlockTable: + + def __init__( + self, + max_num_reqs: int, + max_model_len: int, + max_num_blocks_per_req: int, + pin_memory: bool, + device: torch.device, + ): + self.max_num_reqs = max_num_reqs + self.max_model_len = max_model_len + self.max_num_blocks_per_req = max_num_blocks_per_req + self.pin_memory = pin_memory + self.device = device + + self.block_table = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device=self.device, + dtype=torch.int32, + ) + self.block_table_cpu = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.block_table_np = self.block_table_cpu.numpy() + self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32) + + def append_row( + self, + row_idx: int, + start: int, + block_ids: List[int], + ) -> None: + num_blocks = len(block_ids) + self.block_table_np[row_idx, start:start + num_blocks] = block_ids + self.num_blocks_per_row[row_idx] = start + num_blocks + + def add_row(self, row_idx: int, block_ids: List[int]) -> None: + self.append_row(row_idx, 0, block_ids) + + def move_row(self, src: int, tgt: int) -> None: + num_blocks = self.num_blocks_per_row[src] + self.block_table_np[tgt, :num_blocks] = self.block_table_np[ + src, :num_blocks] + self.num_blocks_per_row[tgt] = num_blocks + + def commit(self, num_reqs: int) -> None: + self.block_table[:num_reqs].copy_(self.block_table_cpu[:num_reqs], + non_blocking=True) + + def clear(self) -> None: + self.block_table.fill_(0) + self.block_table_cpu.fill_(0) + + def get_device_tensor(self) -> torch.Tensor: + """Ruturns the device tensor of the block table.""" + return self.block_table + + def get_cpu_tensor(self) -> torch.Tensor: + """Returns the CPU tensor of the block table.""" + return self.block_table_cpu + + def get_numpy_array(self) -> np.ndarray: + """Returns the numpy array of the block table.""" + return self.block_table_np diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index f8a1427c6c26c..40494e64b22f0 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -9,6 +9,7 @@ from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams, SamplingType from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.worker.block_table import BlockTable if TYPE_CHECKING: from vllm.multimodal.inputs import PlaceholderRange @@ -70,19 +71,14 @@ def __init__( self.num_prompt_tokens = np.zeros(max_num_reqs, dtype=np.int32) self.num_computed_tokens_cpu = np.empty(max_num_reqs, dtype=np.int32) - # Attention-related. - self.block_table = torch.zeros( - (max_num_reqs, max_num_blocks_per_req), - device=self.device, - dtype=torch.int32, - ) - self.block_table_cpu_tensor = torch.zeros( - (max_num_reqs, max_num_blocks_per_req), - device="cpu", - dtype=torch.int32, + # Block table. + self.block_table = BlockTable( + max_num_reqs=max_num_reqs, + max_model_len=max_model_len, + max_num_blocks_per_req=max_num_blocks_per_req, pin_memory=pin_memory, + device=device, ) - self.block_table_cpu = self.block_table_cpu_tensor.numpy() # Sampling-related. self.temperature = torch.empty((max_num_reqs, ), @@ -193,8 +189,7 @@ def add_request( self.num_tokens[req_index] = request.num_tokens self.num_computed_tokens_cpu[req_index] = request.num_computed_tokens - num_blocks = len(request.block_ids) - self.block_table_cpu[req_index, :num_blocks] = request.block_ids + self.block_table.add_row(req_index, request.block_ids) sampling_params = request.sampling_params self.temperature_cpu[req_index] = sampling_params.temperature @@ -300,9 +295,7 @@ def condense(self, empty_req_indices: List[int]) -> None: self.num_prompt_tokens[last_req_index] self.num_computed_tokens_cpu[ empty_index] = self.num_computed_tokens_cpu[last_req_index] - # TODO(woosuk): Optimize the copy of block_table_cpu. - self.block_table_cpu[empty_index] = self.block_table_cpu[ - last_req_index] + self.block_table.move_row(last_req_index, empty_index) self.temperature_cpu[empty_index] = self.temperature_cpu[ last_req_index] self.top_p_cpu[empty_index] = self.top_p_cpu[last_req_index] diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 294c76cfb680e..31e693235f99f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -211,10 +211,9 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if num_new_blocks == 0: continue start_index = len(req_state.block_ids) - end_index = start_index + num_new_blocks req_state.block_ids.extend(req_data.new_block_ids) - self.input_batch.block_table_cpu[ - req_index, start_index:end_index] = req_data.new_block_ids + self.input_batch.block_table.append_row(req_index, start_index, + req_data.new_block_ids) req_ids_to_add: List[str] = [] # Add new requests to the cached states. @@ -275,9 +274,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # OPTIMIZATION: Start copying the block table first. # This way, we can overlap the copy with the following CPU operations. - self.input_batch.block_table[:num_reqs].copy_( - self.input_batch.block_table_cpu_tensor[:num_reqs], - non_blocking=True) + self.input_batch.block_table.commit(num_reqs) # Get the number of scheduled tokens for each request. # TODO: The Python loop can be slow. Optimize. @@ -333,8 +330,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # NOTE(woosuk): We use torch.index_select instead of np.take here # because torch.index_select is much faster than np.take for large # tensors. - block_numbers = (self.input_batch.block_table_cpu_tensor.flatten() - [block_table_indices].numpy()) + block_table_cpu = self.input_batch.block_table.get_cpu_tensor() + block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() block_offsets = positions_np % self.block_size np.add(block_numbers * self.block_size, block_offsets, @@ -450,7 +447,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): query_start_loc=query_start_loc, max_seq_len=max_seq_len, seq_start_loc=seq_start_loc, - block_table=self.input_batch.block_table[:num_reqs], + block_table=( + self.input_batch.block_table.get_device_tensor()[:num_reqs]), slot_mapping=slot_mapping, use_cascade=use_cascade, common_prefix_len=common_prefix_len, From f8fcca100beada88136944976da88f47f363acab Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Sun, 5 Jan 2025 23:12:38 -0800 Subject: [PATCH 085/462] [Misc] Fix typo for valid_tool_parses (#11753) Signed-off-by: Rui Qiao --- vllm/entrypoints/openai/api_server.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e942b475535ad..047f699e4f277 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -767,11 +767,11 @@ async def run_server(args, **uvicorn_kwargs) -> None: if args.tool_parser_plugin and len(args.tool_parser_plugin) > 3: ToolParserManager.import_tool_parser(args.tool_parser_plugin) - valide_tool_parses = ToolParserManager.tool_parsers.keys() + valid_tool_parses = ToolParserManager.tool_parsers.keys() if args.enable_auto_tool_choice \ - and args.tool_call_parser not in valide_tool_parses: + and args.tool_call_parser not in valid_tool_parses: raise KeyError(f"invalid tool call parser: {args.tool_call_parser} " - f"(chose from {{ {','.join(valide_tool_parses)} }})") + f"(chose from {{ {','.join(valid_tool_parses)} }})") # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. From 022c5c6944bcf28ac4d0d28ce14f2b559358be52 Mon Sep 17 00:00:00 2001 From: Rui Qiao <161574667+ruisearch42@users.noreply.github.com> Date: Sun, 5 Jan 2025 23:59:16 -0800 Subject: [PATCH 086/462] [V1] Refactor get_executor_cls (#11754) --- tests/v1/engine/test_engine_core.py | 6 +++--- tests/v1/engine/test_engine_core_client.py | 6 +++--- vllm/v1/engine/async_llm.py | 21 +-------------------- vllm/v1/engine/llm_engine.py | 20 +------------------- vllm/v1/executor/abstract.py | 19 ++++++++++++++++++- 5 files changed, 26 insertions(+), 46 deletions(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 954cec734b956..8dd9b23fbdd5f 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -8,8 +8,8 @@ from vllm.engine.arg_utils import EngineArgs from vllm.platforms import current_platform from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core import EngineCore +from vllm.v1.executor.abstract import Executor if not current_platform.is_cuda(): pytest.skip(reason="V1 currently only supported on CUDA.", @@ -43,7 +43,7 @@ def test_engine_core(monkeypatch): """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) vllm_config = engine_args.create_engine_config() - executor_class = AsyncLLM._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, executor_class=executor_class) @@ -149,7 +149,7 @@ def test_engine_core_advanced_sampling(monkeypatch): """Setup the EngineCore.""" engine_args = EngineArgs(model=MODEL_NAME) vllm_config = engine_args.create_engine_config() - executor_class = AsyncLLM._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) engine_core = EngineCore(vllm_config=vllm_config, executor_class=executor_class) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 20d4e6f63b339..5a21806e57a11 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -11,8 +11,8 @@ from vllm.platforms import current_platform from vllm.usage.usage_lib import UsageContext from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine.async_llm import AsyncLLM from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.executor.abstract import Executor if not current_platform.is_cuda(): pytest.skip(reason="V1 currently only supported on CUDA.", @@ -84,7 +84,7 @@ def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): engine_args = EngineArgs(model=MODEL_NAME, compilation_config=3) vllm_config = engine_args.create_engine_config( UsageContext.UNKNOWN_CONTEXT) - executor_class = AsyncLLM._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) client = EngineCoreClient.make_client( multiprocess_mode=multiprocessing_mode, asyncio_mode=False, @@ -152,7 +152,7 @@ async def test_engine_core_client_asyncio(monkeypatch): engine_args = EngineArgs(model=MODEL_NAME) vllm_config = engine_args.create_engine_config( usage_context=UsageContext.UNKNOWN_CONTEXT) - executor_class = AsyncLLM._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) client = EngineCoreClient.make_client( multiprocess_mode=True, asyncio_mode=True, diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 0696caf88385d..b963ba74f13f0 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -22,7 +22,6 @@ from vllm.v1.engine.detokenizer import Detokenizer from vllm.v1.engine.processor import Processor from vllm.v1.executor.abstract import Executor -from vllm.v1.executor.ray_utils import initialize_ray_cluster logger = init_logger(__name__) @@ -105,7 +104,7 @@ def from_engine_args( else: vllm_config = engine_config - executor_class = cls._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) # Create the AsyncLLM. return cls( @@ -127,24 +126,6 @@ def shutdown(self): if handler := getattr(self, "output_handler", None): handler.cancel() - @classmethod - def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: - executor_class: Type[Executor] - distributed_executor_backend = ( - vllm_config.parallel_config.distributed_executor_backend) - if distributed_executor_backend == "ray": - initialize_ray_cluster(vllm_config.parallel_config) - from vllm.v1.executor.ray_executor import RayExecutor - executor_class = RayExecutor - elif distributed_executor_backend == "mp": - from vllm.v1.executor.multiproc_executor import MultiprocExecutor - executor_class = MultiprocExecutor - else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor - return executor_class - async def add_request( self, request_id: str, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 0bd9b52c9be82..8ced3a34d2da3 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -89,7 +89,7 @@ def from_engine_args( # Create the engine configs. vllm_config = engine_args.create_engine_config(usage_context) - executor_class = cls._get_executor_cls(vllm_config) + executor_class = Executor.get_class(vllm_config) if VLLM_ENABLE_V1_MULTIPROCESSING: logger.debug("Enabling multiprocessing for LLMEngine.") @@ -103,24 +103,6 @@ def from_engine_args( stat_loggers=stat_loggers, multiprocess_mode=enable_multiprocessing) - @classmethod - def _get_executor_cls(cls, vllm_config: VllmConfig) -> Type[Executor]: - executor_class: Type[Executor] - distributed_executor_backend = ( - vllm_config.parallel_config.distributed_executor_backend) - if distributed_executor_backend == "ray": - from vllm.v1.executor.ray_executor import RayExecutor - executor_class = RayExecutor - elif distributed_executor_backend == "mp": - from vllm.v1.executor.multiproc_executor import MultiprocExecutor - executor_class = MultiprocExecutor - else: - assert (distributed_executor_backend is None) - from vllm.v1.executor.uniproc_executor import UniprocExecutor - executor_class = UniprocExecutor - - return executor_class - def get_num_unfinished_requests(self) -> int: return self.detokenizer.get_num_unfinished_requests() diff --git a/vllm/v1/executor/abstract.py b/vllm/v1/executor/abstract.py index 564d0447f15a6..5d74d4b01f500 100644 --- a/vllm/v1/executor/abstract.py +++ b/vllm/v1/executor/abstract.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Tuple +from typing import Tuple, Type from vllm.config import VllmConfig from vllm.v1.outputs import ModelRunnerOutput @@ -8,6 +8,23 @@ class Executor(ABC): """Abstract class for executors.""" + @staticmethod + def get_class(vllm_config: VllmConfig) -> Type["Executor"]: + executor_class: Type[Executor] + distributed_executor_backend = ( + vllm_config.parallel_config.distributed_executor_backend) + if distributed_executor_backend == "ray": + from vllm.v1.executor.ray_executor import RayExecutor + executor_class = RayExecutor + elif distributed_executor_backend == "mp": + from vllm.v1.executor.multiproc_executor import MultiprocExecutor + executor_class = MultiprocExecutor + else: + assert (distributed_executor_backend is None) + from vllm.v1.executor.uniproc_executor import UniprocExecutor + executor_class = UniprocExecutor + return executor_class + @abstractmethod def __init__(self, vllm_config: VllmConfig) -> None: raise NotImplementedError From 9c749713f6990a9f9d12e526d9bfc2669dfa8ee6 Mon Sep 17 00:00:00 2001 From: Lucas Tucker <47258766+lucas-tucker@users.noreply.github.com> Date: Mon, 6 Jan 2025 01:59:36 -0600 Subject: [PATCH 087/462] [mypy] Forward pass function type hints in lora (#11740) Signed-off-by: lucast2021 Co-authored-by: lucast2021 --- vllm/lora/layers.py | 12 +++++++++--- vllm/lora/models.py | 3 ++- vllm/model_executor/layers/linear.py | 4 +++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 102e40d3f448d..a933ccaecf15e 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -405,7 +405,9 @@ def __init__(self, base_layer: ReplicatedLinear) -> None: self.output_size = self.base_layer.output_size self.n_slices = 1 - def forward(self, input_): + def forward( + self, input_: torch.Tensor + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of ReplicatedLinearWithLoRA Args: @@ -496,7 +498,9 @@ def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: bias = bias[start_idx:end_idx] return bias - def forward(self, input_): + def forward( + self, input_: torch.Tensor + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of ColumnParallelLinear Args: @@ -833,7 +837,9 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: return bias - def forward(self, input_): + def forward( + self, input_: torch.Tensor + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: """Forward of RowParallelLinear Args: diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 9cfcc6bba727f..5b7225bdc8f37 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Sequence, Type +from typing import Any, Callable, Dict, List, Optional, Sequence, Type, Union import safetensors.torch import torch @@ -219,6 +219,7 @@ def from_local_checkpoint( config["vllm_max_position_embeddings"] = max_position_embeddings peft_helper = PEFTHelper.from_dict(config) + unexpected_modules: List[Union[list[str], str]] if os.path.isfile(lora_tensor_path): tensors: Dict[str, torch.Tensor] = {} # Find unexpected modules. diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 33b221b994b2b..48cfb1b221720 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -238,7 +238,9 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): assert param.size() == loaded_weight.size() param.data.copy_(loaded_weight) - def forward(self, x: torch.Tensor) -> torch.Tensor: + def forward( + self, x: torch.Tensor + ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]: bias = self.bias if not self.skip_bias_add else None assert self.quant_method is not None output = self.quant_method.apply(self, x, bias) From 2a622d704a4270c8d6fab057e8a545ed86ac35b7 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Mon, 6 Jan 2025 00:01:22 -0800 Subject: [PATCH 088/462] k8s-config: Update the secret to use stringData (#11679) Signed-off-by: Suraj Deshmukh --- docs/source/serving/deploying_with_k8s.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/serving/deploying_with_k8s.md b/docs/source/serving/deploying_with_k8s.md index 77f848088ea43..5f9b0e4f55ecc 100644 --- a/docs/source/serving/deploying_with_k8s.md +++ b/docs/source/serving/deploying_with_k8s.md @@ -43,7 +43,7 @@ metadata: name: hf-token-secret namespace: default type: Opaque -data: +stringData: token: "REPLACE_WITH_TOKEN" ``` From 996357e4808ca5eab97d4c97c7d25b3073f46aab Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 6 Jan 2025 16:02:21 +0800 Subject: [PATCH 089/462] [VLM] Separate out profiling-related logic (#11746) Signed-off-by: DarkLight1337 --- tests/multimodal/test_processing.py | 7 +- vllm/model_executor/models/aria.py | 79 +++--- vllm/model_executor/models/blip2.py | 78 +++--- vllm/model_executor/models/chameleon.py | 72 +++--- vllm/model_executor/models/fuyu.py | 85 ++++--- vllm/model_executor/models/llava.py | 181 +++++++++----- vllm/model_executor/models/llava_next.py | 75 +++--- .../model_executor/models/llava_next_video.py | 148 ++++++----- vllm/model_executor/models/llava_onevision.py | 174 +++++++------ vllm/model_executor/models/phi3v.py | 104 ++++---- vllm/model_executor/models/qwen2_audio.py | 96 +++++--- vllm/model_executor/models/qwen2_vl.py | 231 +++++++++++------- vllm/model_executor/models/ultravox.py | 91 ++++--- vllm/model_executor/models/vision.py | 37 +-- vllm/multimodal/processing.py | 152 ++++-------- vllm/multimodal/profiling.py | 121 +++++++++ vllm/multimodal/registry.py | 2 +- 17 files changed, 1015 insertions(+), 718 deletions(-) create mode 100644 vllm/multimodal/profiling.py diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index b32faa699ebf2..75d878217b657 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -586,9 +586,10 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): ) processor = processor_factory(ctx, cache=None) + profiler = processor.profiling_info mock_supported_mm_limits = MagicMock(return_value={"image": num_supported}) - processor.get_supported_mm_limits = mock_supported_mm_limits + profiler.get_supported_mm_limits = mock_supported_mm_limits if is_valid: exc_ctx = nullcontext() @@ -596,7 +597,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): exc_ctx = pytest.raises(ValueError, match="this model only supports") with exc_ctx: - processor._get_and_validate_dummy_mm_counts() + profiler.get_mm_limits() @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"]) @@ -723,7 +724,7 @@ def _test_processing_cache_correctness( } mm_counts = {k: len(vs) for k, vs in mm_data.items()} - prompt = baseline_processor._get_dummy_processor_inputs( + prompt = baseline_processor.profiling_info.get_dummy_processor_inputs( model_config.max_model_len, mm_counts, ).prompt_text diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 8f5fd64a90c87..2e649f10c0765 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -24,8 +24,9 @@ from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, NestedTensors) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.aria import (AriaMoELMConfig, AriaVisionConfig) @@ -444,18 +445,58 @@ def build_mm_projector(config: PretrainedConfig): ) -class AriaMultiModalProcessor(BaseMultiModalProcessor): +class AriaProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} + def _get_hf_config(self): + return self.ctx.get_hf_config() + + def _get_vision_config(self) -> AriaVisionConfig: + return self._get_hf_config().vision_config def _get_num_image_tokens(self) -> int: - hf_config = self.ctx.get_hf_config() + hf_config = self._get_hf_config() return max(hf_config.projector_patch_to_query_dict.values()) + +class AriaProfilingInfo(AriaProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + vision_config = self._get_vision_config() + + max_image_size = vision_config.image_size + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + hf_processor = self._get_hf_processor() + image_token: str = hf_processor.image_token # type: ignore + + return ProcessorInputs( + prompt_text=image_token * num_images, + mm_data=mm_data, + ) + + +class AriaMultiModalProcessor(AriaProcessingMixin, BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return AriaProfilingInfo(self.ctx) + def _get_mm_fields_config( self, hf_inputs: BatchFeature, @@ -472,7 +513,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config() + hf_config = self._get_hf_config() image_token_id = hf_config.image_token_index num_image_tokens = self._get_num_image_tokens() @@ -485,32 +526,6 @@ def _get_prompt_replacements( ) ] - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_config = self.ctx.get_hf_config() - vision_config: AriaVisionConfig = hf_config.vision_config - - max_image_size = vision_config.image_size - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=max_image_size, - height=max_image_size, - num_images=num_images) - } - - hf_processor = self._get_hf_processor() - image_token: str = hf_processor.image_token # type: ignore - - return ProcessorInputs( - prompt_text=image_token * num_images, - mm_data=mm_data, - ) - @MULTIMODAL_REGISTRY.register_processor(AriaMultiModalProcessor) class AriaForConditionalGeneration(nn.Module, SupportsMultiModal): diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index b3ecb2f22dc19..fd45783f167b4 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -4,8 +4,8 @@ import torch import torch.nn as nn -from transformers import (BatchFeature, Blip2Config, Blip2Processor, - Blip2QFormerConfig, apply_chunking_to_forward) +from transformers import (BatchFeature, Blip2Config, Blip2QFormerConfig, + apply_chunking_to_forward) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, VllmConfig @@ -18,8 +18,9 @@ MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from .blip import BlipVisionModel @@ -396,20 +397,52 @@ def forward( return sequence_output -class Blip2MultiModalProcessor(BaseMultiModalProcessor): +class Blip2ProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} + def _get_hf_config(self): + return self.ctx.get_hf_config(Blip2Config) def _get_num_image_tokens(self) -> int: - hf_config = self.ctx.get_hf_config(Blip2Config) + hf_config = self._get_hf_config() return hf_config.num_query_tokens + +class Blip2ProfilingInfo(Blip2ProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} - def _get_hf_processor(self) -> Blip2Processor: - return self.ctx.get_hf_processor(Blip2Processor) + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + hf_config = self._get_hf_config() + vision_config = hf_config.vision_config + + max_image_size = vision_config.image_size + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=max_image_size, + height=max_image_size, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + +class Blip2MultiModalProcessor(Blip2ProcessingMixin, BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return Blip2ProfilingInfo(self.ctx) def _get_mm_fields_config( self, @@ -427,13 +460,13 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - max_image_tokens = self._get_num_image_tokens() + num_image_tokens = self._get_num_image_tokens() return [ PromptReplacement( modality="image", target="", - replacement="" * max_image_tokens + "", + replacement="" * num_image_tokens + "", ) ] @@ -457,29 +490,6 @@ def apply( return result - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - hf_config = self.ctx.get_hf_config(Blip2Config) - vision_config = hf_config.vision_config - - max_image_size = vision_config.image_size - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=max_image_size, - height=max_image_size, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="", - mm_data=mm_data, - ) - @MULTIMODAL_REGISTRY.register_processor(Blip2MultiModalProcessor) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 1ad44678a591d..73ed73b61ebf9 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -31,8 +31,9 @@ MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import print_warning_once @@ -48,20 +49,55 @@ class ChameleonImagePixelInputs(TypedDict): """Shape: `(batch_size * num_images, num_channels, height, width)`""" -class ChameleonMultiModalProcessor(BaseMultiModalProcessor): +class ChameleonProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} + def _get_hf_config(self): + return self.ctx.get_hf_config(ChameleonConfig) + + def _get_hf_processor(self): + return self.ctx.get_hf_processor(ChameleonProcessor) def _get_num_image_tokens(self) -> int: processor = self._get_hf_processor() return processor.image_seq_length + +class ChameleonProfilingInfo(ChameleonProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": self._get_num_image_tokens()} - def _get_hf_processor(self) -> ChameleonProcessor: - return self.ctx.get_hf_processor(ChameleonProcessor) + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + config = self._get_hf_config() + + width = height = config.vq_config.resolution + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=width, + height=height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="" * num_images, + mm_data=mm_data, + ) + + +class ChameleonMultiModalProcessor(ChameleonProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return ChameleonProfilingInfo(self.ctx) def _get_mm_fields_config( self, @@ -76,7 +112,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - processor = self._get_hf_processor() + processor = self._get_hf_processor(**hf_processor_mm_kwargs) return [ PromptReplacement( @@ -90,28 +126,6 @@ def _get_prompt_replacements( ) ] - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - config = self.ctx.get_hf_config(ChameleonConfig) - - width = height = config.vq_config.resolution - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=width, - height=height, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="" * num_images, - mm_data=mm_data, - ) - def apply( self, prompt_text: str, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7cd58fbc7cf21..c937fcb0978b9 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -35,8 +35,9 @@ NestedTensors, PlaceholderRange) from vllm.multimodal.parse import ImageProcessorItems, ImageSize from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -63,18 +64,16 @@ class FuyuImagePatchInputs(TypedDict): """ -class FuyuMultiModalProcessor(BaseMultiModalProcessor): +class FuyuProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": 1} + def _get_hf_config(self): + return self.ctx.get_hf_config(FuyuConfig) - def _get_image_target_size(self) -> ImageSize: - processor = self._get_hf_processor() - image_processor: FuyuImageProcessor = processor.image_processor + def _get_hf_processor(self): + return self.ctx.get_hf_processor(FuyuProcessor) - target_size = image_processor.size - return ImageSize(width=target_size["width"], - height=target_size["height"]) + def _get_image_processor(self) -> FuyuImageProcessor: + return self._get_hf_processor().image_processor def _get_image_feature_grid_size( self, @@ -82,7 +81,9 @@ def _get_image_feature_grid_size( image_width: int, image_height: int, ) -> tuple[int, int]: - target_width, target_height = self._get_image_target_size() + image_processor = self._get_image_processor() + target_width = image_processor.size["width"] + target_height = image_processor.size["height"] if not (image_width <= target_width and image_height <= target_height): height_scale_factor = target_height / image_height @@ -96,8 +97,14 @@ def _get_image_feature_grid_size( nrows = math.ceil(image_height / 30) return ncols, nrows + +class FuyuProfilingInfo(FuyuProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": 1} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - target_width, target_height = self._get_image_target_size() + target_width, target_height = self._get_image_size_with_most_features() max_ncols, max_nrows = self._get_image_feature_grid_size( image_width=target_width, @@ -107,8 +114,36 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"image": max_image_tokens} - def _get_hf_processor(self) -> FuyuProcessor: - return self.ctx.get_hf_processor(FuyuProcessor) + def _get_image_size_with_most_features(self) -> ImageSize: + image_processor = self._get_image_processor() + return ImageSize(width=image_processor.size["width"], + height=image_processor.size["height"]) + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + target_width, target_height = self._get_image_size_with_most_features() + num_images = mm_counts.get("image", 0) + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + return ProcessorInputs( + prompt_text="", + mm_data=mm_data, + ) + + +class FuyuMultiModalProcessor(FuyuProcessingMixin, BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return FuyuProfilingInfo(self.ctx) def _call_hf_processor( self, @@ -161,7 +196,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config(FuyuConfig) + hf_config = self._get_hf_config() bos_token_id = hf_config.bos_token_id tokenizer = self._get_tokenizer() @@ -208,26 +243,6 @@ def apply( return result - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - target_width, target_height = self._get_image_target_size() - num_images = mm_counts.get("image", 0) - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images) - } - - return ProcessorInputs( - prompt_text="", - mm_data=mm_data, - ) - @MULTIMODAL_REGISTRY.register_processor(FuyuMultiModalProcessor) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index d522378e0bebb..4299af8cd03a2 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,4 +1,4 @@ -from abc import abstractmethod +from abc import ABC, abstractmethod from functools import cached_property from typing import (Final, Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, Union) @@ -13,6 +13,7 @@ from vllm.attention import AttentionMetadata from vllm.config import VllmConfig +from vllm.inputs import InputProcessingContext from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) @@ -25,9 +26,10 @@ NestedTensors) from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, ImageSize) -from vllm.multimodal.processing import (InputProcessingContext, +from vllm.multimodal.processing import (BaseMultiModalProcessor, MultiModalDataItems, ProcessingCache, - ProcessorInputs, PromptReplacement) + ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel @@ -37,7 +39,7 @@ from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import BaseVisionLanguageMultiModalProcessor +from .vision import get_vision_encoder_info class LlavaImagePixelInputs(TypedDict): @@ -94,30 +96,42 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class LlavaLikeConfig(Protocol): vision_config: Final[PretrainedConfig] + image_token_index: Final[int] vision_feature_select_strategy: Final[str] - vision_feature_layer: Final[Union[int, List[int]]] + vision_feature_layer: Final[Union[int, list[int]]] -class BaseLlavaMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): +class LlavaLikeProcessor(Protocol): + image_token: Final[str] + + +class BaseLlavaProcessingMixin(ProcessingMixin, ABC): - @abstractmethod def _get_hf_config(self) -> LlavaLikeConfig: - raise NotImplementedError + return self.ctx.get_hf_config(LlavaConfig) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} + def _get_vision_encoder_info(self): + return get_vision_encoder_info(self._get_hf_config()) - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - return {"image": self._get_max_image_tokens()} + @abstractmethod + def _get_hf_processor(self) -> LlavaLikeProcessor: + raise NotImplementedError - def _get_mm_fields_config( + def _get_num_image_tokens( self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), + *, + image_width: int, + image_height: int, + ) -> int: + hf_config = self._get_hf_config() + vision_encoder_info = self._get_vision_encoder_info() + + return self._apply_feature_select_strategy( + hf_config.vision_feature_select_strategy, + vision_encoder_info.get_num_image_tokens( + image_width=image_width, + image_height=image_height, + ), ) def _apply_feature_select_strategy( @@ -133,31 +147,38 @@ def _apply_feature_select_strategy( msg = f"Unexpected feature select strategy: {strategy!r}" raise NotImplementedError(msg) - def _get_max_image_tokens(self) -> int: - hf_config = self._get_hf_config() - return self._apply_feature_select_strategy( - hf_config.vision_feature_select_strategy, - self._vision_encoder_info.get_max_image_tokens(), - ) +class BaseLlavaProfilingInfo(BaseLlavaProcessingMixin, BaseProfilingInfo): - def _get_dummy_image_size(self) -> ImageSize: - image_size = self._vision_encoder_info.get_image_size() - return ImageSize(image_size, image_size) + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} - @abstractmethod - def _get_image_token(self) -> str: - raise NotImplementedError + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return {"image": self._get_max_image_tokens()} + + def _get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self._get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) - def _get_dummy_processor_inputs( + def _get_max_image_tokens(self) -> int: + target_width, target_height = self._get_image_size_with_most_features() + + return self._get_num_image_tokens( + image_width=target_width, + image_height=target_height, + ) + + def get_dummy_processor_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> ProcessorInputs: num_images = mm_counts.get("image", 0) - image_token = self._get_image_token() - target_width, target_height = self._get_dummy_image_size() + processor = self._get_hf_processor() + image_token = processor.image_token + target_width, target_height = self._get_image_size_with_most_features() mm_data = { "image": @@ -172,32 +193,32 @@ def _get_dummy_processor_inputs( ) -class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): - - def _get_hf_config(self) -> LlavaConfig: - return self.ctx.get_hf_config(LlavaConfig) +class LlavaProcessingMixin(BaseLlavaProcessingMixin): - def _get_hf_processor(self) -> LlavaProcessor: + def _get_hf_processor(self): return self.ctx.get_hf_processor(LlavaProcessor) - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token - def _get_num_image_tokens( - self, - *, - image_width: int, - image_height: int, - ) -> int: - hf_config = self._get_hf_config() +class LlavaProfilingInfo(LlavaProcessingMixin, BaseLlavaProfilingInfo): + pass - return self._apply_feature_select_strategy( - hf_config.vision_feature_select_strategy, - self._vision_encoder_info.get_num_image_tokens( - image_width=image_width, - image_height=image_height, - ), - ) + +class BaseLlavaMultiModalProcessor(LlavaProcessingMixin, + BaseMultiModalProcessor): + + # Copied from BaseMultiModalProcessor + @abstractmethod + def _get_profiling_info(self) -> BaseProfilingInfo: + raise NotImplementedError + + # Copied from BaseMultiModalProcessor + @abstractmethod + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + raise NotImplementedError def _get_prompt_replacements( self, @@ -232,16 +253,37 @@ def get_replacement(item_idx: int): ] -class PixtralHFMultiModalProcessor(BaseLlavaMultiModalProcessor): +class LlavaMultiModalProcessor(BaseLlavaMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return LlavaProfilingInfo(self.ctx) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) - def _get_hf_config(self) -> LlavaConfig: - return self.ctx.get_hf_config(LlavaConfig) - def _get_hf_processor(self) -> PixtralProcessor: +class PixtralHFProcessingMixin(BaseLlavaProcessingMixin): + + def _get_hf_processor(self): return self.ctx.get_hf_processor(PixtralProcessor) - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token + +class PixtralHFProfilingInfo(PixtralHFProcessingMixin, BaseLlavaProfilingInfo): + pass + + +class PixtralHFMultiModalProcessor(PixtralHFProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return PixtralHFProfilingInfo(self.ctx) def _call_hf_processor( self, @@ -270,6 +312,16 @@ def _call_hf_processor( return processed_outputs + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) + def _get_prompt_replacements( self, mm_items: MultiModalDataItems, @@ -316,7 +368,7 @@ def _build_llava_or_pixtral_hf_processor( *, cache: Optional[ProcessingCache] = None, enable_sanity_checks: bool = True, -) -> BaseLlavaMultiModalProcessor: +) -> BaseMultiModalProcessor: hf_config = ctx.get_hf_config(LlavaConfig) if isinstance(hf_config.vision_config, PixtralVisionConfig): @@ -663,16 +715,13 @@ def load_weights(self, weights: Iterable[Tuple[str, class MantisMultiModalProcessor(LlavaMultiModalProcessor): - def _get_hf_processor(self): - return self.ctx.get_hf_processor(LlavaProcessor) - def apply( self, prompt_text: str, mm_data: MultiModalDataDict, hf_processor_mm_kwargs: Mapping[str, object], ) -> MultiModalInputsV2: - hf_config = self.ctx.get_hf_config(LlavaConfig) + hf_config = self._get_hf_config() image_token_id = hf_config.image_token_index # Assume that it doesn't depend on the image size diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index f79021596f915..c76ec164a3087 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,6 +1,6 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, Union) import numpy as np import torch @@ -17,12 +17,14 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalFieldConfig, NestedTensors from vllm.multimodal.parse import ImageSize +from vllm.multimodal.profiling import BaseProfilingInfo from vllm.sequence import IntermediateTensors from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import (LlavaMultiModalProcessor, LlavaMultiModalProjector, - init_vision_tower_for_llava) +from .llava import (BaseLlavaMultiModalProcessor, BaseLlavaProcessingMixin, + BaseLlavaProfilingInfo, LlavaLikeConfig, + LlavaMultiModalProjector, init_vision_tower_for_llava) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, init_vllm_registered_model, maybe_prefix) @@ -60,35 +62,17 @@ class LlavaNextImageEmbeddingInputs(TypedDict): LlavaNextImageEmbeddingInputs] -class LlavaNextMultiModalProcessor(LlavaMultiModalProcessor): +class LlavaNextLikeConfig(LlavaLikeConfig, Protocol): + image_grid_pinpoints: Final[list[list[int]]] - def _get_hf_config(self) -> LlavaNextConfig: - return self.ctx.get_hf_config(LlavaNextConfig) - def _get_hf_processor(self) -> LlavaNextProcessor: - return self.ctx.get_hf_processor(LlavaNextProcessor) +class LlavaNextProcessingMixin(BaseLlavaProcessingMixin): - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_sizes=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - ) - - def _get_image_token(self) -> str: - return self._get_hf_processor().image_token - - def _get_max_image_tokens(self) -> int: - largest_feature_size, _ = self._get_pinpoint_with_most_features() - return largest_feature_size + def _get_hf_config(self) -> LlavaNextLikeConfig: + return self.ctx.get_hf_config(LlavaNextConfig) - def _get_dummy_image_size(self) -> ImageSize: - _, pinpoint = self._get_pinpoint_with_most_features() - return pinpoint + def _get_hf_processor(self): + return self.ctx.get_hf_processor(LlavaNextProcessor) # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.2.0/server/text_generation_server/models/vlm_causal_lm.py#L106 def _get_num_image_tokens( @@ -98,7 +82,7 @@ def _get_num_image_tokens( image_height: int, ) -> int: hf_config = self._get_hf_config() - vision_encoder_info = self._vision_encoder_info + vision_encoder_info = self._get_vision_encoder_info() base_feature_size = self._apply_feature_select_strategy( hf_config.vision_feature_select_strategy, @@ -140,7 +124,7 @@ def _get_num_unpadded_features( current_height = npatches * num_patch_height current_width = npatches * num_patch_width - # NOTE: HF resizes based on float32 + # NOTE: Use float32 to remain consistent with HF output original_aspect_ratio = np.array(original_width / original_height, dtype=np.float32) current_aspect_ratio = np.array(current_width / current_height, @@ -164,11 +148,10 @@ def _get_num_unpadded_features( return (unpadded_features, newline_features) - def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: - """ - Get the grid pinpoint with the most features and - the corresponding feature size. - """ + +class LlavaNextProfilingInfo(LlavaNextProcessingMixin, BaseLlavaProfilingInfo): + + def _get_image_size_with_most_features(self) -> ImageSize: hf_config = self._get_hf_config() largest_feature_size, largest_feature_pinpoint = 0, None @@ -183,7 +166,25 @@ def _get_pinpoint_with_most_features(self) -> tuple[int, ImageSize]: if largest_feature_size == 0 or largest_feature_pinpoint is None: raise ValueError("Cannot have a largest feature size of 0!") - return largest_feature_size, largest_feature_pinpoint + return largest_feature_pinpoint + + +class LlavaNextMultiModalProcessor(LlavaNextProcessingMixin, + BaseLlavaMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return LlavaNextProfilingInfo(self.ctx) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + ) @MULTIMODAL_REGISTRY.register_processor(LlavaNextMultiModalProcessor) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index ee6b89f0d4498..6e82cee1c95a4 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -15,11 +15,14 @@ from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, - VideoEmbeddingItems, VideoProcessorItems) -from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs, +from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs, + NestedTensors) +from vllm.multimodal.parse import (ImageSize, VideoEmbeddingItems, + VideoProcessorItems) +from vllm.multimodal.processing import (BaseMultiModalProcessor, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -28,7 +31,7 @@ from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) -from .vision import BaseVisionLanguageMultiModalProcessor +from .vision import get_vision_encoder_info class LlavaNextVideoPixelInputs(TypedDict): @@ -44,29 +47,16 @@ class LlavaNextVideoPixelInputs(TypedDict): """ -class LlavaNextVideoMultiModalProcessor(BaseVisionLanguageMultiModalProcessor): +class LlavaNextVideoProcessingMixin(ProcessingMixin): - def _get_hf_config(self) -> LlavaNextVideoConfig: + def _get_hf_config(self): return self.ctx.get_hf_config(LlavaNextVideoConfig) - def _get_hf_processor(self) -> LlavaNextVideoProcessor: - return self.ctx.get_hf_processor(LlavaNextVideoProcessor) - - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"video": 1} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - num_frames = self._get_dummy_num_frames(seq_len) - max_video_tokens = self._get_max_video_tokens(num_frames) - - return {"video": max_video_tokens} + def _get_vision_encoder_info(self): + return get_vision_encoder_info(self._get_hf_config()) - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict(pixel_values_videos=MultiModalFieldConfig.batched("video")) + def _get_hf_processor(self): + return self.ctx.get_hf_processor(LlavaNextVideoProcessor) def _get_num_frame_tokens( self, @@ -77,7 +67,8 @@ def _get_num_frame_tokens( hf_config = self._get_hf_config() spatial_pool_stride = hf_config.spatial_pool_stride - patch_grid_length = self._vision_encoder_info.get_patch_grid_length() + vision_encoder_info = self._get_vision_encoder_info() + patch_grid_length = vision_encoder_info.get_patch_grid_length() pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) return pooled_grid_length * pooled_grid_length @@ -96,18 +87,43 @@ def _get_num_video_tokens( return num_frame_tokens * num_frames - def _get_max_video_tokens(self, num_frames: int) -> int: - return self._get_num_video_tokens(image_width=999999, - image_height=999999, - num_frames=num_frames) + +class LlavaNextVideoProfilingInfo(LlavaNextVideoProcessingMixin, + BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"video": 1} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self._get_image_size_with_most_features() + + max_video_tokens = self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + ) + + return {"video": max_video_tokens} + + def _get_image_size_with_most_features(self) -> ImageSize: + vision_encoder_info = self._get_vision_encoder_info() + width = height = vision_encoder_info.get_image_size() + return ImageSize(width=width, height=height) def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + num_frames = 0 while True: next_num_frames = num_frames + 1 + next_max_tokens = self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + ) - if self._get_max_video_tokens(next_num_frames) > max_tokens: + if next_max_tokens > max_tokens: break num_frames = next_num_frames @@ -122,12 +138,45 @@ def _get_dummy_num_frames(self, seq_len: int) -> int: return max(max_total_frames // max(max_videos, 1), 1) - def _get_dummy_image_size(self) -> ImageSize: - image_size = self._vision_encoder_info.get_image_size() - return ImageSize(image_size, image_size) + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_videos = mm_counts.get("video", 0) + + processor = self._get_hf_processor() + video_token = processor.video_token + target_width, target_height = self._get_image_size_with_most_features() + + mm_data = { + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) + } + + return ProcessorInputs( + prompt_text=video_token * num_videos, + mm_data=mm_data, + ) + - def _get_video_token(self) -> str: - return self._get_hf_processor().video_token +class LlavaNextVideoMultiModalProcessor(LlavaNextVideoProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return LlavaNextVideoProfilingInfo(self.ctx) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values_videos=MultiModalFieldConfig.batched("video")) def _get_prompt_replacements( self, @@ -162,36 +211,11 @@ def get_replacement(item_idx: int): ), ] - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - num_videos = mm_counts.get("video", 0) - - video_token = self._get_video_token() - target_width, target_height = self._get_dummy_image_size() - - mm_data = { - "video": - self._get_dummy_videos( - width=target_width, - height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), - num_videos=num_videos, - ) - } - - return ProcessorInputs( - prompt_text=video_token * num_videos, - mm_data=mm_data, - ) - # adopted from transformers modeling_llava_next_video.py class LlavaNextVideoPooler(nn.Module): - def __init__(self, config): + def __init__(self, config: LlavaNextVideoConfig): super().__init__() mode = config.spatial_pool_mode @@ -209,7 +233,7 @@ def __init__(self, config): raise ValueError( f"Unknown pooling mode: {mode}. Expected [`average`, `max`]") - def forward(self, image_features): + def forward(self, image_features: torch.Tensor): ori_width = int( math.sqrt(image_features.shape[1] * self.image_size // self.image_size)) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 5a3cdadc47cac..6dccc1e0d3b8d 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,7 +1,7 @@ import math from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Final, Iterable, List, Literal, Mapping, Optional, + Protocol, Set, Tuple, TypedDict, Union) import numpy as np import torch @@ -21,15 +21,16 @@ from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, VideoProcessorItems) -from vllm.multimodal.processing import (MultiModalFieldConfig, ProcessorInputs, - PromptReplacement) +from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP -from .llava import init_vision_tower_for_llava -from .llava_next import LlavaNextMultiModalProcessor +from .llava import BaseLlavaProfilingInfo, init_vision_tower_for_llava +from .llava_next import (LlavaNextLikeConfig, LlavaNextMultiModalProcessor, + LlavaNextProcessingMixin) from .siglip import SiglipVisionModel from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings) @@ -82,39 +83,17 @@ class LlavaOnevisionImageEmbeddingInputs(TypedDict): LlavaOnevisionVideoPixelInputs] -class LlavaOnevisionMultiModalProcessor(LlavaNextMultiModalProcessor): +class LlavaOnevisionLikeConfig(LlavaNextLikeConfig, Protocol): + video_token_index: Final[int] - def _get_hf_config(self) -> LlavaOnevisionConfig: - return self.ctx.get_hf_config(LlavaOnevisionConfig) - - def _get_hf_processor(self) -> LlavaOnevisionProcessor: - return self.ctx.get_hf_processor(LlavaOnevisionProcessor) - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None} - - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - max_image_tokens = self._get_max_image_tokens() +class LlavaOnevisionProcessingMixin(LlavaNextProcessingMixin): - num_frames = self._get_dummy_num_frames(seq_len) - max_video_tokens = self._get_max_video_tokens(num_frames) - - return { - "image": max_image_tokens, - "video": max_video_tokens, - } + def _get_hf_config(self) -> LlavaOnevisionLikeConfig: + return self.ctx.get_hf_config(LlavaOnevisionConfig) - def _get_mm_fields_config( - self, - hf_inputs: BatchFeature, - hf_processor_mm_kwargs: Mapping[str, object], - ) -> Mapping[str, MultiModalFieldConfig]: - return dict( - pixel_values=MultiModalFieldConfig.batched("image"), - image_sizes=MultiModalFieldConfig.batched("image"), - image_embeds=MultiModalFieldConfig.batched("image"), - pixel_values_videos=MultiModalFieldConfig.batched("video"), - ) + def _get_hf_processor(self): + return self.ctx.get_hf_processor(LlavaOnevisionProcessor) def _get_num_unpadded_features( self, @@ -128,7 +107,7 @@ def _get_num_unpadded_features( current_height = npatches * num_patch_height current_width = npatches * num_patch_width - # NOTE: HF resizes based on float32 + # NOTE: Use float32 to remain consistent with HF output original_aspect_ratio = np.array(original_width / original_height, dtype=np.float32) current_aspect_ratio = np.array(current_width / current_height, @@ -167,7 +146,8 @@ def _get_num_frame_tokens( hf_config = self._get_hf_config() spatial_pool_stride = getattr(hf_config, "spatial_pool_stride", 2) - patch_grid_length = self._vision_encoder_info.get_patch_grid_length() + vision_encoder_info = self._get_vision_encoder_info() + patch_grid_length = vision_encoder_info.get_patch_grid_length() pooled_grid_length = math.ceil(patch_grid_length / spatial_pool_stride) return pooled_grid_length * pooled_grid_length @@ -186,18 +166,33 @@ def _get_num_video_tokens( return num_frame_tokens * num_frames + 1 # Newline token - def _get_max_video_tokens(self, num_frames: int) -> int: - return self._get_num_video_tokens(image_width=999999, - image_height=999999, - num_frames=num_frames) + +class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin, + BaseLlavaProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self._get_max_image_tokens(), + "video": self._get_max_video_tokens(seq_len), + } def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + num_frames = 0 while True: next_num_frames = num_frames + 1 + next_max_tokens = self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + ) - if self._get_max_video_tokens(next_num_frames) > max_tokens: + if next_max_tokens > max_tokens: break num_frames = next_num_frames @@ -215,8 +210,65 @@ def _get_dummy_num_frames(self, seq_len: int) -> int: return max(max_total_frames // max(max_videos, 1), 1) - def _get_video_token(self) -> str: - return self._get_hf_processor().video_token + def _get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + + return self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + ) + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + processor = self._get_hf_processor() + image_token = processor.image_token + video_token = processor.video_token + target_width, target_height = self._get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) + } + + return ProcessorInputs( + prompt_text=image_token * num_images + video_token * num_videos, + mm_data=mm_data, + ) + + +class LlavaOnevisionMultiModalProcessor(LlavaOnevisionProcessingMixin, + LlavaNextMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return LlavaOnevisionProfilingInfo(self.ctx) + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_sizes=MultiModalFieldConfig.batched("image"), + image_embeds=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.batched("video"), + ) def _call_hf_processor( self, @@ -235,7 +287,8 @@ def _call_hf_processor( mm_kwargs=mm_kwargs, ) - video_token = self._get_video_token() + processor = self._get_hf_processor() + video_token = processor.video_token # LLaVA-OneVision processor doesn't support multiple videos # with different sizes when converting back to tensors @@ -303,37 +356,6 @@ def get_video_replacement(item_idx: int): ), ] - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) - num_videos = mm_counts.get("video", 0) - - image_token = self._get_image_token() - video_token = self._get_video_token() - target_width, target_height = self._get_dummy_image_size() - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images), - "video": - self._get_dummy_videos( - width=target_width, - height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), - num_videos=num_videos, - ) - } - - return ProcessorInputs( - prompt_text=image_token * num_images + video_token * num_videos, - mm_data=mm_data, - ) - class LlavaOnevisionMultiModalProjector(nn.Module): diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 7aa9d58d1d348..c8418c14e5fdf 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -14,7 +14,7 @@ # limitations under the License. from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union import torch import torch.nn as nn @@ -28,22 +28,23 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig, MultiModalInputsV2, MultiModalKwargs, NestedTensors, PlaceholderRange) -from vllm.multimodal.parse import ImageEmbeddingItems, ImageProcessorItems +from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems, + ImageSize) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement, _BoundPromptReplacement, _PlaceholderInfo) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of -from .clip import dummy_image_for_clip +from .clip import CLIPVisionModel from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, init_vllm_registered_model, maybe_prefix, @@ -54,10 +55,6 @@ # Cannot find the following 2 numbers from hf config. _IMAGE_TOKEN_ID = 32044 -# Result in the max possible feature size (h:w = 16:1) -MAX_IMAGE_FEATURE_SIZE_HEIGHT = 8000 -MAX_IMAGE_FEATURE_SIZE_WIDTH = 50 - CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(dropout=0.0, hidden_act="quick_gelu", hidden_size=1024, @@ -305,10 +302,17 @@ def add_image_newline(self, image_features_hd): return image_features_hd_newline -class Phi3VMultiModalProcessor(BaseMultiModalProcessor): +class Phi3VProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None} + def _get_hf_processor( + self, + *, + num_crops: Optional[int] = None, + ) -> ProcessorMixin: + if num_crops is not None: + return self.ctx.get_hf_processor(num_crops=num_crops) + + return self.ctx.get_hf_processor() def _get_num_image_tokens( self, @@ -323,23 +327,55 @@ def _get_num_image_tokens( height=image_height, ) + +class Phi3VProfilingInfo(Phi3VProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None} + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + target_width, target_height = self._get_image_size_with_most_features() + max_image_tokens = self._get_num_image_tokens( - image_width=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height=MAX_IMAGE_FEATURE_SIZE_HEIGHT, + image_width=target_width, + image_height=target_height, ) return {"image": max_image_tokens} - def _get_hf_processor( + def _get_image_size_with_most_features(self) -> ImageSize: + # Result in the max possible feature size (h:w = 16:1) + return ImageSize(height=8000, width=50) + + def get_dummy_processor_inputs( self, - *, - num_crops: Optional[int] = None, - ) -> ProcessorMixin: - if num_crops is not None: - return self.ctx.get_hf_processor(num_crops=num_crops) + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) - return self.ctx.get_hf_processor() + target_width, target_height = self._get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images) + } + + hf_processor = self._get_hf_processor() + image_tokens: list[str] = hf_processor.img_tokens # type: ignore + + return ProcessorInputs( + prompt_text="".join(image_tokens[:num_images]), + mm_data=mm_data, + ) + + +class Phi3VMultiModalProcessor(Phi3VProcessingMixin, BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return Phi3VProfilingInfo(self.ctx) def _call_hf_processor( self, @@ -377,10 +413,10 @@ def _get_mm_fields_config( def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor() + hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) image_tokens: list[str] = hf_processor.img_tokens # type: ignore tokenizer = self._get_tokenizer() @@ -442,28 +478,6 @@ def _apply_prompt_replacements( return token_ids, text, placeholders - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) - - data = dummy_image_for_clip( - CLIP_VIT_LARGE_PATCH14_336_CONFIG, - num_images, - image_width_override=MAX_IMAGE_FEATURE_SIZE_WIDTH, - image_height_override=MAX_IMAGE_FEATURE_SIZE_HEIGHT, - ) - - hf_processor = self._get_hf_processor() - image_tokens: list[str] = hf_processor.img_tokens # type: ignore - - return ProcessorInputs( - prompt_text="".join(image_tokens[:num_images]), - mm_data=data, - ) - def apply( self, prompt_text: str, diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index bc3bb1f79b407..a7bb3425ed17c 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -20,8 +20,8 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from functools import cached_property -from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, - Union) +from typing import (Any, Iterable, List, Mapping, Optional, Set, Tuple, + TypedDict, Union) import torch import torch.nn as nn @@ -40,8 +40,9 @@ NestedTensors) from vllm.multimodal.parse import AudioProcessorItems, MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from .interfaces import SupportsMultiModal, SupportsPP @@ -79,28 +80,70 @@ def _get_feat_extract_output_lengths(input_lengths: torch.Tensor): return feat_lengths, output_lengths -class Qwen2AudioMultiModalProcessor(BaseMultiModalProcessor): +class Qwen2AudioProcessingMixin(ProcessingMixin): + + def _get_hf_config(self): + return self.ctx.get_hf_config(Qwen2AudioConfig) + + def _get_hf_processor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> Qwen2AudioProcessor: + return self.ctx.get_hf_processor(Qwen2AudioProcessor) + + def _get_feature_extractor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> WhisperFeatureExtractor: + hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) + feature_extractor = hf_processor.feature_extractor # type: ignore + assert isinstance(feature_extractor, WhisperFeatureExtractor) + return feature_extractor + + +class Qwen2AudioProfilingInfo(Qwen2AudioProcessingMixin, BaseProfilingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) + hf_config = self._get_hf_config() max_source_positions = hf_config.audio_config.max_source_positions max_output_lengths = (max_source_positions - 2) // 2 + 1 return {"audio": max_output_lengths} - def _get_hf_processor( + def get_dummy_processor_inputs( self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - ) -> Qwen2AudioProcessor: - return self.ctx.get_hf_processor(Qwen2AudioProcessor) + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + feature_extractor = self._get_feature_extractor() + + sampling_rate = feature_extractor.sampling_rate + audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) + + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + return ProcessorInputs( + prompt_text="<|AUDIO|>" * num_audios, + mm_data=mm_data, + ) + - def _get_feature_extractor(self) -> WhisperFeatureExtractor: - return self._get_hf_processor().feature_extractor # type: ignore +class Qwen2AudioMultiModalProcessor(Qwen2AudioProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return Qwen2AudioProfilingInfo(self.ctx) def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self._get_feature_extractor() @@ -110,7 +153,7 @@ def _call_hf_processor( self, prompt: str, mm_data: Mapping[str, object], - mm_kwargs: Mapping[str, object], + mm_kwargs: Mapping[str, Any], ) -> BatchFeature: mm_data = dict(mm_data) audios = mm_data.pop("audios", []) @@ -118,7 +161,7 @@ def _call_hf_processor( if audios: mm_data["audios"] = audios - feature_extractor = self._get_feature_extractor() + feature_extractor = self._get_feature_extractor(**mm_kwargs) mm_kwargs = dict( **mm_kwargs, sampling_rate=feature_extractor.sampling_rate, @@ -151,7 +194,7 @@ def _get_prompt_replacements( hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_config = self.ctx.get_hf_config(Qwen2AudioConfig) + hf_config = self._get_hf_config() placeholder = hf_config.audio_token_index feature_attention_mask = out_mm_kwargs.get("feature_attention_mask") @@ -191,27 +234,6 @@ def _always_apply_prompt_replacements(self) -> bool: # tokens than the number of audio items) return True - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() - - sampling_rate = feature_extractor.sampling_rate - audio_len = feature_extractor.chunk_length * sampling_rate - num_audios = mm_counts.get("audio", 0) - - mm_data = { - "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) - } - - return ProcessorInputs( - prompt_text="<|AUDIO|>" * num_audios, - mm_data=mm_data, - ) - @MULTIMODAL_REGISTRY.register_processor(Qwen2AudioMultiModalProcessor) class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index abca85e0e2024..a5c2fb9e84df3 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -59,8 +59,9 @@ from vllm.multimodal.parse import (ImageSize, ModalityDataItems, MultiModalDataParser) from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors from vllm.transformers_utils.config import uses_mrope @@ -708,10 +709,44 @@ def _parse_video_data( return super()._parse_video_data(data) -class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor): +class Qwen2VLProcessingMixin(ProcessingMixin): - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - return {"image": None, "video": None} + def _get_hf_config(self): + return self.ctx.get_hf_config(Qwen2VLConfig) + + def _get_hf_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + ) -> Qwen2VLProcessor: + hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor) + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2VLImageProcessor) + + if min_pixels: + image_processor.min_pixels = min_pixels + if max_pixels: + image_processor.max_pixels = max_pixels + if max_pixels or min_pixels: + image_processor.size = { + "min_pixels": image_processor.min_pixels, + "max_pixels": image_processor.max_pixels, + } + + return hf_processor + + def _get_image_processor( + self, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + ): + hf_processor = self._get_hf_processor(min_pixels=min_pixels, + max_pixels=max_pixels) + image_processor = hf_processor.image_processor # type: ignore + assert isinstance(image_processor, Qwen2VLImageProcessor) + return image_processor def _get_vision_info( self, @@ -721,14 +756,13 @@ def _get_vision_info( num_frames: int = 1, do_resize: bool = True, ) -> tuple[ImageSize, int]: - hf_config = self.ctx.get_hf_config(Qwen2VLConfig) + hf_config = self._get_hf_config() vision_config = hf_config.vision_config patch_size = vision_config.patch_size merge_size = vision_config.spatial_merge_size temporal_patch_size = vision_config.temporal_patch_size - hf_processor = self._get_hf_processor() - image_processor = self._get_image_processor(hf_processor) + image_processor = self._get_image_processor() if do_resize: resized_height, resized_width = smart_resize( @@ -753,7 +787,45 @@ def _get_vision_info( return preprocessed_size, num_vision_tokens - def _get_dummy_image_size(self) -> ImageSize: + def _get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + ) -> int: + _, num_image_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + ) + return num_image_tokens + + def _get_num_video_tokens( + self, + *, + image_width: int, + image_height: int, + num_frames: int, + ) -> int: + _, num_video_tokens = self._get_vision_info( + image_width=image_width, + image_height=image_height, + num_frames=num_frames, + ) + return num_video_tokens + + +class Qwen2VLProfilingInfo(Qwen2VLProcessingMixin, BaseProfilingInfo): + + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + return {"image": None, "video": None} + + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + return { + "image": self._get_max_image_tokens(), + "video": self._get_max_video_tokens(seq_len), + } + + def _get_image_size_with_most_features(self) -> ImageSize: max_image_size, _ = self._get_vision_info( image_width=9999999, image_height=9999999, @@ -761,27 +833,27 @@ def _get_dummy_image_size(self) -> ImageSize: return max_image_size def _get_max_image_tokens(self) -> int: - _, max_image_tokens = self._get_vision_info( - image_width=9999999, - image_height=9999999, - ) - return max_image_tokens + target_width, target_height = self._get_image_size_with_most_features() - def _get_max_video_tokens(self, num_frames: int) -> int: - _, max_video_tokens = self._get_vision_info( - image_width=9999999, - image_height=9999999, - num_frames=num_frames, + return self._get_num_image_tokens( + image_width=target_width, + image_height=target_height, ) - return max_video_tokens def _get_max_video_frames(self, max_tokens: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + num_frames = 0 while True: next_num_frames = num_frames + 1 + next_max_tokens = self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=next_num_frames, + ) - if self._get_max_video_tokens(next_num_frames) > max_tokens: + if next_max_tokens > max_tokens: break num_frames = next_num_frames @@ -797,56 +869,73 @@ def _get_dummy_num_frames(self, seq_len: int) -> int: max_total_frames = self._get_max_video_frames(seq_len - max_image_tokens) - return max(max_total_frames // max(max_videos, 1), 1) + num_frames = max(max_total_frames // max(max_videos, 1), 1) - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - max_image_tokens = self._get_max_image_tokens() + # Temporary workaround for https://github.com/huggingface/transformers/issues/35412 + if num_frames > 1 and num_frames % 2 == 1: + num_frames += 1 - num_frames = self._get_dummy_num_frames(seq_len) - max_video_tokens = self._get_max_video_tokens(num_frames) + return num_frames - return { - "image": max_image_tokens, - "video": max_video_tokens, + def _get_max_video_tokens(self, seq_len: int) -> int: + target_width, target_height = self._get_image_size_with_most_features() + + return self._get_num_video_tokens( + image_width=target_width, + image_height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + ) + + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + + hf_processor = self._get_hf_processor() + image_token: str = hf_processor.image_token + video_token: str = hf_processor.video_token + target_width, target_height = self._get_image_size_with_most_features() + + mm_data = { + "image": + self._get_dummy_images(width=target_width, + height=target_height, + num_images=num_images), + "video": + self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=self._get_dummy_num_frames(seq_len), + num_videos=num_videos, + ) } - def _get_data_parser(self) -> MultiModalDataParser: - return Qwen2MultiModalDataParser() + return ProcessorInputs( + prompt_text=image_token * num_images + video_token * num_videos, + mm_data=mm_data, + ) - def _get_image_processor(self, hf_processor: Qwen2VLProcessor): - image_processor = hf_processor.image_processor # type: ignore - assert isinstance(image_processor, Qwen2VLImageProcessor) - return image_processor - def _get_hf_processor( - self, - *, - min_pixels: Optional[int] = None, - max_pixels: Optional[int] = None, - ) -> Qwen2VLProcessor: - hf_processor = self.ctx.get_hf_processor(Qwen2VLProcessor) - image_processor = self._get_image_processor(hf_processor) +class Qwen2VLMultiModalProcessor(Qwen2VLProcessingMixin, + BaseMultiModalProcessor): - if min_pixels: - image_processor.min_pixels = min_pixels - if max_pixels: - image_processor.max_pixels = max_pixels - if max_pixels or min_pixels: - image_processor.size = { - "min_pixels": image_processor.min_pixels, - "max_pixels": image_processor.max_pixels, - } + def _get_profiling_info(self) -> BaseProfilingInfo: + return Qwen2VLProfilingInfo(self.ctx) - return hf_processor + def _get_data_parser(self) -> MultiModalDataParser: + return Qwen2MultiModalDataParser() def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor() - image_processor = self._get_image_processor(hf_processor) + hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) + image_processor = self._get_image_processor(**hf_processor_mm_kwargs) # NOTE: Only Qwen2VLProcessor in transformers 4.47.0 has # image_token and video_token registered @@ -901,38 +990,6 @@ def _get_mm_fields_config( video_grid_thw=MultiModalFieldConfig.batched("video"), ) - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - num_images = mm_counts.get("image", 0) - num_videos = mm_counts.get("video", 0) - - hf_processor = self._get_hf_processor() - image_token: str = hf_processor.image_token - video_token: str = hf_processor.video_token - target_width, target_height = self._get_dummy_image_size() - - mm_data = { - "image": - self._get_dummy_images(width=target_width, - height=target_height, - num_images=num_images), - "video": - self._get_dummy_videos( - width=target_width, - height=target_height, - num_frames=self._get_dummy_num_frames(seq_len), - num_videos=num_videos, - ) - } - - return ProcessorInputs( - prompt_text=image_token * num_images + video_token * num_videos, - mm_data=mm_data, - ) - @MULTIMODAL_REGISTRY.register_processor(Qwen2VLMultiModalProcessor) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 6ad4661e3bb8d..ba823acecbb56 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,8 +3,8 @@ import math from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from typing import (Any, Iterable, List, Literal, Mapping, Optional, Set, + Tuple, TypedDict, Union) import torch import torch.utils.checkpoint @@ -26,8 +26,9 @@ NestedTensors) from vllm.multimodal.parse import MultiModalDataParser from vllm.multimodal.processing import (BaseMultiModalProcessor, - MultiModalDataItems, ProcessorInputs, + MultiModalDataItems, ProcessingMixin, PromptReplacement) +from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors from vllm.transformers_utils.configs.ultravox import UltravoxConfig @@ -55,7 +56,30 @@ class UltravoxAudioEmbeddingInputs(TypedDict): UltravoxAudioEmbeddingInputs] -class UltravoxMultiModalProcessor(BaseMultiModalProcessor): +class UltravoxProcessingMixin(ProcessingMixin): + + def _get_hf_processor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> ProcessorMixin: + return self.ctx.get_hf_processor() + + def _get_feature_extractor( + self, + *, + # Ignored in initialization + sampling_rate: Optional[int] = None, + ) -> WhisperFeatureExtractor: + hf_processor = self._get_hf_processor(sampling_rate=sampling_rate) + audio_processor = hf_processor.audio_processor # type: ignore + feature_extractor = audio_processor.feature_extractor # type: ignore + assert isinstance(feature_extractor, WhisperFeatureExtractor) + return feature_extractor + + +class UltravoxProfilingInfo(UltravoxProcessingMixin, BaseProfilingInfo): def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"audio": None} @@ -67,17 +91,33 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: return {"audio": max_audio_tokens} - def _get_hf_processor( + def get_dummy_processor_inputs( self, - *, - # Ignored in initialization - sampling_rate: Optional[int] = None, - ) -> ProcessorMixin: - return self.ctx.get_hf_processor() + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + feature_extractor = self._get_feature_extractor() + + sampling_rate = feature_extractor.sampling_rate + audio_len = feature_extractor.chunk_length * sampling_rate + num_audios = mm_counts.get("audio", 0) + + mm_data = { + "audio": + self._get_dummy_audios(length=audio_len, num_audios=num_audios) + } + + return ProcessorInputs( + prompt_text="<|audio|>" * num_audios, + mm_data=mm_data, + ) - def _get_feature_extractor(self) -> WhisperFeatureExtractor: - hf_processor = self._get_hf_processor() - return hf_processor.audio_processor.feature_extractor # type: ignore + +class UltravoxMultiModalProcessor(UltravoxProcessingMixin, + BaseMultiModalProcessor): + + def _get_profiling_info(self) -> BaseProfilingInfo: + return UltravoxProfilingInfo(self.ctx) def _get_data_parser(self) -> MultiModalDataParser: feature_extractor = self._get_feature_extractor() @@ -155,10 +195,10 @@ def _get_mm_fields_config( def _get_prompt_replacements( self, mm_items: MultiModalDataItems, - hf_processor_mm_kwargs: Mapping[str, object], + hf_processor_mm_kwargs: Mapping[str, Any], out_mm_kwargs: MultiModalKwargs, ) -> list[PromptReplacement]: - hf_processor = self._get_hf_processor() + hf_processor = self._get_hf_processor(**hf_processor_mm_kwargs) placeholder = hf_processor.audio_token_replacement # type: ignore def get_replacement_ultravox(item_idx: int): @@ -173,27 +213,6 @@ def get_replacement_ultravox(item_idx: int): ) ] - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - feature_extractor = self._get_feature_extractor() - - sampling_rate = feature_extractor.sampling_rate - audio_len = feature_extractor.chunk_length * sampling_rate - num_audios = mm_counts.get("audio", 0) - - mm_data = { - "audio": - self._get_dummy_audios(length=audio_len, num_audios=num_audios) - } - - return ProcessorInputs( - prompt_text="<|audio|>" * num_audios, - mm_data=mm_data, - ) - class StackAudioFrames(nn.Module): """ diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py index 014f02ee10a1b..8516c9f7066f7 100644 --- a/vllm/model_executor/models/vision.py +++ b/vllm/model_executor/models/vision.py @@ -1,12 +1,8 @@ from abc import ABC, abstractmethod -from typing import Final, Generic, Optional, Protocol, TypeVar +from typing import Final, Generic, Protocol, TypeVar from transformers import PretrainedConfig -from vllm.multimodal.processing import (BaseMultiModalProcessor, - InputProcessingContext, - ProcessingCache) - _C = TypeVar("_C", bound=PretrainedConfig) @@ -43,12 +39,18 @@ def get_patch_grid_length(self) -> int: raise NotImplementedError -def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: +class VisionLanguageConfig(Protocol): + vision_config: Final[PretrainedConfig] + + +def get_vision_encoder_info( + hf_config: VisionLanguageConfig) -> VisionEncoderInfo: # Avoid circular imports from .clip import CLIPEncoderInfo, CLIPVisionConfig from .pixtral import PixtralHFEncoderInfo, PixtralVisionConfig from .siglip import SiglipEncoderInfo, SiglipVisionConfig + vision_config = hf_config.vision_config if isinstance(vision_config, CLIPVisionConfig): return CLIPEncoderInfo(vision_config) if isinstance(vision_config, PixtralVisionConfig): @@ -58,26 +60,3 @@ def vision_encoder_info(vision_config: PretrainedConfig) -> VisionEncoderInfo: msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) - - -class VisionLanguageConfig(Protocol): - vision_config: Final[PretrainedConfig] - - -class BaseVisionLanguageMultiModalProcessor(BaseMultiModalProcessor): - - def __init__(self, - ctx: InputProcessingContext, - *, - cache: Optional[ProcessingCache] = None, - enable_sanity_checks: bool = True) -> None: - super().__init__(ctx, - cache=cache, - enable_sanity_checks=enable_sanity_checks) - - vision_config = self._get_hf_config().vision_config - self._vision_encoder_info = vision_encoder_info(vision_config) - - @abstractmethod - def _get_hf_config(self) -> VisionLanguageConfig: - raise NotImplementedError diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index ebc16b817684a..933c1d3aff0cb 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -8,11 +8,10 @@ from typing import Any, NamedTuple, Optional, Protocol, TypeVar, Union import numpy as np -import numpy.typing as npt import torch from blake3 import blake3 from PIL import Image -from transformers import BatchFeature, ProcessorMixin +from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from vllm.inputs import DummyData, InputProcessingContext from vllm.logger import init_logger @@ -24,6 +23,7 @@ MultiModalInputsV2, MultiModalKwargs, MultiModalKwargsItem, PlaceholderRange) from .parse import MultiModalDataItems, MultiModalDataParser +from .profiling import BaseProfilingInfo logger = init_logger(__name__) @@ -466,14 +466,6 @@ def find_mm_placeholders( return dict(full_groupby_modality(it)) -@dataclass -class ProcessorInputs: - """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" - prompt_text: str - mm_data: MultiModalDataDict - hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) - - class ProcessingCache: def __init__(self, capacity: int) -> None: @@ -585,9 +577,33 @@ def put( self._cache.put(cache_key, output_kwargs) -class BaseMultiModalProcessor(ABC): +class ProcessingMixin: + """ + Contains helper functions to perform processing. + + Not to be confused with :class:`transformers.ProcessorMixin`. + """ + ctx: InputProcessingContext + + def _get_tokenizer(self) -> AnyTokenizer: + return self.ctx.tokenizer + + def _get_hf_config(self) -> PretrainedConfig: + return self.ctx.get_hf_config() + + def _get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + """ + Subclasses can override this method to handle + specific kwargs from model config or user inputs. + """ + return self.ctx.get_hf_processor(**kwargs) + + +class BaseMultiModalProcessor(ProcessingMixin, ABC): """ Abstract base class to process multi-modal inputs to be used in vLLM. + + Not to be confused with :class:`transformers.ProcessorMixin`. """ def __init__(self, @@ -601,6 +617,9 @@ def __init__(self, self.cache = cache self.enable_sanity_checks = enable_sanity_checks + self.data_parser = self._get_data_parser() + self.profiling_info = self._get_profiling_info() + def __call__( self, prompt: str, @@ -609,32 +628,9 @@ def __call__( ) -> MultiModalInputsV2: return self.apply(prompt, mm_data, hf_processor_mm_kwargs) - @abstractmethod - def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: - """ - Return the maximum supported number of items for each modality. - - A value of `None` means unlimited number of items. - - Omitting a modality from the returned dictionary means that - it is not supported at all. - """ - raise NotImplementedError - - @abstractmethod - def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: - """ - Get the maximum possible number of tokens per data item - for each modality. - - The dictionary returned by this method should have the same - keys as that returned by :meth:`get_supported_mm_limits`. - """ - raise NotImplementedError - def _get_data_parser(self) -> MultiModalDataParser: """ - Construct a data parser to preprocess multi-modal data items + Construct a parser to preprocess multi-modal data items before passing them to :meth:`_get_hf_mm_data`. You can support additional modalities by creating a subclass @@ -642,15 +638,12 @@ def _get_data_parser(self) -> MultiModalDataParser: """ return MultiModalDataParser() - def _get_hf_processor(self) -> ProcessorMixin: + def _get_profiling_info(self) -> BaseProfilingInfo: """ - Subclasses can add keyword arguments to this method to accept - additional kwargs from model config or user inputs. + Get the profiling information to find the worst-case memory usage of + the model. """ - return self.ctx.get_hf_processor() - - def _get_tokenizer(self) -> AnyTokenizer: - return self.ctx.tokenizer + raise NotImplementedError def _to_mm_items( self, @@ -660,8 +653,7 @@ def _to_mm_items( Normalize :class:`MultiModalDataDict` to :class:`MultiModalDataItems` before passing them to :meth:`_get_hf_mm_data`. """ - parser = self._get_data_parser() - mm_items = parser.parse_mm_data(mm_data) + mm_items = self.data_parser.parse_mm_data(mm_data) mm_limits = self.ctx.get_mm_config().limit_per_prompt for modality, items in mm_items.items(): @@ -799,7 +791,7 @@ def _apply_hf_processor_missing( # Some HF processors (e.g. Qwen2-VL) expect corresponding # multi-modal tokens to be in the prompt text - dummy_inputs = self._get_dummy_processor_inputs( + dummy_inputs = self.profiling_info.get_dummy_processor_inputs( self.ctx.model_config.max_model_len, mm_missing_counts, ) @@ -1133,73 +1125,14 @@ def apply( mm_placeholders=mm_placeholder_ranges, ) - def _get_dummy_audios( - self, - *, - length: int, - num_audios: int, - ) -> list[npt.NDArray]: - audio = np.zeros((length, )) - return [audio] * num_audios - - def _get_dummy_images( - self, - *, - width: int, - height: int, - num_images: int, - ) -> list[Image.Image]: - image = Image.new("RGB", (width, height), color=0) - return [image] * num_images - - def _get_dummy_videos( - self, - *, - width: int, - height: int, - num_frames: int, - num_videos: int, - ) -> list[npt.NDArray]: - video = np.zeros((num_frames, width, height, 3)) - return [video] * num_videos - - @abstractmethod - def _get_dummy_processor_inputs( - self, - seq_len: int, - mm_counts: Mapping[str, int], - ) -> ProcessorInputs: - """ - Build the multi-modal portion of the input which, after processing, - results in `mm_max_tokens` in :meth:`get_dummy_data`. - """ - raise NotImplementedError - - def _get_and_validate_dummy_mm_counts(self) -> Mapping[str, int]: - mm_limit_per_prompt = self.ctx.get_mm_config().limit_per_prompt - supported_mm_limits = self.get_supported_mm_limits() - - mm_limits = { - modality: mm_limit_per_prompt.get(modality, 1) - for modality in supported_mm_limits - } - - for modality, supported_limit in supported_mm_limits.items(): - limit = mm_limits[modality] - if supported_limit is not None and supported_limit < limit: - raise ValueError( - f"You set {modality}={limit} (or defaulted to 1) in " - f"`--limit-mm-per-prompt`, but this model only supports " - f"at most {supported_limit} {modality} items.") - - return mm_limits - def _get_dummy_mm_inputs( self, seq_len: int, mm_counts: Mapping[str, int], ) -> MultiModalInputsV2: - processor_inputs = self._get_dummy_processor_inputs(seq_len, mm_counts) + profiling = self.profiling_info + processor_inputs = profiling.get_dummy_processor_inputs( + seq_len, mm_counts) return self.apply( prompt_text=processor_inputs.prompt_text, @@ -1211,8 +1144,9 @@ def get_dummy_data(self, seq_len: int) -> DummyData: # Avoid circular import from vllm.sequence import SequenceData - mm_counts = self._get_and_validate_dummy_mm_counts() - mm_max_tokens_per_item = self.get_mm_max_tokens_per_item(seq_len) + profiling = self.profiling_info + mm_counts = profiling.get_mm_limits() + mm_max_tokens_per_item = profiling.get_mm_max_tokens_per_item(seq_len) if mm_counts.keys() != mm_max_tokens_per_item.keys(): raise AssertionError( "The keys returned by `get_supported_mm_limits`" diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py new file mode 100644 index 0000000000000..2ecf0db1a485d --- /dev/null +++ b/vllm/multimodal/profiling.py @@ -0,0 +1,121 @@ +from abc import ABC, abstractmethod +from collections.abc import Mapping +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np +import numpy.typing as npt +from PIL import Image + +from vllm.inputs import InputProcessingContext +from vllm.logger import init_logger + +from .inputs import MultiModalDataDict + +logger = init_logger(__name__) + + +@dataclass +class ProcessorInputs: + """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" + prompt_text: str + mm_data: MultiModalDataDict + hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) + + +class BaseProfilingInfo(ABC): + """ + Abstract base class that provides the information necessary to profile + multi-modal models. + """ + + def __init__(self, ctx: InputProcessingContext) -> None: + super().__init__() + + self.ctx = ctx + + @abstractmethod + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: + """ + Return the maximum supported number of items for each modality. + + A value of `None` means unlimited number of items. + + Omitting a modality from the returned dictionary means that + it is not supported at all. + """ + raise NotImplementedError + + @abstractmethod + def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]: + """ + Get the maximum possible number of tokens per data item + for each modality. + + The dictionary returned by this method should have the same + keys as that returned by :meth:`get_supported_mm_limits`. + """ + raise NotImplementedError + + @abstractmethod + def get_dummy_processor_inputs( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> ProcessorInputs: + """ + Build the multi-modal portion of the input which, after processing, + results in `mm_max_tokens` in :meth:`get_mm_max_tokens_per_item`. + """ + raise NotImplementedError + + def _get_dummy_audios( + self, + *, + length: int, + num_audios: int, + ) -> list[npt.NDArray]: + audio = np.zeros((length, )) + return [audio] * num_audios + + def _get_dummy_images( + self, + *, + width: int, + height: int, + num_images: int, + ) -> list[Image.Image]: + image = Image.new("RGB", (width, height), color=0) + return [image] * num_images + + def _get_dummy_videos( + self, + *, + width: int, + height: int, + num_frames: int, + num_videos: int, + ) -> list[npt.NDArray]: + video = np.zeros((num_frames, width, height, 3)) + return [video] * num_videos + + def get_mm_limits(self) -> Mapping[str, int]: + mm_config = self.ctx.get_mm_config() + mm_limit_per_prompt = mm_config.limit_per_prompt + + supported_mm_limits = self.get_supported_mm_limits() + + mm_limits = { + modality: mm_limit_per_prompt.get(modality, 1) + for modality in supported_mm_limits + } + + for modality, supported_limit in supported_mm_limits.items(): + limit = mm_limits[modality] + if supported_limit is not None and supported_limit < limit: + raise ValueError( + f"You set {modality}={limit} (or defaulted to 1) in " + f"`--limit-mm-per-prompt`, but this model only supports " + f"at most {supported_limit} {modality} items.") + + return mm_limits diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index fb4389dc4df42..f75a594a4c4e0 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -224,7 +224,7 @@ def get_max_tokens_per_item_by_modality( tokenizer = cached_get_tokenizer(model_config.tokenizer) processor = self.create_processor(model_config, tokenizer) seq_len = model_config.max_model_len - return processor.get_mm_max_tokens_per_item(seq_len) + return processor.profiling_info.get_mm_max_tokens_per_item(seq_len) return { key: plugin.get_max_multimodal_tokens(model_config) From ee77fdb5de42a6fead2b897d87d99d4b1e5650a9 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Mon, 6 Jan 2025 21:40:31 +0800 Subject: [PATCH 090/462] [Doc][2/N] Reorganize Models and Usage sections (#11755) Signed-off-by: DarkLight1337 --- .github/ISSUE_TEMPLATE/600-new-model.yml | 2 +- .../disagg_prefill/abstraction.jpg | Bin .../disagg_prefill/overview.jpg | Bin docs/source/contributing/model/basic.md | 102 ++++++++++++ docs/source/contributing/model/index.md | 26 +++ .../model/multimodal.md} | 8 +- .../source/contributing/model/registration.md | 56 +++++++ .../automatic_prefix_caching.md} | 6 +- docs/source/design/kernel/paged_attention.md | 2 + .../dev/offline_inference/offline_index.md | 1 + .../automatic_prefix_caching.md} | 8 +- .../compatibility_matrix.md | 6 +- .../{usage => features}/disagg_prefill.md | 4 +- docs/source/{usage => features}/lora.md | 0 .../{usage => features}/multimodal_inputs.md | 0 .../{ => features}/quantization/auto_awq.md | 0 .../source/{ => features}/quantization/bnb.md | 0 .../source/{ => features}/quantization/fp8.md | 0 .../quantization/fp8_e4m3_kvcache.md | 0 .../quantization/fp8_e5m2_kvcache.md | 0 .../{ => features}/quantization/gguf.md | 0 docs/source/features/quantization/index.md | 19 +++ .../{ => features}/quantization/int8.md | 0 .../quantization/supported_hardware.md | 10 +- .../source/{usage => features}/spec_decode.md | 0 .../{usage => features}/structured_outputs.md | 0 .../{usage => features}/tool_calling.md | 0 docs/source/index.md | 66 +++----- docs/source/models/adding_model.md | 155 ------------------ docs/source/models/supported_models.md | 2 +- .../optimization.md} | 4 +- docs/source/{usage => serving}/engine_args.md | 0 docs/source/{usage => serving}/env_vars.md | 0 .../serving/openai_compatible_server.md | 2 +- docs/source/{usage => serving}/usage_stats.md | 0 vllm/attention/backends/rocm_flash_attn.py | 2 +- vllm/config.py | 6 +- vllm/engine/arg_utils.py | 2 +- vllm/engine/output_processor/multi_step.py | 2 +- vllm/executor/cpu_executor.py | 2 +- vllm/platforms/cpu.py | 2 +- vllm/spec_decode/spec_decode_worker.py | 2 +- vllm/utils.py | 2 +- vllm/worker/multi_step_model_runner.py | 2 +- vllm/worker/utils.py | 2 +- 45 files changed, 265 insertions(+), 238 deletions(-) rename docs/source/assets/{usage => features}/disagg_prefill/abstraction.jpg (100%) rename docs/source/assets/{usage => features}/disagg_prefill/overview.jpg (100%) create mode 100644 docs/source/contributing/model/basic.md create mode 100644 docs/source/contributing/model/index.md rename docs/source/{models/enabling_multimodal_inputs.md => contributing/model/multimodal.md} (96%) create mode 100644 docs/source/contributing/model/registration.md rename docs/source/{automatic_prefix_caching/details.md => design/automatic_prefix_caching.md} (90%) rename docs/source/{automatic_prefix_caching/apc.md => features/automatic_prefix_caching.md} (97%) rename docs/source/{usage => features}/compatibility_matrix.md (98%) rename docs/source/{usage => features}/disagg_prefill.md (96%) rename docs/source/{usage => features}/lora.md (100%) rename docs/source/{usage => features}/multimodal_inputs.md (100%) rename docs/source/{ => features}/quantization/auto_awq.md (100%) rename docs/source/{ => features}/quantization/bnb.md (100%) rename docs/source/{ => features}/quantization/fp8.md (100%) rename docs/source/{ => features}/quantization/fp8_e4m3_kvcache.md (100%) rename docs/source/{ => features}/quantization/fp8_e5m2_kvcache.md (100%) rename docs/source/{ => features}/quantization/gguf.md (100%) create mode 100644 docs/source/features/quantization/index.md rename docs/source/{ => features}/quantization/int8.md (100%) rename docs/source/{ => features}/quantization/supported_hardware.md (86%) rename docs/source/{usage => features}/spec_decode.md (100%) rename docs/source/{usage => features}/structured_outputs.md (100%) rename docs/source/{usage => features}/tool_calling.md (100%) delete mode 100644 docs/source/models/adding_model.md rename docs/source/{usage/performance.md => performance/optimization.md} (98%) rename docs/source/{usage => serving}/engine_args.md (100%) rename docs/source/{usage => serving}/env_vars.md (100%) rename docs/source/{usage => serving}/usage_stats.md (100%) diff --git a/.github/ISSUE_TEMPLATE/600-new-model.yml b/.github/ISSUE_TEMPLATE/600-new-model.yml index 794617a0cfdf6..713e76c1a5cec 100644 --- a/.github/ISSUE_TEMPLATE/600-new-model.yml +++ b/.github/ISSUE_TEMPLATE/600-new-model.yml @@ -9,7 +9,7 @@ body: value: > #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+). - #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model. + #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model. - type: textarea attributes: label: The model to consider. diff --git a/docs/source/assets/usage/disagg_prefill/abstraction.jpg b/docs/source/assets/features/disagg_prefill/abstraction.jpg similarity index 100% rename from docs/source/assets/usage/disagg_prefill/abstraction.jpg rename to docs/source/assets/features/disagg_prefill/abstraction.jpg diff --git a/docs/source/assets/usage/disagg_prefill/overview.jpg b/docs/source/assets/features/disagg_prefill/overview.jpg similarity index 100% rename from docs/source/assets/usage/disagg_prefill/overview.jpg rename to docs/source/assets/features/disagg_prefill/overview.jpg diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md new file mode 100644 index 0000000000000..14690ffe24a83 --- /dev/null +++ b/docs/source/contributing/model/basic.md @@ -0,0 +1,102 @@ +(new-model-basic)= + +# Basic Implementation + +This guide walks you through the steps to implement a basic vLLM model. + +## 1. Bring your model code + +First, clone the PyTorch model code from the source repository. +For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from +HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. + +```{warning} +Make sure to review and adhere to the original code's copyright and licensing terms! +``` + +## 2. Make your code compatible with vLLM + +To ensure compatibility with vLLM, your model must meet the following requirements: + +### Initialization Code + +All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for: + +- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. +- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode. + +The initialization code should look like this: + +```python +from torch import nn +from vllm.config import VllmConfig +from vllm.attention import Attention + +class MyAttention(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.attn = Attention(prefix=f"{prefix}.attn") + +class MyDecoderLayer(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") + +class MyModel(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str): + super().__init__() + self.layers = nn.ModuleList( + [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] + ) + +class MyModelForCausalLM(nn.Module): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + self.model = MyModel(vllm_config, prefix=f"{prefix}.model") +``` + +### Computation Code + +Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. + +```python +def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, +) -> torch.Tensor: + ... +``` + +```{note} +Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. +If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. +``` + +For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. + +## 3. (Optional) Implement tensor parallelism and quantization support + +If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. +To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. +For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`. +When it comes to the linear layers, we provide the following options to parallelize them: + +- `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. +- `RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. +- `ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. +- `MergedColumnParallelLinear`: Column-parallel linear that merges multiple `ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. +- `QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. + +Note that all the linear layers above take `linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. + +## 4. Implement the weight loading logic + +You now need to implement the `load_weights` method in your `*ForCausalLM` class. +This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for `MergedColumnParallelLinear` and `QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. + +## 5. Register your model + +See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM. diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md new file mode 100644 index 0000000000000..a2d601c83cf47 --- /dev/null +++ b/docs/source/contributing/model/index.md @@ -0,0 +1,26 @@ +(new-model)= + +# Adding a New Model + +This section provides more information on how to integrate a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. + +```{toctree} +:caption: Contents +:maxdepth: 1 + +basic +registration +multimodal +``` + +```{note} +The complexity of adding a new model depends heavily on the model's architecture. +The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. +However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. +``` + +```{tip} +If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues) +or ask on our [developer slack](https://slack.vllm.ai). +We will be happy to help you out! +``` diff --git a/docs/source/models/enabling_multimodal_inputs.md b/docs/source/contributing/model/multimodal.md similarity index 96% rename from docs/source/models/enabling_multimodal_inputs.md rename to docs/source/contributing/model/multimodal.md index fdd770887900e..e5dcd1223b361 100644 --- a/docs/source/models/enabling_multimodal_inputs.md +++ b/docs/source/contributing/model/multimodal.md @@ -2,15 +2,11 @@ # Enabling Multimodal Inputs -This document walks you through the steps to extend a vLLM model so that it accepts [multi-modal inputs](#multimodal-inputs). - -```{seealso} -[Adding a New Model](adding-a-new-model) -``` +This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs). ## 1. Update the base vLLM model -It is assumed that you have already implemented the model in vLLM according to [these steps](#adding-a-new-model). +It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic). Further update the model as follows: - Implement the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md new file mode 100644 index 0000000000000..cf1cdb0c9de0f --- /dev/null +++ b/docs/source/contributing/model/registration.md @@ -0,0 +1,56 @@ +(new-model-registration)= + +# Model Registration + +vLLM relies on a model registry to determine how to run each model. +A list of pre-registered architectures can be found on the [Supported Models](#supported-models) page. + +If your model is not on this list, you must register it to vLLM. +This page provides detailed instructions on how to do so. + +## Built-in models + +To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source). +This gives you the ability to modify the codebase and test your model. + +After you have implemented your model (see [tutorial](#new-model-basic)), put it into the directory. +Then, add your model class to `_VLLM_MODELS` in so that it is automatically registered upon importing vLLM. +You should also include an example HuggingFace repository for this model in to run the unit tests. +Finally, update the [Supported Models](#supported-models) documentation page to promote your model! + +```{important} +The list of models in each section should be maintained in alphabetical order. +``` + +## Out-of-tree models + +You can load an external model using a plugin without modifying the vLLM codebase. + +```{seealso} +[vLLM's Plugin System](#plugin-system) +``` + +To register the model, use the following code: + +```python +from vllm import ModelRegistry +from your_code import YourModelForCausalLM +ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) +``` + +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`: + +```python +from vllm import ModelRegistry + +ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") +``` + +```{important} +If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. +Read more about that [here](#enabling-multimodal-inputs). +``` + +```{note} +Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. +``` diff --git a/docs/source/automatic_prefix_caching/details.md b/docs/source/design/automatic_prefix_caching.md similarity index 90% rename from docs/source/automatic_prefix_caching/details.md rename to docs/source/design/automatic_prefix_caching.md index 17f806217aa65..4398536b2b4ad 100644 --- a/docs/source/automatic_prefix_caching/details.md +++ b/docs/source/design/automatic_prefix_caching.md @@ -1,6 +1,8 @@ -# Implementation +(design-automatic-prefix-caching)= -The core idea of PagedAttention is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. +# Automatic Prefix Caching + +The core idea of [PagedAttention](#design-paged-attention) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand. To automatically cache the KV cache, we utilize the following key observation: Each KV block can be uniquely identified by the tokens within the block and the tokens in the prefix before the block. diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md index c21985b36eb3a..f896f903c78f5 100644 --- a/docs/source/design/kernel/paged_attention.md +++ b/docs/source/design/kernel/paged_attention.md @@ -1,3 +1,5 @@ +(design-paged-attention)= + # vLLM Paged Attention - Currently, vLLM utilizes its own implementation of a multi-head query diff --git a/docs/source/dev/offline_inference/offline_index.md b/docs/source/dev/offline_inference/offline_index.md index 318a02d8c78df..c32f99d59e3db 100644 --- a/docs/source/dev/offline_inference/offline_index.md +++ b/docs/source/dev/offline_inference/offline_index.md @@ -1,6 +1,7 @@ # Offline Inference ```{toctree} +:caption: Contents :maxdepth: 1 llm diff --git a/docs/source/automatic_prefix_caching/apc.md b/docs/source/features/automatic_prefix_caching.md similarity index 97% rename from docs/source/automatic_prefix_caching/apc.md rename to docs/source/features/automatic_prefix_caching.md index c0c141c5fb7ef..3d70cbb29c385 100644 --- a/docs/source/automatic_prefix_caching/apc.md +++ b/docs/source/features/automatic_prefix_caching.md @@ -1,13 +1,13 @@ -(apc)= +(automatic-prefix-caching)= -# Introduction +# Automatic Prefix Caching -## What is Automatic Prefix Caching +## Introduction Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part. ```{note} -Technical details on how vLLM implements APC are in the next page. +Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching). ``` ## Enabling APC in vLLM diff --git a/docs/source/usage/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md similarity index 98% rename from docs/source/usage/compatibility_matrix.md rename to docs/source/features/compatibility_matrix.md index 3cefa12ea8a1d..8d8f7dca2e5b5 100644 --- a/docs/source/usage/compatibility_matrix.md +++ b/docs/source/features/compatibility_matrix.md @@ -32,7 +32,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar * - Feature - [CP](#chunked-prefill) - - [APC](#apc) + - [APC](#automatic-prefix-caching) - [LoRA](#lora-adapter) - prmpt adptr - [SD](#spec_decode) @@ -64,7 +64,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - - - - * - [APC](#apc) + * - [APC](#automatic-prefix-caching) - ✅ - - @@ -345,7 +345,7 @@ Check the '✗' with links to see tracking issue for unsupported feature/hardwar - ✅ - ✅ - ✅ - * - [APC](#apc) + * - [APC](#automatic-prefix-caching) - [✗](gh-issue:3687) - ✅ - ✅ diff --git a/docs/source/usage/disagg_prefill.md b/docs/source/features/disagg_prefill.md similarity index 96% rename from docs/source/usage/disagg_prefill.md rename to docs/source/features/disagg_prefill.md index a61c00fad1e3c..05226f2dec87c 100644 --- a/docs/source/usage/disagg_prefill.md +++ b/docs/source/features/disagg_prefill.md @@ -41,13 +41,13 @@ Key abstractions for disaggregated prefilling: Here is a figure illustrating how the above 3 abstractions are organized: -```{image} /assets/usage/disagg_prefill/abstraction.jpg +```{image} /assets/features/disagg_prefill/abstraction.jpg :alt: Disaggregated prefilling abstractions ``` The workflow of disaggregated prefilling is as follows: -```{image} /assets/usage/disagg_prefill/overview.jpg +```{image} /assets/features/disagg_prefill/overview.jpg :alt: Disaggregated prefilling workflow ``` diff --git a/docs/source/usage/lora.md b/docs/source/features/lora.md similarity index 100% rename from docs/source/usage/lora.md rename to docs/source/features/lora.md diff --git a/docs/source/usage/multimodal_inputs.md b/docs/source/features/multimodal_inputs.md similarity index 100% rename from docs/source/usage/multimodal_inputs.md rename to docs/source/features/multimodal_inputs.md diff --git a/docs/source/quantization/auto_awq.md b/docs/source/features/quantization/auto_awq.md similarity index 100% rename from docs/source/quantization/auto_awq.md rename to docs/source/features/quantization/auto_awq.md diff --git a/docs/source/quantization/bnb.md b/docs/source/features/quantization/bnb.md similarity index 100% rename from docs/source/quantization/bnb.md rename to docs/source/features/quantization/bnb.md diff --git a/docs/source/quantization/fp8.md b/docs/source/features/quantization/fp8.md similarity index 100% rename from docs/source/quantization/fp8.md rename to docs/source/features/quantization/fp8.md diff --git a/docs/source/quantization/fp8_e4m3_kvcache.md b/docs/source/features/quantization/fp8_e4m3_kvcache.md similarity index 100% rename from docs/source/quantization/fp8_e4m3_kvcache.md rename to docs/source/features/quantization/fp8_e4m3_kvcache.md diff --git a/docs/source/quantization/fp8_e5m2_kvcache.md b/docs/source/features/quantization/fp8_e5m2_kvcache.md similarity index 100% rename from docs/source/quantization/fp8_e5m2_kvcache.md rename to docs/source/features/quantization/fp8_e5m2_kvcache.md diff --git a/docs/source/quantization/gguf.md b/docs/source/features/quantization/gguf.md similarity index 100% rename from docs/source/quantization/gguf.md rename to docs/source/features/quantization/gguf.md diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md new file mode 100644 index 0000000000000..861cb165c11c2 --- /dev/null +++ b/docs/source/features/quantization/index.md @@ -0,0 +1,19 @@ +(quantization-index)= + +# Quantization + +Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. + +```{toctree} +:caption: Contents +:maxdepth: 1 + +supported_hardware +auto_awq +bnb +gguf +int8 +fp8 +fp8_e5m2_kvcache +fp8_e4m3_kvcache +``` diff --git a/docs/source/quantization/int8.md b/docs/source/features/quantization/int8.md similarity index 100% rename from docs/source/quantization/int8.md rename to docs/source/features/quantization/int8.md diff --git a/docs/source/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md similarity index 86% rename from docs/source/quantization/supported_hardware.md rename to docs/source/features/quantization/supported_hardware.md index 7330c2f8aa194..988288a82d9bc 100644 --- a/docs/source/quantization/supported_hardware.md +++ b/docs/source/features/quantization/supported_hardware.md @@ -1,6 +1,6 @@ -(supported-hardware-for-quantization)= +(quantization-supported-hardware)= -# Supported Hardware for Quantization Kernels +# Supported Hardware The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM: @@ -120,12 +120,12 @@ The table below shows the compatibility of various quantization implementations - ✗ ``` -## Notes: - - Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0. - "✅︎" indicates that the quantization method is supported on the specified hardware. - "✗" indicates that the quantization method is not supported on the specified hardware. -Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. +```{note} +This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods. For the most up-to-date information on hardware support and quantization methods, please refer to or consult with the vLLM development team. +``` diff --git a/docs/source/usage/spec_decode.md b/docs/source/features/spec_decode.md similarity index 100% rename from docs/source/usage/spec_decode.md rename to docs/source/features/spec_decode.md diff --git a/docs/source/usage/structured_outputs.md b/docs/source/features/structured_outputs.md similarity index 100% rename from docs/source/usage/structured_outputs.md rename to docs/source/features/structured_outputs.md diff --git a/docs/source/usage/tool_calling.md b/docs/source/features/tool_calling.md similarity index 100% rename from docs/source/usage/tool_calling.md rename to docs/source/features/tool_calling.md diff --git a/docs/source/index.md b/docs/source/index.md index f390474978790..4bc40bf0f5e41 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -79,6 +79,9 @@ serving/metrics serving/integrations serving/tensorizer serving/runai_model_streamer +serving/engine_args +serving/env_vars +serving/usage_stats ``` ```{toctree} @@ -88,53 +91,28 @@ serving/runai_model_streamer models/supported_models models/generative_models models/pooling_models -models/adding_model -models/enabling_multimodal_inputs ``` ```{toctree} -:caption: Usage +:caption: Features :maxdepth: 1 -usage/lora -usage/multimodal_inputs -usage/tool_calling -usage/structured_outputs -usage/spec_decode -usage/compatibility_matrix -usage/performance -usage/engine_args -usage/env_vars -usage/usage_stats -usage/disagg_prefill -``` - -```{toctree} -:caption: Quantization -:maxdepth: 1 - -quantization/supported_hardware -quantization/auto_awq -quantization/bnb -quantization/gguf -quantization/int8 -quantization/fp8 -quantization/fp8_e5m2_kvcache -quantization/fp8_e4m3_kvcache -``` - -```{toctree} -:caption: Automatic Prefix Caching -:maxdepth: 1 - -automatic_prefix_caching/apc -automatic_prefix_caching/details +features/quantization/index +features/lora +features/multimodal_inputs +features/tool_calling +features/structured_outputs +features/automatic_prefix_caching +features/disagg_prefill +features/spec_decode +features/compatibility_matrix ``` ```{toctree} :caption: Performance :maxdepth: 1 +performance/optimization performance/benchmarks ``` @@ -148,10 +126,8 @@ community/meetups community/sponsors ``` -% API Documentation: API reference aimed at vllm library usage - ```{toctree} -:caption: API Documentation +:caption: API Reference :maxdepth: 2 dev/sampling_params @@ -160,30 +136,32 @@ dev/offline_inference/offline_index dev/engine/engine_index ``` -% Design: docs about vLLM internals +% Design Documents: Details about vLLM internals ```{toctree} -:caption: Design +:caption: Design Documents :maxdepth: 2 design/arch_overview design/huggingface_integration design/plugin_system -design/input_processing/model_inputs_index design/kernel/paged_attention +design/input_processing/model_inputs_index design/multimodal/multimodal_index +design/automatic_prefix_caching design/multiprocessing ``` -% For Developers: contributing to the vLLM project +% Developer Guide: How to contribute to the vLLM project ```{toctree} -:caption: For Developers +:caption: Developer Guide :maxdepth: 2 contributing/overview contributing/profiling/profiling_index contributing/dockerfile/dockerfile +contributing/model/index ``` # Indices and tables diff --git a/docs/source/models/adding_model.md b/docs/source/models/adding_model.md deleted file mode 100644 index 02537fba020c4..0000000000000 --- a/docs/source/models/adding_model.md +++ /dev/null @@ -1,155 +0,0 @@ -(adding-a-new-model)= - -# Adding a New Model - -This document provides a high-level guide on integrating a [HuggingFace Transformers](https://github.com/huggingface/transformers) model into vLLM. - -```{note} -The complexity of adding a new model depends heavily on the model's architecture. -The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. -However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. -``` - -```{note} -By default, vLLM models do not support multi-modal inputs. To enable multi-modal support, -please follow [this guide](#enabling-multimodal-inputs) after implementing the model here. -``` - -```{tip} -If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our [GitHub](https://github.com/vllm-project/vllm/issues) repository. -We will be happy to help you out! -``` - -## 0. Fork the vLLM repository - -Start by forking our [GitHub] repository and then [build it from source](#build-from-source). -This gives you the ability to modify the codebase and test your model. - -```{tip} -If you don't want to fork the repository and modify vLLM's codebase, please refer to the "Out-of-Tree Model Integration" section below. -``` - -## 1. Bring your model code - -Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the directory. -For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from the HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file. - -```{warning} -When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. -``` - -## 2. Make your code compatible with vLLM - -To ensure compatibility with vLLM, your model must meet the following requirements: - -### Initialization Code - -All vLLM modules within the model must include a `prefix` argument in their constructor. This `prefix` is typically the full name of the module in the model's state dictionary and is crucial for: - -- Runtime support: vLLM's attention operators are registered in a model's state by their full names. Each attention operator must have a unique prefix as its layer name to avoid conflicts. -- Non-uniform quantization support: A quantized checkpoint can selectively quantize certain layers while keeping others in full precision. By providing the `prefix` during initialization, vLLM can match the current layer's `prefix` with the quantization configuration to determine if the layer should be initialized in quantized mode. - -The initialization code should look like this: - -```python -from torch import nn -from vllm.config import VllmConfig -from vllm.attention import Attention - -class MyAttention(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.attn = Attention(prefix=f"{prefix}.attn") - -class MyDecoderLayer(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.self_attn = MyAttention(prefix=f"{prefix}.self_attn") - -class MyModel(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str): - super().__init__() - self.layers = nn.ModuleList( - [MyDecoderLayer(vllm_config, prefix=f"{prefix}.layers.{i}") for i in range(vllm_config.model_config.hf_config.num_hidden_layers)] - ) - -class MyModelForCausalLM(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): - super().__init__() - self.model = MyModel(vllm_config, prefix=f"{prefix}.model") -``` - -### Computation Code - -Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension. - -```python -def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, -) -> torch.Tensor: - ... -``` - -```{note} -Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. -If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. -``` - -For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out for more examples. - -## 3. (Optional) Implement tensor parallelism and quantization support - -If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. -To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. -For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with {code}`VocabParallelEmbedding`. For the output LM head, you can use {code}`ParallelLMHead`. -When it comes to the linear layers, we provide the following options to parallelize them: - -- {code}`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving. -- {code}`RowParallelLinear`: The input tensor is partitioned along the hidden dimension. The weight matrix is partitioned along the rows (input dimension). An *all-reduce* operation is performed after the matrix multiplication to reduce the results. Typically used for the second FFN layer and the output linear transformation of the attention layer. -- {code}`ColumnParallelLinear`: The input tensor is replicated. The weight matrix is partitioned along the columns (output dimension). The result is partitioned along the column dimension. Typically used for the first FFN layer and the separated QKV transformation of the attention layer in the original Transformer. -- {code}`MergedColumnParallelLinear`: Column-parallel linear that merges multiple {code}`ColumnParallelLinear` operators. Typically used for the first FFN layer with weighted activation functions (e.g., SiLU). This class handles the sharded weight loading logic of multiple weight matrices. -- {code}`QKVParallelLinear`: Parallel linear layer for the query, key, and value projections of the multi-head and grouped-query attention mechanisms. When number of key/value heads are less than the world size, this class replicates the key/value heads properly. This class handles the weight loading and replication of the weight matrices. - -Note that all the linear layers above take {code}`linear_method` as an input. vLLM will set this parameter according to different quantization schemes to support weight quantization. - -## 4. Implement the weight loading logic - -You now need to implement the {code}`load_weights` method in your {code}`*ForCausalLM` class. -This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. Specifically, for {code}`MergedColumnParallelLinear` and {code}`QKVParallelLinear` layers, if the original model has separated weight matrices, you need to load the different parts separately. - -## 5. Register your model - -Finally, register your {code}`*ForCausalLM` class to the {code}`_VLLM_MODELS` in . - -## 6. Out-of-Tree Model Integration - -You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see [plugin-system](#plugin-system). - -To register the model, use the following code: - -```python -from vllm import ModelRegistry -from your_code import YourModelForCausalLM -ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -``` - -If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like {code}`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: - -```python -from vllm import ModelRegistry - -ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") -``` - -```{important} -If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. -Read more about that [here](#enabling-multimodal-inputs). -``` - -```{note} -Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. -``` diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 7682ed104b8c5..5a2778026192a 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -37,7 +37,7 @@ print(output) If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported. ```` -Otherwise, please refer to [Adding a New Model](#adding-a-new-model) and [Enabling Multimodal Inputs](#enabling-multimodal-inputs) for instructions on how to implement your model in vLLM. +Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM. Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support. ### ModelScope diff --git a/docs/source/usage/performance.md b/docs/source/performance/optimization.md similarity index 98% rename from docs/source/usage/performance.md rename to docs/source/performance/optimization.md index 2cd3801bfc82d..4fcde9b03b887 100644 --- a/docs/source/usage/performance.md +++ b/docs/source/performance/optimization.md @@ -1,6 +1,6 @@ -(performance)= +(optimization-and-tuning)= -# Performance and Tuning +# Optimization and Tuning ## Preemption diff --git a/docs/source/usage/engine_args.md b/docs/source/serving/engine_args.md similarity index 100% rename from docs/source/usage/engine_args.md rename to docs/source/serving/engine_args.md diff --git a/docs/source/usage/env_vars.md b/docs/source/serving/env_vars.md similarity index 100% rename from docs/source/usage/env_vars.md rename to docs/source/serving/env_vars.md diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index caf5e8cafd9aa..97e9879075570 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -217,7 +217,7 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai We support both [Vision](https://platform.openai.com/docs/guides/vision)- and [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters; -see our [Multimodal Inputs](../usage/multimodal_inputs.md) guide for more information. +see our [Multimodal Inputs](#multimodal-inputs) guide for more information. - *Note: `image_url.detail` parameter is not supported.* Code example: diff --git a/docs/source/usage/usage_stats.md b/docs/source/serving/usage_stats.md similarity index 100% rename from docs/source/usage/usage_stats.md rename to docs/source/serving/usage_stats.md diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 480901f71047f..d43c15b661ef7 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -430,7 +430,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if attn_type != AttentionType.DECODER: raise NotImplementedError("Encoder self-attention and " diff --git a/vllm/config.py b/vllm/config.py index b0ed88cb7f42b..8b824a1fca511 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -644,7 +644,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, self.use_async_output_proc = False return - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid from vllm.platforms import current_platform if not current_platform.is_async_output_supported(self.enforce_eager): @@ -665,7 +665,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config, if self.runner_type == "pooling": self.use_async_output_proc = False - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if speculative_config: logger.warning("Async output processing is not supported with" @@ -2064,7 +2064,7 @@ def verify_with_model_config(self, model_config: ModelConfig): model_config.quantization) def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if scheduler_config.chunked_prefill_enabled: logger.warning("LoRA with chunked prefill is still experimental " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 69c7c5077fe32..e94664308cf8d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1148,7 +1148,7 @@ def create_engine_config(self, disable_logprobs=self.disable_logprobs_during_spec_decoding, ) - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if self.num_scheduler_steps > 1: if speculative_config is not None: diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 1c6f735f39e04..c8b282b1a7676 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -65,7 +65,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup, @staticmethod @functools.lru_cache def _log_prompt_logprob_unsupported_warning_once(): - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid logger.warning( "Prompt logprob is not supported by multi step workers. " diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 5495bc50ede83..c7f018d9a203e 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -22,7 +22,7 @@ class CPUExecutor(ExecutorBase): def _init_executor(self) -> None: assert self.device_config.device_type == "cpu" - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid assert self.lora_config is None, "cpu backend doesn't support LoRA" diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 09bde9f065eaa..7ba7f5150150c 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -50,7 +50,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: import vllm.envs as envs from vllm.utils import GiB_bytes model_config = vllm_config.model_config - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if not model_config.enforce_eager: logger.warning( diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index de593113b938b..e369da1a70c23 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -108,7 +108,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": return spec_decode_worker -# Reminder: Please update docs/source/usage/compatibility_matrix.md +# Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid class SpecDecodeWorker(LoraNotSupportedWorkerBase): """Worker which implements speculative decoding. diff --git a/vllm/utils.py b/vllm/utils.py index aadeddabf8b55..63057153f851d 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -58,7 +58,7 @@ # Exception strings for non-implemented encoder/decoder scenarios -# Reminder: Please update docs/source/usage/compatibility_matrix.md +# Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid STR_NOT_IMPL_ENC_DEC_SWA = \ diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index dee63a75c0605..a2c2cebf8d1f6 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -822,7 +822,7 @@ def _pythonize_sampler_output( for sgdx, (seq_group, sample_result) in enumerate(zip(seq_groups, samples_list)): - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid # (Check for Guided Decoding) if seq_group.sampling_params.logits_processors: diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index 8f2d343440d3e..ffa8c4cb0ff46 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -13,7 +13,7 @@ def assert_enc_dec_mr_supported_scenario( a supported scenario. ''' - # Reminder: Please update docs/source/usage/compatibility_matrix.md + # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid if enc_dec_mr.cache_config.enable_prefix_caching: From 9279b9f83dd3aa5bb3d3ce57bf92d9361755d164 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 6 Jan 2025 05:48:53 -0800 Subject: [PATCH 091/462] [Bugfix] Fix max image size for LLaVA-Onevision (#11769) Signed-off-by: Roger Wang --- vllm/model_executor/models/llava_onevision.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 6dccc1e0d3b8d..5eac2f223d794 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -19,8 +19,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors -from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems, - VideoProcessorItems) +from vllm.multimodal.parse import (ImageSize, MultiModalDataItems, + VideoEmbeddingItems, VideoProcessorItems) from vllm.multimodal.processing import MultiModalFieldConfig, PromptReplacement from vllm.multimodal.profiling import BaseProfilingInfo, ProcessorInputs from vllm.sequence import IntermediateTensors @@ -170,6 +170,22 @@ def _get_num_video_tokens( class LlavaOnevisionProfilingInfo(LlavaOnevisionProcessingMixin, BaseLlavaProfilingInfo): + def _get_image_size_with_most_features(self) -> ImageSize: + hf_config = self._get_hf_config() + largest_feature_size, largest_feature_pinpoint = 0, None + for (height, width) in hf_config.image_grid_pinpoints: + feat_size = self._get_num_image_tokens(image_width=width, + image_height=height) + if feat_size > largest_feature_size: + largest_feature_size = feat_size + largest_feature_pinpoint = ImageSize(width=width, + height=height) + + if largest_feature_size == 0 or largest_feature_pinpoint is None: + raise ValueError("Cannot have a largest feature size of 0!") + + return largest_feature_pinpoint + def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]: return {"image": None, "video": None} From 4ca5d40adc53aca2a1fbaed81d9d622fde46ebf1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 6 Jan 2025 21:57:44 +0800 Subject: [PATCH 092/462] [doc] explain how to add interleaving sliding window support (#11771) Signed-off-by: youkaichao --- docs/source/contributing/model/basic.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/source/contributing/model/basic.md b/docs/source/contributing/model/basic.md index 14690ffe24a83..002808ac5fbbd 100644 --- a/docs/source/contributing/model/basic.md +++ b/docs/source/contributing/model/basic.md @@ -100,3 +100,16 @@ This method should load the weights from the HuggingFace's checkpoint file and a ## 5. Register your model See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM. + +## Frequently Asked Questions + +### How to support models with interleaving sliding windows? + +For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `mistralai/Ministral-8B-Instruct-2410`), the scheduler will treat the model as a full-attention model, i.e., kv-cache of all tokens will not be dropped. This is to make sure prefix caching works with these models. Sliding window only appears as a parameter to the attention kernel computation. + +To support a model with interleaving sliding windows, we need to take care of the following details: + +- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model. +- In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171). + +With these two steps, interleave sliding windows should work with the model. From 32c9eff2fff8ee91a60c9410c69042dc4c1cc5c8 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 6 Jan 2025 23:22:25 +0800 Subject: [PATCH 093/462] [Bugfix][V1] Fix molmo text-only inputs (#11676) Signed-off-by: Jee Jee Li --- .../vision_language/test_models.py | 10 ++ .../vision_language/vlm_utils/model_utils.py | 99 ++++++++++++++++++- vllm/model_executor/models/molmo.py | 56 ++++------- 3 files changed, 123 insertions(+), 42 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index dc0b683c1f1cb..146685738a1d0 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -341,6 +341,16 @@ ), hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, ), + "molmo": VLMTestInfo( + models=["allenai/Molmo-7B-D-0924"], + test_type=(VLMTestType.IMAGE), + prompt_formatter=lambda img_prompt:"User: " + img_prompt + " Assistant:", # noqa: E501 + max_model_len=4096, + max_num_seqs=2, + image_size_factors=[(),(1.0, 1.0, 1.0)], + patch_hf_runner=model_utils.mlomo_patch_hf_runner, + postprocess_inputs=model_utils.molmo_post_processor, + ), # Tests for phi3v currently live in another file because of a bug in # transformers. Once this issue is fixed, we can enable them here instead. # https://github.com/huggingface/transformers/issues/34307 diff --git a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py index 3eca8fb9dcb1a..6c7a753af787e 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py @@ -5,17 +5,20 @@ import re import types from pathlib import PosixPath -from typing import Callable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import torch from PIL.Image import Image -from transformers import AutoConfig, AutoTokenizer, BatchEncoding +from transformers import (AutoConfig, AutoTokenizer, BatchEncoding, + GenerationConfig) from vllm.sequence import SampleLogprobs from vllm.transformers_utils.tokenizer import patch_padding_side from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE -from .....conftest import HfRunner, ImageAsset, _ImageAssets +from .....conftest import (HfRunner, ImageAsset, PromptAudioInput, + PromptImageInput, PromptVideoInput, _ImageAssets) +from ....utils import TokensTextLogprobs from .types import RunnerOutput @@ -222,6 +225,11 @@ def wrap_inputs_post_processor(hf_inputs: BatchEncoding, dtype: str): return {"model_inputs": hf_inputs} +def molmo_post_processor(hf_inputs: BatchEncoding, dtype: str): + hf_inputs = cast_dtype_post_processor("images")(hf_inputs, dtype) + return {k: v.unsqueeze(0) for k, v in hf_inputs.items()} + + ####### Prompt path encoders for models that need models on disk def qwen_prompt_path_encoder( tmp_path: PosixPath, prompt: str, assets: Union[List[ImageAsset], @@ -451,3 +459,88 @@ def _generate(self, *args, **kwargs): hf_model.model.generate = types.MethodType(_generate, hf_model.model) return hf_model + + +def _generate_greedy_logprobs_limit( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + **kwargs: Any, +) -> List[TokensTextLogprobs]: + all_inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + # Process in batches for inference. + if len(all_inputs): + input_ids_lst = [] + images_lst = [] + images_input_idx_lst = [] + imges_masks_lst = [] + for inputs in all_inputs: + input_ids_lst.append(inputs["input_ids"]) + images_lst.append(inputs["images"]) + images_input_idx_lst.append(inputs["image_input_idx"]) + imges_masks_lst.append(inputs["image_masks"]) + batch_inputs = {} + batch_inputs['input_ids'] = torch.cat(input_ids_lst, dim=0) + batch_inputs['images'] = torch.cat(images_lst, dim=0) + batch_inputs['image_input_idx'] = torch.cat(images_input_idx_lst, + dim=0) + batch_inputs['image_masks'] = torch.cat(imges_masks_lst, dim=0) + + outputs = self.model.generate_from_batch( + batch=self.wrap_device(batch_inputs, + device=self.model.device.type), + generation_config=GenerationConfig( + max_new_tokens=max_tokens, + stop_strings="<|endoftext|>", + do_sample=False, + ), + tokenizer=self.tokenizer, + output_hidden_states=True, + return_dict_in_generate=True, + ) + + all_logprobs: List[List[Dict[int, float]]] = [] + all_output_ids: List[List[int]] = [] + all_output_strs: List[str] = [] + + for index in range(len(all_inputs)): + ( + seq_logprobs_lst, + output_len, + ) = self._hidden_states_to_logprobs(outputs.hidden_states, + num_logprobs) + all_logprobs.append(seq_logprobs_lst) + seq_ids = outputs.sequences[index] + output_ids = seq_ids[-output_len:] + all_output_ids.append(output_ids.tolist()) + all_output_strs.append(self.tokenizer.decode(output_ids)) + outputs = zip(all_output_ids, all_output_strs, all_logprobs) + return [(output_ids, output_str, output_logprobs) + for output_ids, output_str, output_logprobs in outputs] + + +####### Molmo-specific HuggingFace runner patchers +def mlomo_patch_hf_runner(hf_model: HfRunner) -> HfRunner: + """Patches and returns an instance of the HfRunner to use for Molmo.""" + hf_processor = hf_model.processor + + def _processor(*args, **kwargs): + return hf_processor.process(*args, **kwargs) + + hf_model.processor = _processor + + setattr( # noqa: B010 + hf_model, + "generate_greedy_logprobs_limit", + types.MethodType(_generate_greedy_logprobs_limit, hf_model), + ) + + return hf_model diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index cc25be9f5b6a9..0e8287bb56b6b 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1081,45 +1081,25 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): else: out = processor.process(None, image, tokens=inputs["prompt_token_ids"]) - image_processor = processor.image_processor - max_total_crops = 1 + image_processor.max_crops - if image is not None: - images, image_input_idx, image_masks = pad_images( - max_total_crops, - out["images"], - out["image_input_idx"], - out.get("image_masks"), - ) - else: - base_image_input_size = image_processor.base_image_input_size - image_patch_size = image_processor.image_patch_size - image_num_patch = ( - base_image_input_size[0] // image_patch_size, - base_image_input_size[1] // image_patch_size, - ) - n_pixels = image_patch_size * image_patch_size * 3 - n_patches = image_num_patch[0] * image_num_patch[1] - - image_length_w = image_processor.image_token_length_w - image_length_h = image_processor.image_token_length_h - tokens_per_image = image_length_w * image_length_h - images = torch.full( - (max_total_crops, n_patches, n_pixels), - -1, - dtype=torch.float32, - ) - image_input_idx = torch.full( - (max_total_crops, tokens_per_image), - -1, - dtype=torch.int32, + # If there is no image, return directly. + if image is None: + new_prompt_token_ids = out["input_ids"].tolist() + prompt = inputs.get("prompt") + if prompt is None: + prompt = tokenizer.decode(new_prompt_token_ids) + return token_inputs( + prompt_token_ids=new_prompt_token_ids, + prompt=prompt, ) - if image_processor.image_padding_mask: - image_masks = torch.full( - (max_total_crops, n_patches), - -1, - dtype=torch.float32, - ) + image_processor = processor.image_processor + max_total_crops = 1 + image_processor.max_crops + images, image_input_idx, image_masks = pad_images( + max_total_crops, + out["images"], + out["image_input_idx"], + out.get("image_masks"), + ) image_data = dict( images=images, image_input_idx=image_input_idx, @@ -1143,11 +1123,9 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): offset = i size += 1 image_data["image_start_end"] = (offset, offset + size) - prompt = inputs.get("prompt") if prompt is None: prompt = tokenizer.decode(new_prompt_token_ids) - return token_inputs( prompt_token_ids=new_prompt_token_ids, prompt=prompt, From e20c92bb618384ce8d0013e0c9ad273d0c23d65b Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Tue, 7 Jan 2025 00:11:28 +0800 Subject: [PATCH 094/462] [Kernel] Move attn_type to Attention.__init__() (#11690) Signed-off-by: Chen Zhang --- tests/kernels/test_encoder_decoder_attn.py | 100 ++++++++++---------- tests/kernels/utils.py | 12 ++- vllm/attention/backends/abstract.py | 2 +- vllm/attention/backends/blocksparse_attn.py | 14 +-- vllm/attention/backends/flash_attn.py | 4 +- vllm/attention/backends/flashinfer.py | 15 ++- vllm/attention/backends/hpu_attn.py | 13 +-- vllm/attention/backends/ipex_attn.py | 12 +-- vllm/attention/backends/pallas.py | 13 +-- vllm/attention/backends/rocm_flash_attn.py | 14 +-- vllm/attention/backends/torch_sdpa.py | 4 +- vllm/attention/backends/xformers.py | 6 +- vllm/attention/layer.py | 37 ++------ vllm/model_executor/models/bart.py | 44 +++------ vllm/model_executor/models/bert.py | 10 +- vllm/model_executor/models/mllama.py | 11 +-- vllm/model_executor/models/qwen2.py | 35 ++++--- vllm/v1/attention/backends/flash_attn.py | 14 +-- 18 files changed, 159 insertions(+), 201 deletions(-) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index d943b048b7934..614674375786e 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -13,8 +13,7 @@ import torch from tests.kernels.utils import * -from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, - AttentionType) +from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) @@ -64,6 +63,7 @@ class TestPoint(NamedTuple): max_dec_seq_len: int max_enc_seq_len: int num_blocks: int + attn_type: AttentionType class TestResources(NamedTuple): @@ -96,7 +96,6 @@ class TestResources(NamedTuple): ''' scale: float - attn_backend: AttentionBackend attn: Attention kv_cache: torch.Tensor @@ -129,16 +128,17 @@ class that Attention will automatically select when it is constructed. ''' scale = float(1.0 / (test_pt.head_size**0.5)) - attn_backend = make_backend(test_pt.backend_name) attn = Attention( test_pt.num_heads, test_pt.head_size, scale=scale, + prefix=f"{test_pt.attn_type}", + attn_type=test_pt.attn_type, ) if test_pt.num_blocks is None or test_pt.num_heads is None: # Caller does not require a KV cache return TestResources( - scale, attn_backend, attn, + scale, attn, torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE)) # Construct KV cache @@ -148,7 +148,7 @@ class that Attention will automatically select when it is constructed. test_pt.block_size, device=CUDA_DEVICE, backend=test_pt.backend_name) - return TestResources(scale, attn_backend, attn, kv_cache) + return TestResources(scale, attn, kv_cache) def _encoder_attn_setup( @@ -193,6 +193,7 @@ def _encoder_attn_setup( _, max_q_seq_len, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -301,6 +302,7 @@ def _decoder_attn_setup( max_q_seq_len, _, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -488,6 +490,7 @@ def _enc_dec_cross_attn_setup_reuses_query( max_decoder_seq_len, max_encoder_seq_len, _, + _, ) = test_pt scale = test_rsrcs.scale @@ -622,7 +625,6 @@ def _run_encoder_attention_test( & attn_metadata ''' assert attn_metadata.num_decode_tokens == 0 - attn_type = AttentionType.ENCODER packed_qkv = encoder_test_params.packed_qkvo.packed_qkv assert packed_qkv is not None with set_forward_context(attn_metadata, vllm_config): @@ -635,14 +637,11 @@ def _run_encoder_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - packed_qkv.key, - packed_qkv.value, - torch.tensor([], - dtype=torch.float32, - device=packed_qkv.query.device), - attn_metadata, - attn_type=attn_type) + return attn.forward( + reshaped_query, packed_qkv.key, packed_qkv.value, + torch.tensor([], + dtype=torch.float32, + device=packed_qkv.query.device), attn_metadata) def _run_decoder_self_attention_test( @@ -675,7 +674,6 @@ def _run_decoder_self_attention_test( * Attention.forward() applied to packed_{query,key,value}, kv_cache & attn_metadata ''' - attn_type = AttentionType.DECODER attn = test_rsrcs.attn kv_cache = test_rsrcs.kv_cache packed_qkv = decoder_test_params.packed_qkvo.packed_qkv @@ -690,12 +688,8 @@ def _run_decoder_self_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - packed_qkv.key, - packed_qkv.value, - kv_cache, - attn_metadata, - attn_type=attn_type) + return attn.forward(reshaped_query, packed_qkv.key, packed_qkv.value, + kv_cache, attn_metadata) def _run_encoder_decoder_cross_attention_test( @@ -742,7 +736,6 @@ def _run_encoder_decoder_cross_attention_test( ''' assert decoder_test_params.packed_qkvo.packed_qkv is not None - attn_type = AttentionType.ENCODER_DECODER attn = test_rsrcs.attn kv_cache = test_rsrcs.kv_cache if cross_test_params is None: @@ -762,12 +755,8 @@ def _run_encoder_decoder_cross_attention_test( # is shaped as [num_tokens, hidden_size] and we can skip the reshape. reshaped_query = decoder_test_params.packed_qkvo.packed_qkv.query.view( -1, test_pt.num_heads * test_pt.head_size) - return attn.forward(reshaped_query, - key, - value, - kv_cache, - attn_metadata, - attn_type=attn_type) + return attn.forward(reshaped_query, key, value, kv_cache, + attn_metadata) @pytest.fixture(autouse=True) @@ -839,7 +828,7 @@ def test_encoder_only( # is not part of this test test_pt = TestPoint(num_heads, head_size, attn_backend.name, batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096) + max_enc_seq_len, 4096, AttentionType.ENCODER) # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init @@ -855,7 +844,7 @@ def test_encoder_only( # Shared prefill metadata structure prephase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, True, None, decoder_test_params=None, @@ -961,20 +950,29 @@ def test_e2e_enc_dec_attn( # Note: KV cache size of 4096 is arbitrary & chosen intentionally # to be more than necessary, since exceeding the kv cache size # is not part of this test - test_pt = TestPoint(num_heads, head_size, attn_backend.name, - batch_size, block_size, max_dec_seq_len, - max_enc_seq_len, 4096) + enc_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, AttentionType.ENCODER) + enc_dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, + AttentionType.ENCODER_DECODER) + dec_test_pt = TestPoint(num_heads, head_size, attn_backend.name, + batch_size, block_size, max_dec_seq_len, + max_enc_seq_len, 4096, AttentionType.DECODER) # Attention scale factor, attention backend instance, attention wrapper # instance, KV cache init vllm_config = VllmConfig() with set_current_vllm_config(vllm_config): - test_rsrcs = _make_test_resources(test_pt) + enc_test_rsrcs = _make_test_resources(enc_test_pt) + enc_dec_test_rsrcs = _make_test_resources(enc_dec_test_pt) + dec_test_rsrcs = _make_test_resources(dec_test_pt) # Construct encoder attention test params (only used # during prefill) - enc_test_params = _encoder_attn_setup(test_pt, test_rsrcs) + enc_test_params = _encoder_attn_setup(enc_test_pt, enc_test_rsrcs) # Construct Decoder self-attention prefill-phase & decode-phase # test params, including query/key/value tensors, decoder self-attention @@ -987,7 +985,7 @@ def test_e2e_enc_dec_attn( prephase_dec_test_params, decphase_dec_test_params, cross_block_base_addr, - ) = _decoder_attn_setup(test_pt, test_rsrcs) + ) = _decoder_attn_setup(dec_test_pt, dec_test_rsrcs) # Construct encoder/decoder cross-attention prefill-phase # & decode-phase test params, including key/value tensors, @@ -1000,14 +998,14 @@ def test_e2e_enc_dec_attn( dec_qkv, enc_test_params, prephase_dec_test_params, - test_pt, - test_rsrcs, + enc_dec_test_pt, + enc_dec_test_rsrcs, block_base_addr=cross_block_base_addr) # Shared prefill metadata structure assert prephase_dec_test_params.packed_qkvo.packed_qkv is not None prephase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, True, prephase_dec_test_params.packed_qkvo.packed_qkv.q_seq_lens, decoder_test_params=prephase_dec_test_params, @@ -1017,10 +1015,10 @@ def test_e2e_enc_dec_attn( # PREFILL: encoder attention - enc_pckd_act_out = _run_encoder_attention_test(test_rsrcs.attn, + enc_pckd_act_out = _run_encoder_attention_test(enc_test_rsrcs.attn, enc_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=enc_test_pt, vllm_config=vllm_config) # - Is encoder attention result correct? @@ -1030,10 +1028,10 @@ def test_e2e_enc_dec_attn( # PREFILL: decoder self-attention test prephase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, + dec_test_rsrcs, prephase_dec_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=dec_test_pt, vllm_config=vllm_config) # - Is prefill decoder self-attention correct? @@ -1044,11 +1042,11 @@ def test_e2e_enc_dec_attn( # PREFILL: encoder/decoder cross-attention test prephase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, + enc_dec_test_rsrcs, prephase_dec_test_params, prephase_cross_test_params, prephase_attn_metadata, - test_pt=test_pt, + test_pt=enc_dec_test_pt, vllm_config=vllm_config) # - Is prefill encoder/decoder cross-attention correct? @@ -1059,7 +1057,7 @@ def test_e2e_enc_dec_attn( # DECODE: build decode-phase attention metadata decphase_attn_metadata: AttentionMetadata = make_test_metadata( - test_rsrcs.attn_backend, + attn_backend, False, dec_qkv.q_seq_lens, decoder_test_params=decphase_dec_test_params, @@ -1070,10 +1068,10 @@ def test_e2e_enc_dec_attn( # DECODE: decoder self-attention test decphase_dec_pckd_act_out = _run_decoder_self_attention_test( - test_rsrcs, + dec_test_rsrcs, decphase_dec_test_params, decphase_attn_metadata, - test_pt=test_pt, + test_pt=dec_test_pt, vllm_config=vllm_config) # - Is decode-phase decoder self-attention correct? @@ -1084,11 +1082,11 @@ def test_e2e_enc_dec_attn( # DECODE: encoder/decoder cross-attention test decphase_cross_pckd_act_out = _run_encoder_decoder_cross_attention_test( - test_rsrcs, + enc_dec_test_rsrcs, decphase_dec_test_params, None, decphase_attn_metadata, - test_pt=test_pt, + test_pt=enc_dec_test_pt, vllm_config=vllm_config) # - Is decode-phase encoder/decoder cross-attention correct? diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index e7865fb2500ef..848eea7f54cab 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -13,6 +13,7 @@ from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType from vllm.model_executor.layers.activation import SiluAndMul +from vllm.platforms.interface import _Backend from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) @@ -790,7 +791,7 @@ def make_block_tables_slot_mapping( def make_test_metadata( - attn_backend: AttentionBackend, + attn_backend: _Backend, is_prompt: bool, seq_lens: Optional[List[int]], decoder_test_params: Optional[PhaseTestParameters], @@ -815,7 +816,7 @@ def make_test_metadata( Arguments: - * attn_backend: Backend for sourcing attention kernels + * attn_backend_name: Backend for sourcing attention kernels * is_prompt: prefill if True, o/w decode * seq_lens: list of token counts for each sequence * decoder_test_params: decoder self-attention test params; @@ -882,6 +883,8 @@ def make_test_metadata( # (kv_mmap) cross_kv_mmap = cross_test_params.kv_mmap + attn_backend_obj = make_backend(attn_backend.name) + if is_prompt: # Prefill-phase scenario @@ -902,8 +905,7 @@ def make_test_metadata( context_lens, encoder_seq_lens, device=device) - - return attn_backend.make_metadata( + return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=(None if kv_mmap is None else kv_mmap.slot_mapping), multi_modal_placeholder_index_maps=None, @@ -952,7 +954,7 @@ def make_test_metadata( encoder_seq_lens, device=device) - return attn_backend.make_metadata( + return attn_backend_obj.make_metadata( num_prefills=num_prefills, slot_mapping=kv_mmap.slot_mapping, multi_modal_placeholder_index_maps=None, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index aed04361e5fb4..f5dcaea79af93 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -233,6 +233,7 @@ def __init__( kv_cache_dtype: str = "auto", blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: raise NotImplementedError @@ -246,7 +247,6 @@ def forward( attn_metadata: T, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: raise NotImplementedError diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index 99cb84346d84e..7089d59392c36 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -300,6 +300,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: assert blocksparse_params is not None assert alibi_slopes is None, ValueError( @@ -350,6 +351,12 @@ def __init__( active_head_range=self.blocksparse_params.active_head_range, ) + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "BlocksparseFlashAttentionImpl") + def forward( self, query: torch.Tensor, @@ -359,7 +366,6 @@ def forward( attn_metadata: BlocksparseFlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -375,12 +381,6 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "BlocksparseFlashAttentionImpl") - num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index c69e12ad78c44..23ea244f07dfe 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -600,6 +600,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -627,6 +628,7 @@ def __init__( raise ValueError( f"Head size {head_size} is not supported by FlashAttention. " f"Supported head sizes are: {support_head_sizes}.") + self.attn_type = attn_type def forward( self, @@ -637,7 +639,6 @@ def forward( attn_metadata: FlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -659,6 +660,7 @@ def forward( assert output is not None, "Output tensor must be provided." + attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): raise AttributeError("Encoder attention requires setting " diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index e367468d05d26..a11462b2068a5 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -748,6 +748,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -764,6 +765,12 @@ def __init__( assert self.num_heads % self.num_kv_heads == 0 self.num_queries_per_kv = self.num_heads // self.num_kv_heads + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "FlashInferImpl") + def forward( self, query: torch.Tensor, @@ -773,18 +780,10 @@ def forward( attn_metadata: FlashInferMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: # TODO: directly write to output tensor - - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "FlashInferImpl") - num_heads: int = self.num_heads head_size: int = self.head_size num_kv_heads: int = self.num_kv_heads diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py index f90d15d4207e7..94a461e0c8c29 100644 --- a/vllm/attention/backends/hpu_attn.py +++ b/vllm/attention/backends/hpu_attn.py @@ -102,6 +102,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, max_seq_len: int = 4096, + attn_type: str = AttentionType.DECODER, ) -> None: super(AttentionImpl, self).__init__() self.kv_cache_dtype = kv_cache_dtype @@ -143,6 +144,12 @@ def __init__( f"Head size {head_size} is not supported by PagedAttention. " f"Supported head sizes are: {suppored_head_sizes}.") + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "HPUAttentionImpl") + def forward( self, query: torch.Tensor, @@ -152,7 +159,6 @@ def forward( attn_metadata: HPUAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -166,11 +172,6 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "HPUAttentionImpl") batch_size, seq_len, hidden_size = query.shape _, seq_len_kv, _ = key.shape diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 21949874bea47..da1d307daa517 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -115,6 +115,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -146,6 +147,11 @@ def __init__( raise NotImplementedError( "IPEX backend does not support FP8 KV cache. " "Please use xFormers backend instead.") + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "IpexAttnBackendImpl") def split_kv_cache( self, @@ -172,7 +178,6 @@ def forward( attn_metadata: IpexAttnMetadata, # type: ignore k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with IPEX varlen_attention and PagedAttention. @@ -189,11 +194,6 @@ def forward( shape = [num_tokens, num_heads * head_size] """ assert k_scale == 1.0 and v_scale == 1.0 - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "IpexAttnBackendImpl") num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 9809aed0e66f9..2ac492dd8ae54 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -100,6 +100,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: self.num_heads = num_heads self.head_size = head_size @@ -141,6 +142,12 @@ def __init__( # megacore mode will be None. self.megacore_mode = "batch" + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "PallasAttentionBackendImpl") + def forward( self, query: torch.Tensor, @@ -150,7 +157,6 @@ def forward( attn_metadata: PallasMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with Pallas attention. @@ -168,11 +174,6 @@ def forward( shape = [batch_size, seq_len, num_heads * head_size] """ assert k_scale == 1.0 and v_scale == 1.0 - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "PallasAttentionBackendImpl") batch_size, seq_len, hidden_size = query.shape query = query.view(batch_size, seq_len, self.num_heads, self.head_size) key = key.view(batch_size, seq_len, self.num_kv_heads, self.head_size) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index d43c15b661ef7..a91a5af5c3d58 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -338,6 +338,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -397,6 +398,12 @@ def __init__( self.attn_func = _sdpa_attention logger.debug("Using naive attention in ROCmBackend") + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "ROCmFlashAttentionImpl") + def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor: """torch.repeat_interleave(x, dim=1, repeats=n_rep)""" tokens, n_kv_heads, head_dim = x.shape @@ -414,7 +421,6 @@ def forward( attn_metadata: ROCmFlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention and PagedAttention. @@ -432,12 +438,6 @@ def forward( """ # Reminder: Please update docs/source/features/compatibility_matrix.md # If the feature combo become valid - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "ROCmFlashAttentionImpl") - num_tokens, hidden_size = query.shape # Reshape the query, key, and value tensors. query = query.view(-1, self.num_heads, self.head_size) diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 0cff6f5952aba..c14f7754596dd 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -390,6 +390,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -421,6 +422,7 @@ def __init__( raise NotImplementedError( "Torch SDPA backend does not support FP8 KV cache. " "Please use xFormers backend instead.") + self.attn_type = attn_type def forward( self, @@ -431,7 +433,6 @@ def forward( attn_metadata: TorchSDPAMetadata, # type: ignore k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with torch SDPA and PagedAttention. @@ -448,6 +449,7 @@ def forward( shape = [num_tokens, num_heads * head_size] """ assert k_scale == 1.0 and v_scale == 1.0 + attn_type = self.attn_type if (attn_type == AttentionType.ENCODER and (not attn_metadata.is_all_encoder_attn_metadata_set)): raise AttributeError("Encoder attention requires setting " diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 3e59b3603d2c6..694c7cc1bc36a 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -379,6 +379,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -405,6 +406,8 @@ def __init__( f"Head size {head_size} is not supported by PagedAttention. " f"Supported head sizes are: {suppored_head_sizes}.") + self.attn_type = attn_type + def forward( self, query: torch.Tensor, @@ -414,7 +417,6 @@ def forward( attn_metadata: "XFormersMetadata", k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with xFormers and PagedAttention. @@ -468,7 +470,7 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - + attn_type = self.attn_type # Check that appropriate attention metadata attributes are # selected for the desired attention type if (attn_type == AttentionType.ENCODER diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 69b6d1e4648df..f1b3598e60b54 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -41,6 +41,7 @@ def __init__( logits_soft_cap: Optional[float] = None, per_layer_sliding_window: Optional[int] = None, prefix: str = "", + attn_type: str = AttentionType.DECODER, ) -> None: super().__init__() if per_layer_sliding_window is not None: @@ -96,7 +97,7 @@ def __init__( impl_cls = attn_backend.get_impl_cls() self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads, alibi_slopes, sliding_window, kv_cache_dtype, - blocksparse_params, logits_soft_cap) + blocksparse_params, logits_soft_cap, attn_type) self.num_heads = num_heads self.head_size = head_size self.num_kv_heads = num_kv_heads @@ -119,6 +120,7 @@ def __init__( raise ValueError(f"Duplicate layer name: {prefix}") compilation_config.static_forward_context[prefix] = self self.layer_name = prefix + self.attn_type = attn_type def forward( self, @@ -127,18 +129,12 @@ def forward( value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, - attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: if self.use_direct_call: - return self.impl.forward(query, - key, - value, - kv_cache, - attn_metadata, - self._k_scale, - self._v_scale, - attn_type=attn_type) + return self.impl.forward(query, key, value, kv_cache, + attn_metadata, self._k_scale, + self._v_scale) elif self.use_output: output = torch.empty_like(query) hidden_size = query.size(-1) @@ -152,13 +148,11 @@ def forward( if value is not None: value = value.view(-1, self.num_kv_heads, self.head_size) torch.ops.vllm.unified_attention_with_output( - query, key, value, output, kv_cache, attn_type, - self.layer_name) + query, key, value, output, kv_cache, self.layer_name) return output.view(-1, hidden_size) else: return torch.ops.vllm.unified_attention(query, key, value, - kv_cache, attn_type, - self.layer_name) + kv_cache, self.layer_name) def extra_repr(self) -> str: s = f"head_size={self.impl.head_size}" # type: ignore @@ -237,20 +231,13 @@ def unified_attention( key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, - attn_type: str, layer_name: str, ) -> torch.Tensor: forward_context: ForwardContext = get_forward_context() attn_metadata = forward_context.dynamic_forward_context self = forward_context.static_forward_context[layer_name] - return self.impl.forward(query, - key, - value, - kv_cache, - attn_metadata, - self._k_scale, - self._v_scale, - attn_type=attn_type) + return self.impl.forward(query, key, value, kv_cache, attn_metadata, + self._k_scale, self._v_scale) def unified_attention_fake( @@ -258,7 +245,6 @@ def unified_attention_fake( key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, - attn_type: str, layer_name: str, ) -> torch.Tensor: return torch.empty_like(query).contiguous() @@ -279,7 +265,6 @@ def unified_attention_with_output( value: torch.Tensor, output: torch.Tensor, kv_cache: torch.Tensor, - attn_type: str, layer_name: str, ) -> None: forward_context: ForwardContext = get_forward_context() @@ -292,7 +277,6 @@ def unified_attention_with_output( attn_metadata, self._k_scale, self._v_scale, - attn_type=attn_type, output=output) @@ -302,7 +286,6 @@ def unified_attention_with_output_fake( value: torch.Tensor, output: torch.Tensor, kv_cache: torch.Tensor, - attn_type: str, layer_name: str, ) -> None: return diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 3776490cb3465..57eb5adc82d5b 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -71,12 +71,8 @@ def __init__(self, num_embeddings: int, embedding_dim: int): def forward( self, positions: torch.Tensor, - attn_type: AttentionType, ) -> torch.Tensor: """`input_ids' shape is expected to be [bsz x seqlen].""" - - assert attn_type != AttentionType.ENCODER_DECODER - return super().forward(positions + self.offset) @@ -180,7 +176,8 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=AttentionType.ENCODER) def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata) -> torch.Tensor: @@ -189,12 +186,7 @@ def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.out_proj(attn_output) return output @@ -264,7 +256,8 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=AttentionType.DECODER) def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata) -> torch.Tensor: @@ -273,12 +266,7 @@ def forward(self, hidden_states: torch.Tensor, kv_cache: torch.Tensor, qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.DECODER) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.out_proj(attn_output) return output @@ -348,7 +336,8 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=AttentionType.ENCODER_DECODER) def forward( self, @@ -372,12 +361,7 @@ def forward( _, k, v = qkv_enc.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER_DECODER) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.out_proj(attn_output) return output @@ -644,10 +628,7 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, # retrieve input_ids and inputs_embeds inputs_embeds = self.embed_tokens(input_ids) - embed_pos = self.embed_positions( - positions, - AttentionType.ENCODER, - ) + embed_pos = self.embed_positions(positions) embed_pos = embed_pos.to(inputs_embeds.device) hidden_states = inputs_embeds + embed_pos @@ -734,10 +715,7 @@ def forward(self, decoder_input_ids: torch.Tensor, inputs_embeds = self.embed_tokens(decoder_input_ids) # embed positions - embed_pos = self.embed_positions( - decoder_positions, - AttentionType.DECODER, - ) + embed_pos = self.embed_positions(decoder_positions) embed_pos = embed_pos.to(inputs_embeds.device) hidden_states = inputs_embeds + embed_pos diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index c1d47b1bc9bcd..4be136543de15 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -238,7 +238,8 @@ def __init__( num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=AttentionType.ENCODER_ONLY) def forward( self, @@ -248,12 +249,7 @@ def forward( ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) - output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER_ONLY) + output = self.attn(q, k, v, kv_cache, attn_metadata) return output diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 6536f9807730c..c5046e06edecb 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -770,6 +770,7 @@ def __init__( self.scaling, self.num_local_key_value_heads, prefix=f"{prefix}.attn", + attn_type=AttentionType.ENCODER_DECODER, ) def forward( @@ -805,13 +806,9 @@ def forward( kv_range_for_decode, attn_metadata) else: - output = self.attn(q.view(-1, - self.num_local_heads * self.head_dim), - k, - v, - kv_cache, - attn_metadata, - attn_type=AttentionType.ENCODER_DECODER) + output = self.attn( + q.view(-1, self.num_local_heads * self.head_dim), k, v, + kv_cache, attn_metadata) out, _ = self.o_proj(output) return out diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 88f4ea4352726..01745b5fd53e1 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -107,7 +107,8 @@ def __init__(self, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, rope_scaling: Optional[Tuple] = None, - prefix: str = "") -> None: + prefix: str = "", + attn_type: str = AttentionType.DECODER) -> None: super().__init__() self.hidden_size = hidden_size tp_size = get_tensor_model_parallel_world_size() @@ -160,7 +161,8 @@ def __init__(self, num_kv_heads=self.num_kv_heads, cache_config=cache_config, quant_config=quant_config, - prefix=f"{prefix}.attn") + prefix=f"{prefix}.attn", + attn_type=attn_type) def forward( self, @@ -168,17 +170,11 @@ def forward( hidden_states: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, - attn_type: str = AttentionType.DECODER, ) -> torch.Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q, k = self.rotary_emb(positions, q, k) - attn_output = self.attn(q, - k, - v, - kv_cache, - attn_metadata, - attn_type=attn_type) + attn_output = self.attn(q, k, v, kv_cache, attn_metadata) output, _ = self.o_proj(attn_output) return output @@ -197,6 +193,16 @@ def __init__( # Requires transformers > 4.32.0 rope_theta = getattr(config, "rope_theta", 1000000) rope_scaling = getattr(config, "rope_scaling", None) + + # By default, Qwen2 uses causal attention as it is a decoder-only model. + # You can override the HF config with `is_causal=False` to enable + # bidirectional attention, which is used in some embedding models + # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct) + if getattr(config, "is_causal", True): + attn_type = AttentionType.DECODER + else: + attn_type = AttentionType.ENCODER_ONLY + self.self_attn = Qwen2Attention( hidden_size=self.hidden_size, num_heads=config.num_attention_heads, @@ -207,6 +213,7 @@ def __init__( quant_config=quant_config, rope_scaling=rope_scaling, prefix=f"{prefix}.self_attn", + attn_type=attn_type, ) self.mlp = Qwen2MLP( hidden_size=self.hidden_size, @@ -220,15 +227,6 @@ def __init__( self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - # By default, Qwen2 uses causal attention as it is a decoder-only model. - # You can override the HF config with `is_causal=False` to enable - # bidirectional attention, which is used in some embedding models - # (e.g. Alibaba-NLP/gte-Qwen2-7B-instruct) - if getattr(config, "is_causal", True): - self._attn_type = AttentionType.DECODER - else: - self._attn_type = AttentionType.ENCODER_ONLY - def forward( self, positions: torch.Tensor, @@ -249,7 +247,6 @@ def forward( hidden_states=hidden_states, kv_cache=kv_cache, attn_metadata=attn_metadata, - attn_type=self._attn_type, ) # Fully Connected diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 65002f1ad70c7..b02bc9ffde538 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -89,6 +89,7 @@ def __init__( kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, ) -> None: if blocksparse_params is not None: raise ValueError( @@ -119,6 +120,12 @@ def __init__( f"Head size {head_size} is not supported by FlashAttention. " f"Supported head sizes are: {support_head_sizes}.") + if attn_type != AttentionType.DECODER: + raise NotImplementedError("Encoder self-attention and " + "encoder/decoder cross-attention " + "are not implemented for " + "FlashAttentionImpl") + def forward( self, query: torch.Tensor, @@ -128,7 +135,6 @@ def forward( attn_metadata: FlashAttentionMetadata, k_scale: float = 1.0, v_scale: float = 1.0, - attn_type: AttentionType = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: """Forward pass with FlashAttention. @@ -142,12 +148,6 @@ def forward( Returns: shape = [num_tokens, num_heads * head_size] """ - if attn_type != AttentionType.DECODER: - raise NotImplementedError("Encoder self-attention and " - "encoder/decoder cross-attention " - "are not implemented for " - "FlashAttentionImpl") - # NOTE(woosuk): FlashAttention does not support FP8 KV cache. assert k_scale == 1.0 and v_scale == 1.0, ( "key/v_scale is not supported in FlashAttention.") From 91b361ae898c944f823534121613f9d3dc19d7d1 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:58:16 -0800 Subject: [PATCH 095/462] [V1] Extend beyond image modality and support mixed-modality inference with Llava-OneVision (#11685) Signed-off-by: Roger Wang Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- docs/source/models/supported_models.md | 2 +- tests/multimodal/test_utils.py | 209 +++++++++++++++++- tests/v1/core/test_kv_cache_utils.py | 18 +- tests/v1/core/test_prefix_caching.py | 17 +- vllm/model_executor/models/interfaces.py | 6 +- vllm/model_executor/models/llava_onevision.py | 65 +++--- vllm/model_executor/models/molmo.py | 3 - vllm/multimodal/__init__.py | 3 + vllm/multimodal/hasher.py | 100 +++++++++ vllm/multimodal/inputs.py | 9 +- vllm/multimodal/processing.py | 92 +++----- vllm/multimodal/utils.py | 86 ++++++- vllm/v1/engine/__init__.py | 18 +- vllm/v1/engine/mm_input_mapper.py | 67 ------ vllm/v1/engine/processor.py | 101 ++++++--- vllm/v1/request.py | 48 ++-- vllm/v1/worker/gpu_model_runner.py | 74 ++++--- 17 files changed, 636 insertions(+), 282 deletions(-) create mode 100644 vllm/multimodal/hasher.py diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md index 5a2778026192a..94a8849f7edcd 100644 --- a/docs/source/models/supported_models.md +++ b/docs/source/models/supported_models.md @@ -647,7 +647,7 @@ See [this page](#generative-models) for more information on how to use generativ - `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. - - ✅︎ - - + - ✅︎ * - `MiniCPMV` - MiniCPM-V - T + IE+ diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 6029f2e514772..198344e5bd88c 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -2,16 +2,22 @@ import mimetypes import os from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import Dict, Tuple +from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple import numpy as np import pytest from PIL import Image, ImageChops from transformers import AutoConfig, AutoTokenizer +from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import (MediaConnector, + merge_and_sort_multimodal_metadata, repeat_and_pad_placeholder_tokens) +if TYPE_CHECKING: + from vllm.multimodal.hasher import MultiModalHashDict + from vllm.multimodal.inputs import MultiModalPlaceholderDict + # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) TEST_IMAGE_URLS = [ "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", @@ -191,3 +197,204 @@ def test_repeat_and_pad_placeholder_tokens(model): assert new_prompt == expected_prompt assert new_token_ids == expected_token_ids assert ranges == expected_ranges + + +# Used for the next two tests related to `merge_and_sort_multimodal_metadata`. +class TestCase(NamedTuple): + mm_positions: "MultiModalPlaceholderDict" + mm_hashes: Optional["MultiModalHashDict"] + expected_modalities: list[str] + expected_ranges: list[PlaceholderRange] + expected_hashes: Optional[list[str]] + + +def test_merge_and_sort_multimodal_metadata(): + + test_cases = [ + # Single modality should return result as is but flattened + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=2), + ] + }, + mm_hashes={"image": ["hash1", "hash2"]}, + expected_modalities=["image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=2), + ], + expected_hashes=["hash1", "hash2"], + ), + + # Single modality without hashes return None for mm hash. + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=2), + ] + }, + mm_hashes=None, + expected_modalities=["image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=2), + ], + expected_hashes=None, + ), + + # Multiple modalities with hashes should return sorted modalities + # and flattened ranges and hashes. + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + ] + }, + mm_hashes={ + "image": ["image_hash1", "image_hash2"], + "audio": ["audio_hash1", "audio_hash2"], + }, + expected_modalities=["audio", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + expected_hashes=[ + "audio_hash1", "audio_hash2", "image_hash1", "image_hash2" + ], + ), + + # Multiple modalities without hashes should return sorted modalities + # and flattened ranges and None. + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + ] + }, + mm_hashes=None, + expected_modalities=["audio", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=2, length=3), + PlaceholderRange(offset=7, length=4), + PlaceholderRange(offset=11, length=5), + ], + expected_hashes=None, + ), + + # Three modalities + TestCase( + mm_positions={ + "image": [ + PlaceholderRange(offset=15, length=7), + PlaceholderRange(offset=22, length=8), + ], + "audio": [ + PlaceholderRange(offset=0, length=2), + ], + "video": [ + PlaceholderRange(offset=3, length=4), + PlaceholderRange(offset=7, length=5), + PlaceholderRange(offset=12, length=6), + ] + }, + mm_hashes={ + "image": ["image_hash1", "image_hash2"], + "audio": ["audio_hash1"], + "video": ["video_hash1", "video_hash2", "video_hash3"] + }, + expected_modalities=["audio", "video", "image"], + expected_ranges=[ + PlaceholderRange(offset=0, length=2), + PlaceholderRange(offset=3, length=4), + PlaceholderRange(offset=7, length=5), + PlaceholderRange(offset=12, length=6), + PlaceholderRange(offset=15, length=7), + PlaceholderRange(offset=22, length=8), + ], + expected_hashes=[ + "audio_hash1", "video_hash1", "video_hash2", "video_hash3", + "image_hash1", "image_hash2" + ], + ), + ] + + for (mm_positions, mm_hashes, expected_modalities, expected_ranges, + expected_hashes) in test_cases: + modalities, ranges, hashes = merge_and_sort_multimodal_metadata( + mm_positions, mm_hashes) + + assert modalities == expected_modalities + assert ranges == expected_ranges + assert hashes == expected_hashes + + +def test_merge_and_sort_multimodal_metadata_with_interleaving(): + + test_cases = [ + + #