From 17c79f3c364be166b68923bced94f902c00bd8bb Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 22 Oct 2024 13:43:37 -0700 Subject: [PATCH 01/15] [torch.compile] auto infer dynamic_arg_dims from type annotation (#9589) --- vllm/compilation/decorators.py | 68 ++++++++++++++++++++++++++-- vllm/model_executor/models/gemma2.py | 8 +--- vllm/model_executor/models/llama.py | 8 +--- 3 files changed, 65 insertions(+), 19 deletions(-) diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 3ae74cc5cb7dd..0449f9354d0a2 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -1,24 +1,58 @@ import inspect -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union import torch import vllm.envs as envs from vllm.compilation.levels import CompilationLevel from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import supports_dynamo +logger = init_logger(__name__) -def support_torch_compile(dynamic_arg_dims: Dict[str, Union[int, List[int]]]): + +def support_torch_compile( + cls: Optional[type] = None, + dynamic_arg_dims: Optional[Dict[str, Union[int, List[int]]]] = None): """ A decorator to add support for compiling the forward method of a class. + Usage 1: use directly as a decorator without arguments: + + ```python + @support_torch_compile + class MyModel(nn.Module): + def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): + ... + ``` + + Usage 2: use as a decorator with arguments: + + ```python + @support_torch_compile(dynamic_arg_dims={"x": 0, "y": 0}) + class MyModel(nn.Module): + def forward(self, x: torch.Tensor, y: Optional[torch.Tensor]): + ... + ``` + `dynamic_arg_dims` is a dictionary that maps argument names to the dynamic dimensions of the argument. The dynamic dimensions can be either a single integer or a list of integers. - Depending on the value of arguments: + if `dynamic_arg_dims` is `None`, it is inferred from the type annotation + of the `forward` method, based on the following default rules: + + - if the argument is annotated as `torch.Tensor` or + `Optional[torch.Tensor]`, the first dimension will be + marked as dynamic. + - if the argument is annotated as `IntermediateTensors`, the first + dimension of all the tensors in the intermediate tensors + will be marked as dynamic. + + During runtime, when we actually mark dimensions of tensors, + it depends on the value of arguments: - if it is a single integer, the corresponding dimension of the argument will be marked as dynamic. @@ -38,11 +72,35 @@ def cls_decorator_helper(cls: type): if not hasattr(cls, 'forward'): raise TypeError("decorated class should have a forward method.") sig = inspect.signature(cls.forward) - for k in dynamic_arg_dims: + inferred_dynamic_arg_dims = dynamic_arg_dims + if inferred_dynamic_arg_dims is None: + inferred_dynamic_arg_dims = {} + for k, v in sig.parameters.items(): + if v.annotation in [ + torch.Tensor, Optional[torch.Tensor], + IntermediateTensors, Optional[IntermediateTensors] + ]: + inferred_dynamic_arg_dims[k] = 0 + + logger.debug(("Inferred dynamic dimensions for " + "forward method of %s: %s"), cls, + list(inferred_dynamic_arg_dims.keys())) + + if len(inferred_dynamic_arg_dims) == 0: + raise ValueError( + "No dynamic dimensions found in the forward method of " + f"{cls}. 
Please provide dynamic_arg_dims explicitly.") + + for k in inferred_dynamic_arg_dims: if k not in sig.parameters: raise ValueError( f"Argument {k} not found in the forward method of {cls}") - return _support_torch_compile(cls, dynamic_arg_dims) + return _support_torch_compile(cls, inferred_dynamic_arg_dims) + + if cls is not None: + # use `support_torch_compile` as a decorator without arguments + assert isinstance(cls, type) + return cls_decorator_helper(cls) return cls_decorator_helper diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index f958268741cd5..d79248f93f5ae 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -241,13 +241,7 @@ def forward( return hidden_states, residual -@support_torch_compile( - dynamic_arg_dims={ - "input_ids": 0, - "positions": 0, - "inputs_embeds": 0, - "intermediate_tensors": 0, - }) +@support_torch_compile class Gemma2Model(nn.Module): def __init__( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index fd88ae8b50402..c346e3e808e3f 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -268,13 +268,7 @@ def forward( return hidden_states, residual -@support_torch_compile( - dynamic_arg_dims={ - "input_ids": 0, - "positions": 0, - "inputs_embeds": 0, - "intermediate_tensors": 0, - }) +@support_torch_compile class LlamaModel(nn.Module): def __init__( From 23b899a8e62c7ea07981bf8487b0dc2cb17847b8 Mon Sep 17 00:00:00 2001 From: Aurick Qiao Date: Tue, 22 Oct 2024 18:38:12 -0400 Subject: [PATCH 02/15] [Bugfix] fix detokenizer shallow copy (#5919) --- vllm/transformers_utils/detokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py index 345ea14f9f273..7c8423d2b0a34 100644 --- a/vllm/transformers_utils/detokenizer.py +++ b/vllm/transformers_utils/detokenizer.py @@ -90,7 +90,7 @@ def decode_prompt_logprobs_inplace(self, seq_group: SequenceGroup, prefix_offset = next_iter_prefix_offset read_offset = next_iter_read_offset if prev_tokens is None: - prev_tokens = next_iter_tokens + prev_tokens = next_iter_tokens.copy() else: prev_tokens.extend(next_iter_tokens) From cb6fdaa0a0b31985df4fa3ddf069c022c1faacb9 Mon Sep 17 00:00:00 2001 From: Jeremy Arnold <103538711+JArnoldAMD@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:40:38 -0500 Subject: [PATCH 03/15] [Misc] Make benchmarks use EngineArgs (#9529) --- benchmarks/benchmark_latency.py | 155 +--------------- benchmarks/benchmark_prefix_caching.py | 24 +-- benchmarks/benchmark_prioritization.py | 134 +------------- benchmarks/benchmark_throughput.py | 237 ++----------------------- 4 files changed, 38 insertions(+), 512 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index ea1a7788f621d..0a14aedd5feba 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -1,5 +1,6 @@ """Benchmark the latency of processing a single batch of requests.""" import argparse +import dataclasses import json import time from pathlib import Path @@ -10,43 +11,19 @@ from tqdm import tqdm from vllm import LLM, SamplingParams -from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.utils import FlexibleArgumentParser def main(args: 
argparse.Namespace): print(args) + engine_args = EngineArgs.from_cli_args(args) + # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. - llm = LLM( - model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - speculative_draft_tensor_parallel_size=\ - args.speculative_draft_tensor_parallel_size, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - max_model_len=args.max_model_len, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - load_format=args.load_format, - distributed_executor_backend=args.distributed_executor_backend, - otlp_traces_endpoint=args.otlp_traces_endpoint, - enable_prefix_caching=args.enable_prefix_caching, - ) + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams( n=args.n, @@ -125,19 +102,6 @@ def run_to_completion(profile_dir: Optional[str] = None): parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') - parser.add_argument('--model', type=str, default='facebook/opt-125m') - parser.add_argument('--speculative-model', type=str, default=None) - parser.add_argument('--num-speculative-tokens', type=int, default=None) - parser.add_argument('--speculative-draft-tensor-parallel-size', - '-spec-draft-tp', - type=int, - default=None) - parser.add_argument('--tokenizer', type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--input-len', type=int, default=32) parser.add_argument('--output-len', type=int, default=128) parser.add_argument('--batch-size', type=int, default=8) @@ -154,45 +118,6 @@ def run_to_completion(profile_dir: Optional[str] = None): type=int, default=30, help='Number of iterations to run.') - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--enforce-eager', - action='store_true', - help='enforce eager mode and disable CUDA graph') - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. 
' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') parser.add_argument( '--profile', action='store_true', @@ -203,78 +128,12 @@ def run_to_completion(profile_dir: Optional[str] = None): default=None, help=('path to save the pytorch profiler output. Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) - parser.add_argument("--device", - type=str, - default="auto", - choices=DEVICE_OPTIONS, - help='device type for vLLM execution') - parser.add_argument('--block-size', - type=int, - default=16, - help='block size of key/value cache') - parser.add_argument( - '--enable-chunked-prefill', - action='store_true', - help='If True, the prefill requests can be chunked based on the ' - 'max_num_batched_tokens') - parser.add_argument("--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching") - parser.add_argument( - "--ray-workers-use-nsight", - action='store_true', - help="If specified, use nsight to profile ray workers", - ) - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the latency results in JSON format.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. 
When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--otlp-traces-endpoint', - type=str, - default=None, - help='Target URL to which OpenTelemetry traces will be sent.') + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index a354358e43aa3..1aac029992dbf 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -25,6 +25,7 @@ --input-length-range 128:256 """ +import dataclasses import json import random import time @@ -33,6 +34,7 @@ from transformers import PreTrainedTokenizerBase from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs from vllm.utils import FlexibleArgumentParser try: @@ -129,12 +131,9 @@ def main(args): filtered_datasets = [(PROMPT, prompt_len, args.output_len) ] * args.num_prompts - llm = LLM(model=args.model, - tokenizer_mode='auto', - trust_remote_code=True, - enforce_eager=True, - tensor_parallel_size=args.tensor_parallel_size, - enable_prefix_caching=args.enable_prefix_caching) + engine_args = EngineArgs.from_cli_args(args) + + llm = LLM(**dataclasses.asdict(engine_args)) sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len) @@ -162,18 +161,11 @@ def main(args): parser = FlexibleArgumentParser( description= 'Benchmark the performance with or without automatic prefix caching.') - parser.add_argument('--model', - type=str, - default='baichuan-inc/Baichuan2-13B-Chat') parser.add_argument("--dataset-path", type=str, default=None, help="Path to the dataset.") - parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) parser.add_argument('--output-len', type=int, default=10) - parser.add_argument('--enable-prefix-caching', - action='store_true', - help='enable prefix caching') parser.add_argument('--num-prompts', type=int, default=1, @@ -190,9 +182,7 @@ def main(args): default='128:256', help='Range of input lengths for sampling prompts,' 'specified as "min:max" (e.g., "128:256").') - parser.add_argument("--seed", - type=int, - default=0, - help='Random seed for reproducibility') + + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py index 8843e3a927a01..e0c9e6a6db502 100644 --- a/benchmarks/benchmark_prioritization.py +++ b/benchmarks/benchmark_prioritization.py @@ -1,5 +1,6 @@ """Benchmark offline prioritization.""" import argparse +import dataclasses import json import random import time @@ -7,7 +8,8 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.engine.arg_utils import EngineArgs +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -62,46 +64,11 @@ def sample_requests( def run_vllm( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - gpu_memory_utilization: float = 0.9, - download_dir: Optional[str] = None, + engine_args: EngineArgs, ) 
-> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - disable_log_stats=False, - ) + llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. prompts = [] @@ -142,16 +109,8 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm(requests, args.model, args.tokenizer, - args.quantization, args.tensor_parallel_size, - args.seed, args.n, args.trust_remote_code, - args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, - args.enable_chunked_prefill, - args.max_num_batched_tokens, - args.gpu_memory_utilization, args.download_dir) + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) else: raise ValueError(f"Unknown backend: {args.backend}") total_num_tokens = sum(prompt_len + output_len @@ -173,7 +132,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], @@ -191,13 +150,6 @@ def main(args: argparse.Namespace): default=None, help="Output length for each request. Overrides the " "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--n", type=int, default=1, @@ -206,81 +158,13 @@ def main(args: argparse.Namespace): type=int, default=200, help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' 
- 'If unspecified, will use the default value of 0.9.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument( - "--device", - type=str, - default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') - parser.add_argument( - "--enable-prefix-caching", - action='store_true', - help="enable automatic prefix caching for vLLM backend.") - parser.add_argument("--enable-chunked-prefill", - action='store_true', - help="enable chunked prefill for vLLM backend.") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=None, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the throughput results in JSON format.') + parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index e26706af606b0..5cca92edb251b 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -1,5 +1,6 @@ """Benchmark offline inference throughput.""" import argparse +import dataclasses import json import random import time @@ -11,10 +12,9 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser, merge_async_iterators @@ -67,53 +67,11 @@ def sample_requests( def run_vllm( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - distributed_executor_backend: Optional[str], - gpu_memory_utilization: float = 0.9, - num_scheduler_steps: int = 1, - download_dir: Optional[str] = None, - load_format: str = EngineArgs.load_format, - disable_async_output_proc: bool = False, + engine_args: 
EngineArgs, ) -> float: from vllm import LLM, SamplingParams - llm = LLM( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - load_format=load_format, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - ) + llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. prompts: List[str] = [] @@ -155,56 +113,11 @@ def run_vllm( async def run_vllm_async( requests: List[Tuple[str, int, int]], - model: str, - tokenizer: str, - quantization: Optional[str], - tensor_parallel_size: int, - seed: int, n: int, - trust_remote_code: bool, - dtype: str, - max_model_len: Optional[int], - enforce_eager: bool, - kv_cache_dtype: str, - quantization_param_path: Optional[str], - device: str, - enable_prefix_caching: bool, - enable_chunked_prefill: bool, - max_num_batched_tokens: int, - distributed_executor_backend: Optional[str], - gpu_memory_utilization: float = 0.9, - num_scheduler_steps: int = 1, - download_dir: Optional[str] = None, - load_format: str = EngineArgs.load_format, - disable_async_output_proc: bool = False, + engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, ) -> float: from vllm import SamplingParams - engine_args = AsyncEngineArgs( - model=model, - tokenizer=tokenizer, - quantization=quantization, - tensor_parallel_size=tensor_parallel_size, - seed=seed, - trust_remote_code=trust_remote_code, - dtype=dtype, - max_model_len=max_model_len, - gpu_memory_utilization=gpu_memory_utilization, - enforce_eager=enforce_eager, - kv_cache_dtype=kv_cache_dtype, - quantization_param_path=quantization_param_path, - device=device, - enable_prefix_caching=enable_prefix_caching, - download_dir=download_dir, - enable_chunked_prefill=enable_chunked_prefill, - max_num_batched_tokens=max_num_batched_tokens, - distributed_executor_backend=distributed_executor_backend, - load_format=load_format, - num_scheduler_steps=num_scheduler_steps, - disable_async_output_proc=disable_async_output_proc, - worker_use_ray=False, - disable_log_requests=True, - ) async with build_async_engine_client_from_engine_args( engine_args, disable_frontend_multiprocessing) as llm: @@ -328,23 +241,17 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - run_args = [ - requests, args.model, args.tokenizer, args.quantization, - args.tensor_parallel_size, args.seed, args.n, - args.trust_remote_code, args.dtype, args.max_model_len, - args.enforce_eager, args.kv_cache_dtype, - args.quantization_param_path, args.device, - args.enable_prefix_caching, args.enable_chunked_prefill, - args.max_num_batched_tokens, args.distributed_executor_backend, - args.gpu_memory_utilization, args.num_scheduler_steps, - args.download_dir, args.load_format, args.disable_async_output_proc - ] - if args.async_engine: - run_args.append(args.disable_frontend_multiprocessing) - elapsed_time = uvloop.run(run_vllm_async(*run_args)) + elapsed_time = uvloop.run( + run_vllm_async( + 
requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + args.disable_frontend_multiprocessing, + )) else: - elapsed_time = run_vllm(*run_args) + elapsed_time = run_vllm(requests, args.n, + EngineArgs.from_cli_args(args)) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -391,13 +298,6 @@ def main(args: argparse.Namespace): default=None, help="Output length for each request. Overrides the " "output length from the dataset.") - parser.add_argument("--model", type=str, default="facebook/opt-125m") - parser.add_argument("--tokenizer", type=str, default=None) - parser.add_argument('--quantization', - '-q', - choices=[*QUANTIZATION_METHODS, None], - default=None) - parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1) parser.add_argument("--n", type=int, default=1, @@ -406,123 +306,15 @@ def main(args: argparse.Namespace): type=int, default=1000, help="Number of prompts to process.") - parser.add_argument("--seed", type=int, default=0) parser.add_argument("--hf-max-batch-size", type=int, default=None, help="Maximum batch size for HF backend.") - parser.add_argument('--trust-remote-code', - action='store_true', - help='trust remote code from huggingface') - parser.add_argument( - '--max-model-len', - type=int, - default=None, - help='Maximum length of a sequence (including prompt and output). ' - 'If None, will be derived from the model.') - parser.add_argument( - '--dtype', - type=str, - default='auto', - choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') - parser.add_argument('--gpu-memory-utilization', - type=float, - default=0.9, - help='the fraction of GPU memory to be used for ' - 'the model executor, which can range from 0 to 1.' - 'If unspecified, will use the default value of 0.9.') - parser.add_argument("--enforce-eager", - action="store_true", - help="enforce eager execution") - parser.add_argument( - '--kv-cache-dtype', - type=str, - choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'], - default="auto", - help='Data type for kv cache storage. If "auto", will use model ' - 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ' - 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)') - parser.add_argument( - '--quantization-param-path', - type=str, - default=None, - help='Path to the JSON file containing the KV cache scaling factors. ' - 'This should generally be supplied, when KV cache dtype is FP8. ' - 'Otherwise, KV cache scaling factors default to 1.0, which may cause ' - 'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' - 'cuda version greater than 11.8. 
On ROCm (AMD GPU), FP8_E4M3 is ' - 'instead supported for common inference criteria.') - parser.add_argument("--device", - type=str, - default="auto", - choices=DEVICE_OPTIONS, - help='device type for vLLM execution') - parser.add_argument( - "--num-scheduler-steps", - type=int, - default=1, - help="Maximum number of forward steps per scheduler call.") - parser.add_argument( - "--enable-prefix-caching", - action='store_true', - help="Enable automatic prefix caching for vLLM backend.") - parser.add_argument("--enable-chunked-prefill", - action='store_true', - help="enable chunked prefill for vLLM backend.") - parser.add_argument('--max-num-batched-tokens', - type=int, - default=None, - help='maximum number of batched tokens per ' - 'iteration') - parser.add_argument('--download-dir', - type=str, - default=None, - help='directory to download and load the weights, ' - 'default to the default cache dir of huggingface') parser.add_argument( '--output-json', type=str, default=None, help='Path to save the throughput results in JSON format.') - parser.add_argument( - '--distributed-executor-backend', - choices=['ray', 'mp'], - default=None, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=[ - 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', - 'bitsandbytes' - ], - help='The format of the model weights to load.\n\n' - '* "auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available.\n' - '* "pt" will load the weights in the pytorch bin format.\n' - '* "safetensors" will load the weights in the safetensors format.\n' - '* "npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading.\n' - '* "dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.\n' - '* "tensorizer" will load the weights using tensorizer from ' - 'CoreWeave. See the Tensorize vLLM Model script in the Examples' - 'section for more information.\n' - '* "bitsandbytes" will load the weights using bitsandbytes ' - 'quantization.\n') - parser.add_argument( - "--disable-async-output-proc", - action='store_true', - default=False, - help="Disable async output processor for vLLM backend.") parser.add_argument("--async-engine", action='store_true', default=False, @@ -531,6 +323,7 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable decoupled async engine frontend.") + parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model From d1e82408759067eca0ae55e548f6243a9e0aa12d Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Tue, 22 Oct 2024 18:41:13 -0400 Subject: [PATCH 04/15] [Bugfix] Fix spurious "No compiled cutlass_scaled_mm ..." 
for W8A8 on Turing (#9487) --- CMakeLists.txt | 4 ++-- csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f6d1c66b2cf7..a53a8575d01ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,7 +252,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}") else() message(STATUS "Not building Marlin kernels as no compatible archs found" - "in CUDA target architectures") + " in CUDA target architectures") endif() # @@ -432,7 +432,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") message(STATUS "Building Marlin MOE kernels for archs: ${MARLIN_MOE_ARCHS}") else() message(STATUS "Not building Marlin MOE kernels as no compatible archs found" - "in CUDA target architectures") + " in CUDA target architectures") endif() endif() diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 1657f7d0b16e8..97a969cf5e3e0 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -137,9 +137,11 @@ void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, return; } - // Turing - TORCH_CHECK(version_num >= 75); - cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias); + if (version_num >= 75) { + // Turing + cutlass_scaled_mm_sm75(c, a, b, a_scales, b_scales, bias); + return; + } #endif TORCH_CHECK_NOT_IMPLEMENTED( From b17046e2982cad4cc205851c5af98375e0d1c3f3 Mon Sep 17 00:00:00 2001 From: yulei Date: Wed, 23 Oct 2024 06:43:03 +0800 Subject: [PATCH 05/15] [BugFix] Fix metrics error for --num-scheduler-steps > 1 (#8234) --- tests/metrics/test_metrics.py | 39 +++++++++++++++++++++++++++++++++++ vllm/engine/llm_engine.py | 9 ++++++++ 2 files changed, 48 insertions(+) diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 92e6086e312f7..7a361ef320810 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -84,6 +84,45 @@ def test_metric_counter_generation_tokens( f"metric: {metric_count!r}") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("max_tokens", [128, 129]) +@pytest.mark.parametrize("disable_async_output_proc", [True, False]) +def test_metric_counter_generation_tokens_multi_step( + vllm_runner, + example_prompts, + model: str, + max_tokens: int, + disable_async_output_proc: bool, +) -> None: + num_scheduler_steps = 8 + with vllm_runner( + model, + disable_log_stats=False, + gpu_memory_utilization=0.4, + num_scheduler_steps=num_scheduler_steps, + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + tokenizer = vllm_model.model.get_tokenizer() + stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + metric_count = stat_logger.metrics.counter_generation_tokens.labels( + **stat_logger.labels)._value.get() + vllm_generation_count = 0 + for i in range(len(example_prompts)): + vllm_output_ids, vllm_output_str = vllm_outputs[i] + prompt_ids = tokenizer.encode(example_prompts[i]) + # vllm_output_ids contains both prompt tokens and generation tokens. + # We're interested only in the count of the generation tokens. + vllm_generation_count += len(vllm_output_ids) - len(prompt_ids) + + # The multi-step scheduling will continue to execute forward even when + # encountering EOS, leading to slightly imprecise metrics. 
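+        # The bound below therefore allows up to num_scheduler_steps extra
+        # generation tokens per prompt, rather than requiring an exact match.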
+        assert abs(vllm_generation_count - metric_count) <\
+            len(example_prompts) * num_scheduler_steps, \
+            (f"generation token count: {vllm_generation_count!r}\n"
+             f"metric: {metric_count!r}")
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 3a29e6a9ae094..99beea932882d 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1718,6 +1718,15 @@ def _get_stats(self,
                     # TPOTs.
                     latency = seq_group.get_last_latency(now)
                     time_per_output_tokens_iter.append(latency)
+                    if seq_group.state.current_step == 0:
+                        # For async_output_proc, the do_log_stats()
+                        # is called following init_multi_step(), which
+                        # sets the current_step to zero.
+                        actual_num_batched_tokens +=\
+                            seq_group.state.num_steps - 1
+                    else:
+                        actual_num_batched_tokens +=\
+                            seq_group.state.current_step - 1
 
                 # Because of chunked prefill, we can have a single sequence
                 # group that does multiple prompt_runs. To prevent logging

From 208cb34c812585ce387d7aff82678a3776a66756 Mon Sep 17 00:00:00 2001
From: Seth Kimmel
Date: Tue, 22 Oct 2024 15:43:25 -0700
Subject: [PATCH 06/15] [Doc]: Update tensorizer docs to include
 vllm[tensorizer] (#7889)

Co-authored-by: Kaunil Dhruv

---
 docs/source/serving/tensorizer.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/source/serving/tensorizer.rst b/docs/source/serving/tensorizer.rst
index a44696507fb9a..96a93db94871b 100644
--- a/docs/source/serving/tensorizer.rst
+++ b/docs/source/serving/tensorizer.rst
@@ -9,4 +9,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
 For more information on CoreWeave's Tensorizer, please refer to `CoreWeave's Tensorizer documentation `_.
 
 For more information on serializing a vLLM model, as well as a general usage guide to using Tensorizer with vLLM, see
-the `vLLM example script `_.
\ No newline at end of file
+the `vLLM example script `_.
+
+.. note::
+    Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.

From 65050a40e63fb8d57f383ea833d8869f77e85c89 Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Tue, 22 Oct 2024 17:45:35 -0700
Subject: [PATCH 07/15] [Bugfix] Generate exactly input_len tokens in
 benchmark_throughput (#9592)

---
 benchmarks/benchmark_throughput.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 5cca92edb251b..24eb54e7b73bc 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -233,7 +233,16 @@ def main(args: argparse.Namespace):
         args.tokenizer, trust_remote_code=args.trust_remote_code)
     if args.dataset is None:
         # Synthesize a prompt with the given input length.
-        prompt = "hi" * (args.input_len - 1)
+        # As the tokenizer may add additional tokens like BOS, we need to try
+        # different lengths to get the desired input length.
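+        # The search below tries nearby repeat counts of "hi " and keeps the
+        # first prompt whose tokenization is exactly args.input_len; the
+        # for-else raises if no candidate matches.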
+ for i in range(-10, 10): + prompt = "hi " * (args.input_len + i) + tokenized_prompt = tokenizer(prompt).input_ids + if len(tokenized_prompt) == args.input_len: + break + else: + raise ValueError( + f"Failed to synthesize a prompt with {args.input_len} tokens.") requests = [(prompt, args.input_len, args.output_len) for _ in range(args.num_prompts)] else: From 29061ed9df84f1298806b2fc525ce4bc7eba1d29 Mon Sep 17 00:00:00 2001 From: Flex Wang Date: Tue, 22 Oct 2024 20:17:28 -0700 Subject: [PATCH 08/15] [Misc] Add an env var VLLM_LOGGING_PREFIX, if set, it will be prepend to all logging messages (#9590) --- vllm/envs.py | 5 +++++ vllm/logger.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/vllm/envs.py b/vllm/envs.py index a20271229c567..ae6825f280073 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -27,6 +27,7 @@ VLLM_USAGE_SOURCE: str = "" VLLM_CONFIGURE_LOGGING: int = 1 VLLM_LOGGING_LEVEL: str = "INFO" + VLLM_LOGGING_PREFIX: str = "" VLLM_LOGGING_CONFIG_PATH: Optional[str] = None VLLM_TRACE_FUNCTION: int = 0 VLLM_ATTENTION_BACKEND: Optional[str] = None @@ -268,6 +269,10 @@ def get_default_config_root(): "VLLM_LOGGING_LEVEL": lambda: os.getenv("VLLM_LOGGING_LEVEL", "INFO"), + # if set, VLLM_LOGGING_PREFIX will be prepended to all log messages + "VLLM_LOGGING_PREFIX": + lambda: os.getenv("VLLM_LOGGING_PREFIX", ""), + # Trace function calls # If set to 1, vllm will trace function calls # Useful for debugging diff --git a/vllm/logger.py b/vllm/logger.py index 77dddbfb60965..ccf09691a052a 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -15,8 +15,10 @@ VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL +VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX -_FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" +_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " + "%(filename)s:%(lineno)d] %(message)s") _DATE_FORMAT = "%m-%d %H:%M:%S" DEFAULT_LOGGING_CONFIG = { From 831540cf04b0b40cd1fe462356de4a30b831e4ea Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 23 Oct 2024 11:35:29 +0800 Subject: [PATCH 09/15] [Model] Support E5-V (#9576) --- docs/source/models/supported_models.rst | 14 ++ examples/offline_inference_vision_language.py | 6 +- ...ine_inference_vision_language_embedding.py | 190 ++++++++++++++++-- ...e_inference_vision_language_multi_image.py | 7 +- tests/conftest.py | 60 +++--- tests/models/embedding/utils.py | 3 +- .../vision_language/test_llava_next.py | 135 +++++++++++++ .../embedding/vision_language/test_phi3v.py | 93 +++++++-- vllm/model_executor/models/llava_next.py | 33 ++- vllm/model_executor/models/phi3v.py | 2 - vllm/model_executor/models/registry.py | 1 + vllm/model_executor/models/utils.py | 78 ++++++- 12 files changed, 532 insertions(+), 90 deletions(-) create mode 100644 tests/models/embedding/vision_language/test_llava_next.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 3d8df3c9f8c9f..ad153d2927d6c 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -334,6 +334,14 @@ The following modalities are supported depending on the model: - **V**\ ideo - **A**\ udio +Any combination of modalities joined by :code:`+` are supported. + +- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs. + +On the other hand, modalities separated by :code:`/` are mutually exclusive. 
+ +- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs. + .. _supported_vlms: Text Generation @@ -484,6 +492,12 @@ Multimodal Embedding - Example HF Models - :ref:`LoRA ` - :ref:`PP ` + * - :code:`LlavaNextForConditionalGeneration` + - LLaVA-NeXT-based + - T / I + - :code:`royokong/e5-v` + - + - ✅︎ * - :code:`Phi3VForCausalLM` - Phi-3-Vision-based - T + I diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 06b424abd50b5..610cc31db9c4e 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -1,6 +1,6 @@ """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. @@ -450,7 +450,7 @@ def main(args): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models') + 'vision language models for text generation') parser.add_argument('--model-type', '-m', type=str, diff --git a/examples/offline_inference_vision_language_embedding.py b/examples/offline_inference_vision_language_embedding.py index cfedd145a015d..e1732d045f949 100644 --- a/examples/offline_inference_vision_language_embedding.py +++ b/examples/offline_inference_vision_language_embedding.py @@ -1,22 +1,170 @@ +""" +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for multimodal embedding. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from argparse import Namespace +from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args + +from PIL.Image import Image + from vllm import LLM -from vllm.assets.image import ImageAsset - -image = ImageAsset("cherry_blossom").pil_image.convert("RGB") -prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501 - -# Create an LLM. -llm = LLM( - model="TIGER-Lab/VLM2Vec-Full", - task="embedding", - trust_remote_code=True, - max_model_len=4096, - max_num_seqs=2, - mm_processor_kwargs={"num_crops": 16}, -) - -# Generate embedding. The output is a list of EmbeddingRequestOutputs. -outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}}) - -# Print the outputs. 
-for output in outputs: - print(output.outputs.embedding) # list of 3072 floats +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser + + +class TextQuery(TypedDict): + modality: Literal["text"] + text: str + + +class ImageQuery(TypedDict): + modality: Literal["image"] + image: Image + + +class TextImageQuery(TypedDict): + modality: Literal["text+image"] + text: str + image: Image + + +QueryModality = Literal["text", "image", "text+image"] +Query = Union[TextQuery, ImageQuery, TextImageQuery] + + +class ModelRequestData(NamedTuple): + llm: LLM + prompt: str + image: Optional[Image] + + +def run_e5_v(query: Query): + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + + if query["modality"] == "text": + text = query["text"] + prompt = llama3_template.format( + f"{text}\nSummary above sentence in one word: ") + image = None + elif query["modality"] == "image": + prompt = llama3_template.format( + "\nSummary above image in one word: ") + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM( + model="royokong/e5-v", + task="embedding", + max_model_len=4096, + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + +def run_vlm2vec(query: Query): + if query["modality"] == "text": + text = query["text"] + prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501 + image = None + elif query["modality"] == "image": + prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501 + image = query["image"] + elif query["modality"] == "text+image": + text = query["text"] + prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501 + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM( + model="TIGER-Lab/VLM2Vec-Full", + task="embedding", + trust_remote_code=True, + mm_processor_kwargs={"num_crops": 4}, + ) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + +def get_query(modality: QueryModality): + if modality == "text": + return TextQuery(modality="text", text="A dog sitting in the grass") + + if modality == "image": + return ImageQuery( + modality="image", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501 + ), + ) + + if modality == "text+image": + return TextImageQuery( + modality="text+image", + text="A cat standing in the snow.", + image=fetch_image( + "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501 + ), + ) + + msg = f"Modality {modality} is not supported." 
+ raise ValueError(msg) + + +def run_encode(model: str, modality: QueryModality): + query = get_query(modality) + req_data = model_example_map[model](query) + + mm_data = {} + if req_data.image is not None: + mm_data["image"] = req_data.image + + outputs = req_data.llm.encode({ + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + }) + + for output in outputs: + print(output.outputs.embedding) + + +def main(args: Namespace): + run_encode(args.model_name, args.modality) + + +model_example_map = { + "e5_v": run_e5_v, + "vlm2vec": run_vlm2vec, +} + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for multimodal embedding') + parser.add_argument('--model-name', + '-m', + type=str, + default="vlm2vec", + choices=model_example_map.keys(), + help='The name of the embedding model.') + parser.add_argument('--modality', + type=str, + default="image", + choices=get_args(QueryModality), + help='Modality of the input.') + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 69f590fb7950d..e28514bf403f7 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -1,7 +1,7 @@ """ This example shows how to use vLLM for running offline inference with -multi-image input on vision language models, using the chat template defined -by the model. +multi-image input on vision language models for text generation, +using the chat template defined by the model. """ from argparse import Namespace from typing import List, NamedTuple, Optional @@ -334,7 +334,8 @@ def main(args: Namespace): if __name__ == "__main__": parser = FlexibleArgumentParser( description='Demo on using vLLM for offline inference with ' - 'vision language models that support multi-image input') + 'vision language models that support multi-image input for text ' + 'generation') parser.add_argument('--model-type', '-m', type=str, diff --git a/tests/conftest.py b/tests/conftest.py index fc8bd1a473476..76f581e0363f7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -43,10 +43,12 @@ _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] -PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]] -PromptAudioInput = Union[List[Tuple[np.ndarray, int]], - List[List[Tuple[np.ndarray, int]]]] -PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]] +_M = TypeVar("_M") +_PromptMultiModalInput = Union[List[_M], List[List[_M]]] + +PromptImageInput = _PromptMultiModalInput[Image.Image] +PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptVideoInput = _PromptMultiModalInput[np.ndarray] def _read_prompts(filename: str) -> List[str]: @@ -318,12 +320,12 @@ def get_inputs( "text": prompt, "return_tensors": "pt", } - if images is not None and images[i] is not None: - processor_kwargs["images"] = images[i] - if videos is not None and videos[i] is not None: - processor_kwargs["videos"] = videos[i] - if audios is not None and audios[i] is not None: - audio, sr = audios[i] + if images is not None and (image := images[i]) is not None: + processor_kwargs["images"] = image + if videos is not None and (video := videos[i]) is not None: + processor_kwargs["videos"] = video + if audios is not None and (audio_tuple := audios[i]) is not None: 
+ audio, sr = audio_tuple processor_kwargs["audio"] = audio processor_kwargs["sampling_rate"] = sr @@ -338,7 +340,7 @@ def generate( self, prompts: List[str], images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[List[int]], List[str]]]: @@ -368,7 +370,7 @@ def generate_greedy( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[Tuple[List[int], str]]: @@ -409,7 +411,7 @@ def generate_greedy_logprobs( prompts: List[str], max_tokens: int, images: Optional[PromptImageInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, audios: Optional[PromptAudioInput] = None, **kwargs: Any, ) -> List[List[torch.Tensor]]: @@ -488,7 +490,7 @@ def generate_greedy_logprobs_limit( num_logprobs: int, images: Optional[PromptImageInput] = None, audios: Optional[PromptAudioInput] = None, - videos: Optional[List[np.ndarray]] = None, + videos: Optional[PromptVideoInput] = None, **kwargs: Any, ) -> List[TokensTextLogprobs]: all_inputs = self.get_inputs(prompts, @@ -657,15 +659,18 @@ def get_inputs( inputs = [TextPrompt(prompt=prompt) for prompt in prompts] if images is not None: for i, image in enumerate(images): - inputs[i]["multi_modal_data"] = {"image": image} + if image is not None: + inputs[i]["multi_modal_data"] = {"image": image} if videos is not None: for i, video in enumerate(videos): - inputs[i]["multi_modal_data"] = {"video": video} + if video is not None: + inputs[i]["multi_modal_data"] = {"video": video} if audios is not None: for i, audio in enumerate(audios): - inputs[i]["multi_modal_data"] = {"audio": audio} + if audio is not None: + inputs[i]["multi_modal_data"] = {"audio": audio} return inputs @@ -837,13 +842,20 @@ def generate_beam_search( returned_outputs.append((token_ids, texts)) return returned_outputs - def encode(self, prompts: List[str]) -> List[List[float]]: - req_outputs = self.model.encode(prompts) - outputs = [] - for req_output in req_outputs: - embedding = req_output.outputs.embedding - outputs.append(embedding) - return outputs + def encode( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[List[float]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.encode(inputs) + return [req_output.outputs.embedding for req_output in req_outputs] def __enter__(self): return self diff --git a/tests/models/embedding/utils.py b/tests/models/embedding/utils.py index 2fcc2013d91ef..fd1c44d9c117e 100644 --- a/tests/models/embedding/utils.py +++ b/tests/models/embedding/utils.py @@ -16,7 +16,8 @@ def check_embeddings_close( for prompt_idx, (embeddings_0, embeddings_1) in enumerate( zip(embeddings_0_lst, embeddings_1_lst)): - assert len(embeddings_0) == len(embeddings_1) + assert len(embeddings_0) == len(embeddings_1), ( + f"Length mismatch: {len(embeddings_0)} vs. 
{len(embeddings_1)}")
 
         sim = F.cosine_similarity(torch.tensor(embeddings_0),
                                   torch.tensor(embeddings_1),
diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py
new file mode 100644
index 0000000000000..52aef8c34d6f3
--- /dev/null
+++ b/tests/models/embedding/vision_language/test_llava_next.py
@@ -0,0 +1,135 @@
+from typing import List, Type
+
+import pytest
+import torch.nn.functional as F
+from transformers import AutoModelForVision2Seq
+
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import large_gpu_test
+from ..utils import check_embeddings_close
+
+llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
+
+HF_TEXT_PROMPTS = [
+    # T -> X
+    llama3_template.format(
+        "The label of the object is stop sign\nSummary above sentence in one word: "  # noqa: E501
+    ),
+    # T -> X
+    llama3_template.format(
+        "cherry blossom\nSummary above sentence in one word: "),
+]
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    # I -> X
+    "stop_sign":
+    llama3_template.format("<image>\nSummary above image in one word: "),
+    # I -> X
+    "cherry_blossom":
+    llama3_template.format("<image>\nSummary above image in one word: "),
+})
+
+MODELS = ["royokong/e5-v"]
+
+
+def _run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    input_texts: List[str],
+    input_images: PromptImageInput,
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
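+    # (task="embedding" switches vLLM to its embedding path, so encode()
+    # returns one embedding vector per prompt.)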
+ with vllm_runner(model, + task="embedding", + dtype=dtype, + max_model_len=4096, + enforce_eager=True) as vllm_model: + vllm_outputs = vllm_model.encode(input_texts, images=input_images) + + with hf_runner(model, dtype=dtype, + auto_cls=AutoModelForVision2Seq) as hf_model: + # Patch the issue where image_token_id + # exceeds the maximum allowed vocab size + hf_model.model.resize_token_embeddings( + hf_model.model.language_model.vocab_size + 1) + + all_inputs = hf_model.get_inputs(input_texts, images=input_images) + + all_outputs = [] + for inputs in all_inputs: + # Based on: https://huggingface.co/royokong/e5-v + outputs = hf_model.model( + **hf_model.wrap_device(inputs, + device=hf_model.model.device.type), + return_dict=True, + output_hidden_states=True, + ) + pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :], + dim=-1) + + all_outputs.append(pooled_output.tolist()) + + hf_outputs = all_outputs + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + model, + dtype=dtype, + ) diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index 0ca90e6bfa52e..ee411472ba284 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -1,42 +1,53 @@ +from typing import List, Type + import pytest import torch.nn.functional as F -from ....conftest import IMAGE_ASSETS +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test from ..utils import check_embeddings_close +HF_TEXT_PROMPTS = [ + # T -> X + "Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501 + # T -> X + "Retrieve an image of this caption: cherry blossom", +] + HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + # T + I -> X "stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501 + # I -> X "cherry_blossom": - "<|image_1|> Represent the given image with the following question: What is in the image", # noqa: E501 + "<|image_1|> Represent the given image for classification", # noqa: E501 }) MODELS = ["TIGER-Lab/VLM2Vec-Full"] -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half"]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, +def _run_test( + hf_runner: 
Type[HfRunner], + vllm_runner: Type[VllmRunner], + input_texts: List[str], + input_images: PromptImageInput, model: str, + *, dtype: str, ) -> None: # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. # if we run HF first, the cuda initialization will be done and it # will hurt multiprocessing backend with fork method (the default method). - with vllm_runner(model, - task="embedding", - max_model_len=4096, - max_num_seqs=2, - dtype=dtype, + with vllm_runner(model, task="embedding", dtype=dtype, enforce_eager=True) as vllm_model: - vllm_outputs = vllm_model.encode(example_prompts) + vllm_outputs = vllm_model.encode(input_texts, images=input_images) - with hf_runner(model, dtype=dtype) as hf_model: - all_inputs = hf_model.get_inputs(example_prompts) + # use eager mode for hf runner, since phi3_v didn't work with flash_attn + hf_model_kwargs = {"_attn_implementation": "eager"} + with hf_runner(model, dtype=dtype, + model_kwargs=hf_model_kwargs) as hf_model: + all_inputs = hf_model.get_inputs(input_texts, images=input_images) all_outputs = [] for inputs in all_inputs: @@ -61,3 +72,53 @@ def test_models( name_0="hf", name_1="vllm", ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + model, + dtype=dtype, + ) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 4dd472b04bb1a..46cba8ebbc583 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -13,11 +13,13 @@ from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig from vllm.inputs import INPUT_REGISTRY, DecoderOnlyInputs, InputContext +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of from .clip import (CLIPVisionModel, dummy_image_for_clip, @@ -28,8 +30,8 @@ from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) -from .utils import (AutoWeightsLoader, flatten_bn, 
init_vllm_registered_model, - merge_multimodal_embeddings) +from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, + init_vllm_registered_model) # Result in the max possible feature size (2x2 grid of 336x336px tiles) MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448 @@ -312,6 +314,10 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) + # The same model class supports both language generation and embedding + # because the architecture name is the same + self._pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True) + self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) @@ -605,14 +611,12 @@ def forward( image_input = self._parse_and_validate_image_input(**kwargs) if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - + inputs_embeds = embed_multimodal( + input_ids, + self.config.image_token_index, + self.language_model.model.get_input_embeddings, + lambda _: self._process_image_input(image_input), + ) input_ids = None else: inputs_embeds = None @@ -641,6 +645,13 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 91c14e32c946c..9a1083520efd2 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -467,8 +467,6 @@ def input_processor_for_phi3v(ctx: InputContext, prompt_token_ids = inputs["prompt_token_ids"].copy() - print("prompt_token_ids (old)", prompt_token_ids) - # masked placeholder with image token id for idx in image_idx: candidates = _get_image_placeholder_token_id_candidates(model_config, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 8745e0cbd97b6..a255b2a2f3982 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -94,6 +94,7 @@ "MistralModel": ("llama", "LlamaEmbeddingModel"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), # [Multimodal] + "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), } diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ec1d76d2117f3..d96e988fba384 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,7 +1,7 @@ import itertools from dataclasses import dataclass, field -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, - Protocol, Tuple, Union, overload) +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Protocol, Tuple, Union, overload) import torch import torch.nn as nn @@ -294,10 +294,11 @@ def _embedding_count_expression(embeddings: NestedTensors) -> str: _embedding_count_expression(inner) for inner in embeddings) -def 
merge_multimodal_embeddings(input_ids: torch.Tensor,
- inputs_embeds: torch.Tensor,
- multimodal_embeddings: NestedTensors,
- placeholder_token_id: int) -> torch.Tensor:
+def _merge_multimodal_embeddings(
+ inputs_embeds: torch.Tensor,
+ is_multimodal: torch.Tensor,
+ multimodal_embeddings: NestedTensors,
+) -> torch.Tensor:
 """
 Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
 positions in ``inputs_embeds`` corresponding to placeholder tokens in
@@ -306,8 +307,7 @@ def merge_multimodal_embeddings(input_ids: torch.Tensor,
 Note:
 This updates ``inputs_embeds`` in place.
 """
- mask = (input_ids == placeholder_token_id)
- num_expected_tokens = mask.sum().item()
+ num_expected_tokens = is_multimodal.sum().item()
 assert isinstance(num_expected_tokens, int)
 flattened = _flatten_embeddings(multimodal_embeddings)
@@ -317,10 +317,70 @@ def merge_multimodal_embeddings(input_ids: torch.Tensor,
 f"Attempted to assign {expr} = {flattened.shape[0]} "
 f"multimodal tokens to {num_expected_tokens} placeholders")
- inputs_embeds[mask] = flattened
+ inputs_embeds[is_multimodal] = flattened
 return inputs_embeds
+
+def embed_multimodal(
+ input_ids: torch.Tensor,
+ multimodal_token_id: int,
+ get_text_embeds: Callable[[torch.Tensor], torch.Tensor],
+ get_multimodal_embeds: Callable[[torch.Tensor], Union[torch.Tensor,
+ List[torch.Tensor]]],
+) -> torch.Tensor:
+ """
+ Embed token IDs and multimodal inputs and combine their embeddings.
+
+ ``multimodal_token_id`` is used to determine whether a token ID should
+ be embedded using ``get_text_embeds`` or ``get_multimodal_embeds``.
+
+ Compared to ``merge_multimodal_embeddings``, this avoids running
+ ``get_text_embeds`` on ``input_ids[input_ids == multimodal_token_id]``
+ which causes issues when the placeholder token ID exceeds the
+ vocabulary size of the language model.
+ """
+ is_multimodal = input_ids == multimodal_token_id
+ is_text = ~is_multimodal
+
+ text_embeds = get_text_embeds(input_ids[is_text])
+ multimodal_embeds = get_multimodal_embeds(input_ids[is_multimodal])
+
+ merged_embeds = torch.empty(
+ (input_ids.shape[0], text_embeds.shape[1]),
+ dtype=text_embeds.dtype,
+ device=text_embeds.device,
+ )
+
+ merged_embeds[is_text] = text_embeds
+
+ return _merge_multimodal_embeddings(
+ merged_embeds,
+ is_multimodal,
+ multimodal_embeds,
+ )
+
+
+def merge_multimodal_embeddings(
+ input_ids: torch.Tensor,
+ inputs_embeds: torch.Tensor,
+ multimodal_embeddings: NestedTensors,
+ placeholder_token_id: int,
+) -> torch.Tensor:
+ """
+ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the
+ positions in ``inputs_embeds`` corresponding to placeholder tokens in
+ ``input_ids``.
+
+ Note:
+ This updates ``inputs_embeds`` in place.
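+
+ A minimal usage sketch (``vision_embeddings`` and ``image_token_index``
+ stand in for model-specific values, as in the LLaVA-family callers of
+ this helper):
+
+ inputs_embeds = merge_multimodal_embeddings(
+ input_ids, inputs_embeds, vision_embeddings,
+ placeholder_token_id=image_token_index)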
+ """ + return _merge_multimodal_embeddings( + inputs_embeds, + (input_ids == placeholder_token_id), + multimodal_embeddings, + ) + + class LayerFn(Protocol): def __call__(self, prefix: str) -> torch.nn.Module: From 51c24c9736b1dbe65cb203deb9e56d4037eb1ec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Wed, 23 Oct 2024 00:43:07 -0400 Subject: [PATCH 10/15] [Build] Fix `FetchContent` multiple build issue (#9596) Signed-off-by: luka --- CMakeLists.txt | 10 ++++++---- setup.py | 8 ++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a53a8575d01ca..d1956f3d409b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,12 +169,12 @@ endif() # # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process. -# Configure it to place files in vllm/.deps, in order to play nicely with sccache. +# setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache. +# Each dependency that produces build artifacts should override its BINARY_DIR to avoid +# conflicts between build types. It should instead be set to ${CMAKE_BINARY_DIR}/. # include(FetchContent) -get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) -file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}") -set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps") +file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") # @@ -509,6 +509,8 @@ else() GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git GIT_TAG 013f0c4fc47e6574060879d9734c1df8c5c273bd GIT_PROGRESS TRUE + # Don't share the vllm-flash-attn build between build types + BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn ) endif() diff --git a/setup.py b/setup.py index d1f4b7f1c1119..8abeb0ba739db 100644 --- a/setup.py +++ b/setup.py @@ -157,6 +157,14 @@ def configure(self, ext: CMakeExtension) -> None: # on subsequent calls to python. cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))] + # Override the base directory for FetchContent downloads to $ROOT/.deps + # This allows sharing dependencies between profiles, + # and plays more nicely with sccache. + # To override this, set the FETCHCONTENT_BASE_DIR environment variable. 
+ fc_base_dir = os.path.join(ROOT_DIR, ".deps") + fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir) + cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)] + # # Setup parallelism and build tool # From 2394962d7083f1c1001dba9efefadb674321e688 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Wed, 23 Oct 2024 16:28:21 +0800 Subject: [PATCH 11/15] [Hardware][XPU] using current_platform.is_xpu (#9605) --- vllm/attention/selector.py | 6 +++--- vllm/config.py | 4 ++-- vllm/executor/ray_utils.py | 4 ++-- vllm/model_executor/custom_op.py | 4 ++-- vllm/utils.py | 29 +++-------------------------- vllm/worker/xpu_worker.py | 7 ++++--- 6 files changed, 16 insertions(+), 38 deletions(-) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 714c4f7fdb4e5..cd3c642b8c8a2 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -10,7 +10,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino, is_xpu +from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino logger = init_logger(__name__) @@ -136,7 +136,7 @@ def get_attn_backend( from vllm.attention.backends.openvino import OpenVINOAttentionBackend return OpenVINOAttentionBackend elif backend == _Backend.IPEX: - assert is_xpu(), RuntimeError( + assert current_platform.is_xpu(), RuntimeError( "IPEX attention backend is only used for the XPU device.") logger.info("Using IPEX attention backend.") from vllm.attention.backends.ipex_attn import IpexAttnBackend @@ -198,7 +198,7 @@ def which_attn_to_use( logger.info("Cannot use %s backend on OpenVINO.", selected_backend) return _Backend.OPENVINO - if is_xpu(): + if current_platform.is_xpu(): if selected_backend != _Backend.IPEX: logger.info("Cannot use %s backend on XPU.", selected_backend) return _Backend.IPEX diff --git a/vllm/config.py b/vllm/config.py index 12935e77c2aa7..c569789c650ab 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -17,7 +17,7 @@ get_hf_image_processor_config, get_hf_text_config) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - is_hip, is_openvino, is_xpu, print_warning_once) + is_hip, is_openvino, print_warning_once) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -1121,7 +1121,7 @@ def __init__(self, device: str = "auto") -> None: self.device_type = "tpu" elif current_platform.is_cpu(): self.device_type = "cpu" - elif is_xpu(): + elif current_platform.is_xpu(): self.device_type = "xpu" else: raise RuntimeError("Failed to infer device type") diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7e46acefc5b0e..0af7b3386d895 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.utils import get_ip, is_hip, is_xpu +from vllm.utils import get_ip, is_hip from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -231,7 +231,7 @@ def initialize_ray_cluster( assert_ray_available() # Connect to a ray cluster. 
- if is_hip() or is_xpu(): + if is_hip() or current_platform.is_xpu(): ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index d7506d268e73b..71eed6eb68d78 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -7,7 +7,7 @@ from vllm.compilation.levels import CompilationLevel from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import is_hip, is_xpu, print_warning_once +from vllm.utils import is_hip, print_warning_once logger = init_logger(__name__) @@ -78,7 +78,7 @@ def dispatch_forward(self): return self.forward_cpu elif current_platform.is_tpu(): return self.forward_tpu - elif is_xpu(): + elif current_platform.is_xpu(): return self.forward_xpu else: return self.forward_cuda diff --git a/vllm/utils.py b/vllm/utils.py index 797c1bcfd5342..0e9b241b6f9f6 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -327,29 +327,6 @@ def is_openvino() -> bool: return False -@lru_cache(maxsize=None) -def is_xpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - is_xpu_flag = "xpu" in version("vllm") - except PackageNotFoundError: - return False - # vllm is not build with xpu - if not is_xpu_flag: - return False - try: - import intel_extension_for_pytorch as ipex # noqa: F401 - _import_ipex = True - except ImportError as e: - logger.warning("Import Error for IPEX: %s", e.msg) - _import_ipex = False - # ipex dependency is not ready - if not _import_ipex: - logger.warning("not found ipex lib") - return False - return hasattr(torch, "xpu") and torch.xpu.is_available() - - @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -379,7 +356,7 @@ def seed_everything(seed: int) -> None: if current_platform.is_cuda_alike(): torch.cuda.manual_seed_all(seed) - if is_xpu(): + if current_platform.is_xpu(): torch.xpu.manual_seed_all(seed) @@ -774,7 +751,7 @@ def is_pin_memory_available() -> bool: print_warning_once("Using 'pin_memory=False' as WSL is detected. 
" "This may slow down the performance.") return False - elif is_xpu(): + elif current_platform.is_xpu(): print_warning_once("Pin memory is not supported on XPU.") return False elif current_platform.is_neuron(): @@ -795,7 +772,7 @@ def current_memory_usage(self) -> float: if current_platform.is_cuda_alike(): torch.cuda.reset_peak_memory_stats(self.device) mem = torch.cuda.max_memory_allocated(self.device) - elif is_xpu(): + elif current_platform.is_xpu(): torch.xpu.reset_peak_memory_stats(self.device) # type: ignore mem = torch.xpu.max_memory_allocated(self.device) # type: ignore return mem diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 9ad070d042a3d..917866f2d985b 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -17,7 +17,7 @@ from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor import set_random_seed -from vllm.utils import is_xpu +from vllm.platforms import current_platform from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker from vllm.worker.worker_base import LoraNotSupportedWorkerBase @@ -53,7 +53,7 @@ def __init__( observability_config: Optional[ObservabilityConfig] = None, ) -> None: assert device_config.device_type == "xpu" - assert is_xpu() + assert current_platform.is_xpu() self.model_config = model_config self.parallel_config = parallel_config @@ -91,7 +91,8 @@ def __init__( self.gpu_cache: Optional[List[List[torch.Tensor]]] def init_device(self) -> None: - if self.device_config.device.type == "xpu" and is_xpu(): + if self.device_config.device.type == "xpu" and current_platform.is_xpu( + ): self.device = torch.device(f"xpu:{self.local_rank}") torch.xpu.set_device(self.device) torch.xpu.empty_cache() From 3ff57ebfcacdd4f7690ed8f5693657de2bdedea8 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Wed, 23 Oct 2024 18:42:47 +0800 Subject: [PATCH 12/15] [Model] Initialize Florence-2 language backbone support (#9555) --- examples/florence2_inference.py | 44 +++ tests/conftest.py | 28 +- .../vision_language/test_florence2.py | 102 +++++++ vllm/model_executor/models/florence2.py | 261 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 5 files changed, 428 insertions(+), 8 deletions(-) create mode 100644 examples/florence2_inference.py create mode 100644 tests/models/encoder_decoder/vision_language/test_florence2.py create mode 100644 vllm/model_executor/models/florence2.py diff --git a/examples/florence2_inference.py b/examples/florence2_inference.py new file mode 100644 index 0000000000000..b58ac2e1f7ed4 --- /dev/null +++ b/examples/florence2_inference.py @@ -0,0 +1,44 @@ +''' +Demonstrate prompting of text-to-text +encoder/decoder models, specifically Florence-2 +''' +# TODO(Isotr0py): +# Move to offline_inference_vision_language.py after porting vision backbone +from vllm import LLM, SamplingParams + +dtype = "float" + +# Create a Florence-2 encoder/decoder model instance +llm = LLM( + model="microsoft/Florence-2-base", + tokenizer="facebook/bart-base", + dtype=dtype, + trust_remote_code=True, +) + +prompts = [ + "", "", "", + "", "", "", + "", "", "" +] +# Create a sampling params object. +sampling_params = SamplingParams( + temperature=0, + top_p=1.0, + min_tokens=0, + max_tokens=20, +) + +# Generate output tokens from the prompts. The output is a list of +# RequestOutput objects that contain the prompt, generated +# text, and other information. 
+outputs = llm.generate(prompts, sampling_params)
+
+# Print the outputs.
+for output in outputs:
+ prompt = output.prompt
+ encoder_prompt = output.encoder_prompt
+ generated_text = output.outputs[0].text
+ print(f"Encoder prompt: {encoder_prompt!r}, "
+ f"Decoder prompt: {prompt!r}, "
+ f"Generated text: {generated_text!r}")
diff --git a/tests/conftest.py b/tests/conftest.py
index 76f581e0363f7..b11bbcb4ab7d1 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -253,7 +253,9 @@ def __init__(
 dtype: str = "half",
 *,
 model_kwargs: Optional[Dict[str, Any]] = None,
+ is_embedding_model: bool = False,
 is_sentence_transformer: bool = False,
+ skip_tokenizer_init: bool = False,
 auto_cls: Type[_BaseAutoModelClass] = AutoModelForCausalLM,
 postprocess_inputs: Callable[[BatchEncoding],
 BatchEncoding] = identity,
@@ -281,11 +283,12 @@ def __init__(
 **model_kwargs,
 ))
- self.tokenizer = AutoTokenizer.from_pretrained(
- model_name,
- torch_dtype=torch_dtype,
- trust_remote_code=True,
- )
+ if not skip_tokenizer_init:
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ model_name,
+ torch_dtype=torch_dtype,
+ trust_remote_code=True,
+ )
 # don't put this import at the top level
 # it will call torch.cuda.device_count()
@@ -295,6 +298,8 @@ def __init__(
 torch_dtype=torch_dtype,
 trust_remote_code=True,
 )
+ if skip_tokenizer_init:
+ self.tokenizer = self.processor.tokenizer
 self.postprocess_inputs = postprocess_inputs
@@ -535,6 +540,7 @@ def generate_encoder_decoder_greedy_logprobs_limit(
 encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]],
 max_tokens: int,
 num_logprobs: int,
+ images: Optional[PromptImageInput] = None,
 **kwargs: Any,
 ) -> List[TokensTextLogprobs]:
 '''
@@ -545,11 +551,17 @@
 all_output_ids: List[List[int]] = []
 all_output_strs: List[str] = []
- for (encoder_prompt,
- decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts):
+ for i, (encoder_prompt, decoder_prompt) in enumerate(
+ to_enc_dec_tuple_list(encoder_decoder_prompts)):
+ processor_kwargs: Dict[str, Any] = {
+ "text": encoder_prompt,
+ "return_tensors": "pt",
+ }
+ if images is not None and images[i] is not None:
+ processor_kwargs["images"] = images[i]
 encoder_input_ids = self.wrap_device(
- self.tokenizer(encoder_prompt, return_tensors="pt").input_ids,
+ self.processor(**processor_kwargs).input_ids,
 device=self.model.device.type,
 )
diff --git a/tests/models/encoder_decoder/vision_language/test_florence2.py b/tests/models/encoder_decoder/vision_language/test_florence2.py
new file mode 100644
index 0000000000000..483773f069133
--- /dev/null
+++ b/tests/models/encoder_decoder/vision_language/test_florence2.py
@@ -0,0 +1,102 @@
+from functools import partial
+from typing import List, Optional, Tuple, Type
+
+import pytest
+from PIL import Image
+
+from vllm.inputs.data import ExplicitEncoderDecoderPrompt
+from vllm.sequence import SampleLogprobs
+
+from ....conftest import HfRunner, VllmRunner
+from ...utils import check_logprobs_close
+
+Florence2Prompt = partial(ExplicitEncoderDecoderPrompt,
+ decoder_prompt=None,
+ mm_processor_kwargs=None)
+
+MODELS = ["microsoft/Florence-2-base"]
+# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
+# Therefore, we borrow the BartTokenizer from the original Bart model
+TOKENIZER = "facebook/bart-base"
+PROMPTS = [
+ Florence2Prompt(encoder_prompt="<CAPTION>"),
+ Florence2Prompt(encoder_prompt="<DETAILED_CAPTION>"),
+ Florence2Prompt(encoder_prompt="<MORE_DETAILED_CAPTION>"),
+ Florence2Prompt(encoder_prompt="<CAPTION_TO_PHRASE_GROUNDING>"),
+ Florence2Prompt(encoder_prompt="<OD>"),
+ Florence2Prompt(encoder_prompt="<DENSE_REGION_CAPTION>"),
+ Florence2Prompt(encoder_prompt="<REGION_PROPOSAL>"),
+ Florence2Prompt(encoder_prompt="<OCR>"),
+ Florence2Prompt(encoder_prompt="<OCR_WITH_REGION>"),
+]
+
+
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+ Optional[SampleLogprobs]], ):
+ """Sanitize vllm output to be comparable with hf output."""
+ output_ids, output_str, out_logprobs = vllm_output
+
+ hf_output_str = "</s><s>" + output_str + "</s>"
+
+ return output_ids, hf_output_str, out_logprobs
+
+
+def run_test(
+ hf_runner: Type[HfRunner],
+ vllm_runner: Type[VllmRunner],
+ prompts: List[ExplicitEncoderDecoderPrompt],
+ model: str,
+ *,
+ dtype: str,
+ max_tokens: int,
+ num_logprobs: int,
+ tensor_parallel_size: int,
+ distributed_executor_backend: Optional[str] = None,
+) -> None:
+ with vllm_runner(model,
+ tokenizer_name=TOKENIZER,
+ dtype=dtype,
+ tensor_parallel_size=tensor_parallel_size,
+ distributed_executor_backend=distributed_executor_backend,
+ enforce_eager=True) as vllm_model:
+ vllm_outputs = vllm_model.generate_encoder_decoder_greedy_logprobs(
+ prompts, max_tokens, num_logprobs)
+
+ # Florence-2 processors require image inputs
+ dummy_image = Image.new(mode="RGB", size=(2, 2))
+ with hf_runner(model, dtype=dtype, skip_tokenizer_init=True) as hf_model:
+ hf_model.model.get_output_embeddings = lambda: \
+ hf_model.model.language_model.lm_head
+ hf_outputs = (hf_model.generate_encoder_decoder_greedy_logprobs_limit(
+ prompts,
+ max_tokens,
+ num_logprobs,
+ images=[dummy_image] * len(prompts),
+ ))
+
+ check_logprobs_close(
+ outputs_0_lst=hf_outputs,
+ outputs_1_lst=[
+ vllm_to_hf_output(vllm_output) for vllm_output in vllm_outputs
+ ],
+ name_0="hf",
+ name_1="vllm",
+ )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(hf_runner, vllm_runner, model, dtype, max_tokens,
+ num_logprobs) -> None:
+ run_test(
+ hf_runner,
+ vllm_runner,
+ PROMPTS,
+ model,
+ dtype=dtype,
+ max_tokens=max_tokens,
+ num_logprobs=num_logprobs,
+ tensor_parallel_size=1,
+ )
diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py
new file mode 100644
index 0000000000000..6840ac8b9e303
--- /dev/null
+++ b/vllm/model_executor/models/florence2.py
@@ -0,0 +1,261 @@
+import math
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+ QuantizationConfig)
+from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.bart import (BartDecoder, BartEncoder,
+ BartParallelLMHead,
+ BartScaledWordEmbedding)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+
+from .utils import AutoWeightsLoader
+
+
+class Florence2LanguageModel(nn.Module):
+
+ def __init__(self,
+ config: PretrainedConfig,
+ cache_config: Optional[CacheConfig] = None,
+ quant_config: Optional[QuantizationConfig] = None):
+ super().__init__()
+ self.config = config
+
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.shared =
BartScaledWordEmbedding(self.vocab_size, config.d_model) + self.encoder = BartEncoder(config, + cache_config=cache_config, + quant_config=quant_config) + self.decoder = BartDecoder(config, + cache_config=cache_config, + quant_config=quant_config) + + if self.config.tie_word_embeddings: + self.encoder.embed_tokens.weight = self.shared.weight + self.decoder.embed_tokens.weight = self.shared.weight + + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata) -> torch.Tensor: + r""" + Args: + input_ids + Indices of *decoder* input sequence tokens in the vocabulary. + Padding will be ignored by default should you + provide it. + positions + Positions of *decoder* input sequence tokens. + encoder_input_ids + Indices of *encoder* input sequence tokens in the vocabulary. + encoder_positions: + Positions of *encoder* input sequence tokens. + kv_caches: + Layer-wise list of KV cache tensors + attn_metadata: + vLLM Attention metadata structure + Returns: + Model output torch.Tensor + """ + + encoder_hidden_states = None + + if encoder_input_ids.numel() > 0: + # Run encoder attention if a non-zero number of encoder tokens + # are provided as input + encoder_hidden_states = self.encoder(input_ids=encoder_input_ids, + positions=encoder_positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata) + + # decoder outputs consists of + # (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + decoder_input_ids=input_ids, + decoder_positions=positions, + encoder_hidden_states=encoder_hidden_states, + kv_caches=kv_caches, + attn_metadata=attn_metadata) + + return decoder_outputs + + +class Florence2LanguageForConditionalGeneration(nn.Module): + + def __init__(self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + self.config = config + self.model = Florence2LanguageModel(config, + cache_config=cache_config, + quant_config=quant_config) + embed_scale = math.sqrt( + config.d_model) if config.scale_embedding else 1.0 + + self.vocab_size = config.vocab_size + self.lm_head = BartParallelLMHead(self.vocab_size, + config.d_model, + embed_scale=embed_scale) + + self.logits_processor = LogitsProcessor(self.vocab_size, + config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + **kwargs, + ) -> torch.Tensor: + r""" + Args: + input_ids + torch.Tensor of *decoder* input token ids. + positions + torch.Tensor of *decoder* position indices. + encoder_input_ids + torch.Tensor of *encoder* input token ids. 
+ encoder_positions + torch.Tensor of *encoder* position indices + kv_caches: + Layer-wise list of KV cache tensors + attn_metadata: + vLLM Attention metadata structure + Returns: + Output torch.Tensor + """ + return self.model(input_ids, positions, encoder_input_ids, + encoder_positions, kv_caches, attn_metadata) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample(self, logits: torch.Tensor, + sampling_metadata: SamplingMetadata) -> SamplerOutput: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "final_logits_bias" in name: + continue + if self.config.tie_word_embeddings and "embed_tokens" in name: + continue + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + + +class Florence2ForConditionalGeneration(nn.Module): + + def __init__(self, + config: PretrainedConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): + super().__init__() + + # TODO(Isotr0py): Add vision backbone + self.language_model = Florence2LanguageForConditionalGeneration( + config=config.text_config, + cache_config=cache_config, + quant_config=quant_config) + + @property + def sampler(self): + return self.language_model.sampler + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + *, + encoder_input_ids: torch.Tensor, + encoder_positions: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + r""" + Args: + input_ids + torch.Tensor of *decoder* input token ids. + positions + torch.Tensor of *decoder* position indices. + encoder_input_ids + torch.Tensor of *encoder* input token ids. 
+ encoder_positions + torch.Tensor of *encoder* position indices + kv_caches: + Layer-wise list of KV cache tensors + attn_metadata: + vLLM Attention metadata structure + Returns: + Output torch.Tensor + """ + return self.language_model(input_ids, positions, encoder_input_ids, + encoder_positions, kv_caches, attn_metadata) + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[torch.Tensor]: + return self.language_model.compute_logits(hidden_states, + sampling_metadata) + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> SamplerOutput: + return self.language_model.sample(logits, sampling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + skip_prefixes = [ + 'image_projection', "vision_tower", "image_proj_norm", + "image_pos_embed", "visual_temporal_embed" + ] + loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) + loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index a255b2a2f3982..787c65743e894 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -85,6 +85,7 @@ # [Encoder-decoder] "BartModel": ("bart", "BartForConditionalGeneration"), "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), + "Florence2ForConditionalGeneration": ("florence2", "Florence2ForConditionalGeneration"), # noqa: E501 } _EMBEDDING_MODELS = { From c18e1a34189812af21aa504f9166de5ed4a86675 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 23 Oct 2024 19:27:37 +0800 Subject: [PATCH 13/15] [VLM] Enable overriding whether post layernorm is used in vision encoder + fix quant args (#9217) Co-authored-by: Isotr0py <2037008807@qq.com> --- .../model_executor/layers/quantization/awq.py | 20 ++- vllm/model_executor/models/blip.py | 87 +++++++++---- vllm/model_executor/models/blip2.py | 2 +- vllm/model_executor/models/clip.py | 104 ++++++++++----- .../models/idefics2_vision_model.py | 51 ++++++-- vllm/model_executor/models/intern_vit.py | 41 ++++-- vllm/model_executor/models/internvl.py | 41 +++++- vllm/model_executor/models/llava.py | 32 ++++- vllm/model_executor/models/llava_next.py | 30 +---- .../model_executor/models/llava_next_video.py | 29 +---- vllm/model_executor/models/llava_onevision.py | 29 +---- vllm/model_executor/models/minicpmv.py | 33 +++-- vllm/model_executor/models/mllama.py | 120 +++++++++++++----- vllm/model_executor/models/nvlm_d.py | 5 + vllm/model_executor/models/paligemma.py | 3 +- vllm/model_executor/models/phi3v.py | 15 ++- vllm/model_executor/models/pixtral.py | 90 +++++++++++-- vllm/model_executor/models/siglip.py | 72 ++++++++--- 18 files changed, 551 insertions(+), 253 deletions(-) diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 410b3cb5321cb..38dd1f2e10fcd 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -3,7 +3,8 @@ import torch from vllm import _custom_ops as ops -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.parameter import (GroupQuantScaleParameter, @@ -21,10 +22,12 @@ def __init__( weight_bits: int, group_size: int, zero_point: bool, 
+ modules_to_not_convert: Optional[List[str]] = None, ) -> None: self.weight_bits = weight_bits self.group_size = group_size self.zero_point = zero_point + self.modules_to_not_convert = modules_to_not_convert or [] if self.weight_bits != 4: raise ValueError( @@ -35,7 +38,8 @@ def __init__( def __repr__(self) -> str: return (f"AWQConfig(weight_bits={self.weight_bits}, " f"group_size={self.group_size}, " - f"zero_point={self.zero_point})") + f"zero_point={self.zero_point}, " + f"modules_to_not_convert={self.modules_to_not_convert})") def get_name(self) -> str: return "awq" @@ -61,11 +65,15 @@ def from_config(cls, config: Dict[str, Any]) -> "AWQConfig": weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) zero_point = cls.get_from_keys(config, ["zero_point"]) - return cls(weight_bits, group_size, zero_point) + modules_to_not_convert = cls.get_from_keys_or( + config, ["modules_to_not_convert"], None) + return cls(weight_bits, group_size, zero_point, modules_to_not_convert) def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["AWQLinearMethod"]: + prefix: str) -> Optional["LinearMethodBase"]: if isinstance(layer, LinearBase): + if is_layer_skipped_awq(prefix, self.modules_to_not_convert): + return UnquantizedLinearMethod() return AWQLinearMethod(self) return None @@ -73,6 +81,10 @@ def get_scaled_act_names(self) -> List[str]: return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] +def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): + return any(module_name in prefix for module_name in modules_to_not_convert) + + class AWQLinearMethod(LinearMethodBase): """Linear method for AWQ. diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 778162dd63ca6..1f2d7384076ed 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -122,7 +122,7 @@ def input_processor_for_blip( # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/blip/modeling_blip.py#L164 # noqa class BlipVisionEmbeddings(nn.Module): - def __init__(self, config: BlipVisionConfig): + def __init__(self, config: Union[BlipVisionConfig, Blip2VisionConfig]): super().__init__() self.config = config @@ -167,9 +167,10 @@ class BlipParallelAttention(nn.Module): def __init__( self, - config: BlipVisionConfig, + config: Union[BlipVisionConfig, Blip2VisionConfig], quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -189,11 +190,13 @@ def __init__( self.num_heads, bias=config.qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv", ) self.projection = RowParallelLinear( self.embed_dim, self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.projection", ) self.tp_size = get_tensor_model_parallel_world_size() @@ -235,9 +238,12 @@ def forward( class BlipMLP(nn.Module): - def __init__(self, - config: BlipVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.config = config @@ -246,11 +252,13 @@ def __init__(self, self.fc1 = ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.fc2 = 
RowParallelLinear(config.intermediate_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -262,24 +270,32 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class BlipEncoderLayer(nn.Module): - def __init__(self, - config: BlipVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() # fallback to sdpa attention if tp unavailable num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = BlipParallelAttention(config, - quant_config=quant_config) + self.self_attn = BlipParallelAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) else: # Blip doesn't have SDPA attention implemented in transformers # use eager attention instead for cpu backend self.self_attn = BlipAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = BlipMLP(config, quant_config=quant_config) + self.mlp = BlipMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -307,10 +323,13 @@ class BlipEncoder(nn.Module): config: BlipConfig """ - def __init__(self, - config: BlipVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None, + prefix: str = "", + ) -> None: super().__init__() self.config = config @@ -321,8 +340,10 @@ def __init__(self, num_hidden_layers = num_hidden_layers_override self.layers = nn.ModuleList([ - BlipEncoderLayer(config=config, quant_config=quant_config) - for _ in range(num_hidden_layers) + BlipEncoderLayer(config=config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -337,10 +358,15 @@ class BlipVisionModel(nn.Module): config_class = BlipVisionConfig main_input_name = "pixel_values" - def __init__(self, - config: BlipVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: BlipVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() tp_size = get_tensor_model_parallel_world_size() @@ -354,19 +380,24 @@ def __init__(self, config=config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.encoder", ) + num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: raise ValueError( - f"The original encoder only has {config.num_hidden_layers} " + f"The original encoder only has {num_hidden_layers} " f"layers, but you requested {len(self.encoder.layers)} layers." 
) - elif len(self.encoder.layers) == config.num_hidden_layers: + + # If possible, skip post_layernorm to conserve memory + if require_post_norm is None: + require_post_norm = len(self.encoder.layers) == num_hidden_layers + + if require_post_norm: self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) else: - # post_layernorm is unused when we extract intermediate features - # In this case, we can skip it to conserve memory self.post_layernorm = None def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index d6fe7d150336a..cd2013e91514d 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -490,7 +490,7 @@ def __init__(self, self.multimodal_config = multimodal_config # TODO: Optionally initializes this for supporting embeddings. - self.vision_model = BlipVisionModel(config.vision_config) + self.vision_model = BlipVisionModel(config.vision_config, quant_config) self.query_tokens = nn.Parameter( torch.zeros(1, config.num_query_tokens, diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 7b0981d611b25..6b45cb384d4a0 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -192,6 +192,7 @@ def __init__( self, config: CLIPVisionConfig, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -211,12 +212,14 @@ def __init__( head_size=self.head_dim, total_num_heads=self.num_heads, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( input_size=self.embed_dim, output_size=self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.tp_size = get_tensor_model_parallel_world_size() @@ -259,20 +262,25 @@ def forward( class CLIPMLP(nn.Module): - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) self.fc1 = ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.fc2 = RowParallelLinear(config.intermediate_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -284,21 +292,29 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class CLIPEncoderLayer(nn.Module): - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = CLIPParallelAttention(config, - quant_config=quant_config) + self.self_attn = CLIPParallelAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) else: self.self_attn = CLIPSdpaAttention(config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = 
CLIPMLP(config, quant_config=quant_config) + self.mlp = CLIPMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.layer_norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -327,11 +343,15 @@ class CLIPEncoder(nn.Module): config: CLIPConfig """ - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + num_hidden_layers_override: Optional[int] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config if num_hidden_layers_override is None: @@ -339,8 +359,10 @@ def __init__(self, else: num_hidden_layers = num_hidden_layers_override self.layers = nn.ModuleList([ - CLIPEncoderLayer(config=config, quant_config=quant_config) - for _ in range(num_hidden_layers) + CLIPEncoderLayer(config=config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -354,11 +376,17 @@ def forward(self, inputs_embeds: torch.Tensor): class CLIPVisionTransformer(nn.Module): - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config embed_dim = config.hidden_size @@ -370,19 +398,25 @@ def __init__(self, self.encoder = CLIPEncoder( config=config, quant_config=quant_config, - num_hidden_layers_override=num_hidden_layers_override) + num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.encoder", + ) + num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: raise ValueError( - f"The original encoder only has {config.num_hidden_layers} " + f"The original encoder only has {num_hidden_layers} " f"layers, but you requested {len(self.encoder.layers)} layers." 
) - elif len(self.encoder.layers) == config.num_hidden_layers: + + # If possible, skip post_layernorm to conserve memory + if require_post_norm is None: + require_post_norm = len(self.encoder.layers) == num_hidden_layers + + if require_post_norm: self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) else: - # post_layernorm is unused when we extract intermediate features - # In this case, we can skip it to conserve memory self.post_layernorm = None def forward( @@ -405,10 +439,15 @@ class CLIPVisionModel(nn.Module): config_class = CLIPVisionConfig main_input_name = "pixel_values" - def __init__(self, - config: CLIPVisionConfig, - quant_config: Optional[QuantizationConfig] = None, - num_hidden_layers_override: Optional[int] = None): + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() tp_size = get_tensor_model_parallel_world_size() @@ -418,7 +457,10 @@ def __init__(self, self.vision_model = CLIPVisionTransformer( config=config, quant_config=quant_config, - num_hidden_layers_override=num_hidden_layers_override) + num_hidden_layers_override=num_hidden_layers_override, + require_post_norm=require_post_norm, + prefix=f"{prefix}.vision_model", + ) def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return self.vision_model(pixel_values) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 3b0b6febaa48c..43f4f29814e6d 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -113,7 +113,8 @@ def __init__( self, config: Idefics2Config, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -130,12 +131,14 @@ def __init__( self.head_dim, self.num_heads, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( self.embed_dim, self.embed_dim, bias=True, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) @@ -178,7 +181,8 @@ def __init__( self, config: Idefics2Config, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config self.activation_fn = get_act_fn(config.hidden_act) @@ -187,12 +191,14 @@ def __init__( config.intermediate_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.fc1", ) self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.fc2", ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -204,13 +210,22 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: class Idefics2EncoderLayer(nn.Module): - def __init__(self, config: Idefics2Config): + def __init__( + self, + config: Idefics2Config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() self.embed_dim = config.hidden_size - self.self_attn = Idefics2VisionAttention(config) + self.self_attn = Idefics2VisionAttention(config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn") self.layer_norm1 = nn.LayerNorm(self.embed_dim, 
eps=config.layer_norm_eps) - self.mlp = Idefics2VisionMLP(config) + self.mlp = Idefics2VisionMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -245,12 +260,20 @@ class Idefics2Encoder(nn.Module): config: Idefics2Config """ - def __init__(self, config: Idefics2Config): + def __init__( + self, + config: Idefics2Config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.layers = nn.ModuleList([ - Idefics2EncoderLayer(config) - for _ in range(config.num_hidden_layers) + Idefics2EncoderLayer(config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(config.num_hidden_layers) ]) def forward( @@ -275,12 +298,20 @@ def forward( class Idefics2VisionTransformer(nn.Module): - def __init__(self, config: Idefics2VisionConfig): + def __init__( + self, + config: Idefics2VisionConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() + embed_dim = config.hidden_size self.config = config self.embeddings = Idefics2VisionEmbeddings(config) - self.encoder = Idefics2Encoder(config) + self.encoder = Idefics2Encoder(config, + quant_config=quant_config, + prefix=f"{prefix}.encoder") self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index b59671e914e7d..9761635d2a6c2 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -137,6 +137,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, *, num_dummy_heads: int = 0, + prefix: str = "", ) -> None: super().__init__() @@ -165,6 +166,7 @@ def __init__( num_dummy_heads + self.num_heads, bias=config.qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv", ) self.qk_normalization = config.qk_normalization @@ -181,6 +183,7 @@ def __init__( self.dummy_dim, self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.proj", ) def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor): @@ -284,20 +287,26 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class InternMLP(nn.Module): - def __init__(self, - config: PretrainedConfig, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.activation_fn = get_act_fn(config.hidden_act) self.fc1 = ColumnParallelLinear(config.hidden_size, config.intermediate_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc1") self.fc2 = RowParallelLinear(config.intermediate_size, config.hidden_size, bias=True, - quant_config=quant_config) + quant_config=quant_config, + prefix=f"{prefix}.fc2") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc1(hidden_states) @@ -315,6 +324,7 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, *, num_dummy_heads: int = 0, + prefix: str = "", ) -> None: super().__init__() @@ -324,9 +334,12 @@ def __init__( self.attn = self._init_attn(config, quant_config, - num_dummy_heads=num_dummy_heads) + num_dummy_heads=num_dummy_heads, + prefix=f"{prefix}.attn") - self.mlp = InternMLP(config, quant_config=quant_config) + self.mlp = InternMLP(config, + quant_config=quant_config, + 
prefix=f"{prefix}.mlp") self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) self.norm2 = NORM2FN[self.norm_type](self.embed_dim, @@ -343,6 +356,7 @@ def _init_attn( quant_config: Optional[QuantizationConfig], *, num_dummy_heads: int, + prefix: str = "", ): # fallback to sdpa attention if tp unavailable tp_size = get_tensor_model_parallel_world_size() @@ -351,7 +365,8 @@ def _init_attn( if USE_XFORMERS_OPS and (num_heads + num_dummy_heads) % tp_size == 0: return InternParallelAttention(config, quant_config=quant_config, - num_dummy_heads=num_dummy_heads) + num_dummy_heads=num_dummy_heads, + prefix=prefix) return InternSdpaAttention(config, num_dummy_heads=num_dummy_heads) @@ -377,6 +392,7 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, + prefix: str = "", ): super().__init__() @@ -390,8 +406,9 @@ def __init__( self.layers = nn.ModuleList([ InternVisionEncoderLayer(config, quant_config, - num_dummy_heads=num_dummy_heads) - for _ in range(num_hidden_layers) + num_dummy_heads=num_dummy_heads, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward(self, inputs_embeds: torch.Tensor): @@ -412,7 +429,8 @@ def __init__( *, num_hidden_layers_override: Optional[int] = None, num_dummy_heads: int = 0, - ): + prefix: str = "", + ) -> None: super().__init__() self.config = config @@ -423,6 +441,7 @@ def __init__( quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, num_dummy_heads=num_dummy_heads, + prefix=f"{prefix}.encoder", ) def get_input_embeddings(self): diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index a80e00e34957c..3ae37d9fe5d85 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -19,7 +19,8 @@ from vllm.config import CacheConfig, MultiModalConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, token_inputs) -from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization import (AWQConfig, + QuantizationConfig) from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) @@ -418,11 +419,11 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config + self._patch_quant_config(config, quant_config) image_size = config.force_image_size or config.vision_config.image_size patch_size = config.vision_config.patch_size self.patch_size = patch_size - self.select_layer = config.select_layer self.num_image_token = int( (image_size // patch_size)**2 * (config.downsample_ratio**2)) self.downsample_ratio = config.downsample_ratio @@ -430,7 +431,12 @@ def __init__(self, self.llm_arch_name = config.text_config.architectures[0] self.is_mono = self.llm_arch_name == 'InternLM2VEForCausalLM' - self.vision_model = self._init_vision_model(config, self.is_mono) + self.vision_model = self._init_vision_model( + config, + quant_config=quant_config, + is_mono=self.is_mono, + prefix="vision_model", + ) self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) @@ -441,6 +447,18 @@ def __init__(self, self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) + def _patch_quant_config(self, config: PretrainedConfig, + quant_config: QuantizationConfig): + # the awq models from OpenGVLab missing 
`modules_to_not_convert` + # patch the quant_config to add `modules_to_not_convert` back + if isinstance(quant_config, AWQConfig): + text_config = config.text_config + llm_quant_config = getattr(text_config, "quantization_config", + None) + if (not quant_config.modules_to_not_convert) and \ + (llm_quant_config is not None): + quant_config.modules_to_not_convert.append("vision_model") + @cached_property def sampler(self): if hasattr(self.language_model, "sampler"): @@ -448,17 +466,28 @@ def sampler(self): return Sampler() - def _init_vision_model(self, config: PretrainedConfig, is_mono: bool): + def _init_vision_model( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + *, + is_mono: bool, + prefix: str, + ): if not is_mono: - vision_feature_layer = self.select_layer + vision_feature_layer = config.select_layer if vision_feature_layer < 0: num_hidden_layers = config.vision_config.num_hidden_layers \ + vision_feature_layer + 1 else: num_hidden_layers = vision_feature_layer + 1 + return InternVisionModel( config.vision_config, - num_hidden_layers_override=num_hidden_layers) + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + prefix=prefix, + ) else: return InternVisionPatchModel(config.vision_config) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index a666dcba290f2..83e869efa4712 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,12 +1,12 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, - TypedDict, Union) +from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, + Tuple, TypedDict, Union) import torch import torch.nn as nn from PIL import Image from transformers import (CLIPVisionConfig, LlavaConfig, PixtralVisionConfig, - SiglipVisionConfig) + PretrainedConfig, SiglipVisionConfig) from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig @@ -200,7 +200,17 @@ def input_processor_for_llava(ctx: InputContext, inputs: DecoderOnlyInputs): raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaConfig): +class LlavaLikeConfig(Protocol): + vision_config: PretrainedConfig + vision_feature_layer: int + + +def init_vision_tower_for_llava( + hf_config: LlavaLikeConfig, + quant_config: Optional[QuantizationConfig], + *, + require_post_norm: Optional[bool] = None, +): vision_config = hf_config.vision_config # Initialize the vision tower only up to the required feature layer @@ -214,16 +224,24 @@ def _init_vision_tower(hf_config: LlavaConfig): if isinstance(vision_config, CLIPVisionConfig): return CLIPVisionModel( vision_config, + quant_config, num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, ) elif isinstance(vision_config, SiglipVisionConfig): return SiglipVisionModel( vision_config, + quant_config, num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, ) elif isinstance(vision_config, PixtralVisionConfig): - # TODO: allow layer override? - return PixtralHFVisionModel(vision_config) + return PixtralHFVisionModel( + vision_config, + quant_config, + num_hidden_layers_override=num_hidden_layers, + require_post_norm=require_post_norm, + ) msg = f"Unsupported vision config: {type(vision_config)}" raise NotImplementedError(msg) @@ -255,7 +273,7 @@ def __init__(self, config.projector_hidden_act = "gelu" # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 46cba8ebbc583..d33d4ac5bfaed 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -26,7 +26,7 @@ dummy_seq_data_for_clip, get_clip_image_feature_size, get_clip_patch_grid_length, input_processor_for_clip) from .interfaces import SupportsMultiModal, SupportsPP -from .llava import LlavaMultiModalProjector +from .llava import LlavaMultiModalProjector, init_vision_tower_for_llava from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) @@ -259,32 +259,6 @@ def input_processor_for_llava_next(ctx: InputContext, raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaNextConfig): - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 - - if isinstance(vision_config, CLIPVisionConfig): - return CLIPVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return SiglipVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - @MULTIMODAL_REGISTRY.register_image_input_mapper() @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_next_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next) @@ -303,7 +277,7 @@ def __init__(self, self.multimodal_config = multimodal_config # TODO: Optionally initializes this for supporting embeddings. 
- self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) self.multi_modal_projector = LlavaMultiModalProjector( diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 4a354b616c2f6..d02cf9044dfc0 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -26,6 +26,7 @@ from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP +from .llava import init_vision_tower_for_llava from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip) from .utils import (AutoWeightsLoader, init_vllm_registered_model, @@ -179,32 +180,6 @@ def input_processor_for_llava_next_video(ctx: InputContext, raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaNextVideoConfig): - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 - - if isinstance(vision_config, CLIPVisionConfig): - return CLIPVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return SiglipVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - # adopted from transformers modeling_llava_next_video.py class LlavaNextVideoPooler(nn.Module): @@ -281,7 +256,7 @@ def __init__(self, self.multimodal_config = multimodal_config # Initialize the vision tower only up to the required feature layer - self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.vision_resampler = LlavaNextVideoPooler(config) self.multi_modal_projector = LlavaNextMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 5bd3055ca181a..10aa8049a2347 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -31,6 +31,7 @@ dummy_video_for_clip, get_clip_image_feature_size, get_clip_patch_grid_length, input_processor_for_clip) from .interfaces import SupportsMultiModal, SupportsPP +from .llava import init_vision_tower_for_llava from .siglip import (SiglipVisionModel, dummy_seq_data_for_siglip, dummy_video_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) @@ -357,32 +358,6 @@ def input_processor_for_llava_onevision(ctx: InputContext, raise NotImplementedError(msg) -def _init_vision_tower(hf_config: LlavaOnevisionConfig): - vision_config = hf_config.vision_config - - # Initialize the vision tower only up to the required feature layer - vision_feature_layer = hf_config.vision_feature_layer - if vision_feature_layer < 0: - num_hidden_layers = hf_config.vision_config.num_hidden_layers \ - + vision_feature_layer + 1 - else: - num_hidden_layers = vision_feature_layer + 1 - - if isinstance(vision_config, CLIPVisionConfig): - return 
CLIPVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - elif isinstance(vision_config, SiglipVisionConfig): - return SiglipVisionModel( - vision_config, - num_hidden_layers_override=num_hidden_layers, - ) - - msg = f"Unsupported vision config: {type(vision_config)}" - raise NotImplementedError(msg) - - class LlavaOnevisionMultiModalProjector(nn.Module): def __init__(self, config: LlavaOnevisionConfig): @@ -425,7 +400,7 @@ def __init__(self, self.multimodal_config = multimodal_config # Initialize the vision tower only up to the required feature layer - self.vision_tower = _init_vision_tower(config) + self.vision_tower = init_vision_tower_for_llava(config, quant_config) self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index ca7c2be5a038e..2ec51dc4647f5 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -395,7 +395,7 @@ def __init__( self.version = get_version_by_config(self.config) self.llm = self.init_llm(config, cache_config, quant_config) - self.vpm = self.init_vision_module() + self.vpm = self.init_vision_module(config, quant_config) param_dtype = torch.get_default_dtype() self.vpm.to(dtype=param_dtype) self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else @@ -647,7 +647,11 @@ def init_llm( ) -> nn.Module: raise NotImplementedError - def init_vision_module(self) -> nn.Module: + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: raise NotImplementedError def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: @@ -693,7 +697,11 @@ def init_llm( quant_config=quant_config), name="model") - def init_vision_module(self) -> nn.Module: + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: # TODO :refactor this vision model try: import timm @@ -817,8 +825,13 @@ def init_llm( quant_config=quant_config), name="model") - def init_vision_module(self) -> nn.Module: - model = Idefics2VisionTransformer(self.config.vision_config) + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model @@ -929,9 +942,13 @@ def init_llm( quant_config=quant_config), name="model") - def init_vision_module(self) -> nn.Module: - - model = Idefics2VisionTransformer(self.config.vision_config) + def init_vision_module( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + ) -> nn.Module: + model = Idefics2VisionTransformer(config.vision_config, + quant_config=quant_config) if self.config.drop_vision_last_layer: model.encoder.layers = model.encoder.layers[:-1] return model diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 378231f14455a..23e2b520e5b40 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -379,9 +379,13 @@ def forward( class MllamaVisionEncoderLayer(nn.Module): - def __init__(self, - config: config_mllama.MllamaVisionConfig, - is_gated: bool = False): + def __init__( + self, + config: 
config_mllama.MllamaVisionConfig, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + is_gated: bool = False, + ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -390,7 +394,9 @@ def __init__(self, self.intermediate_size = config.intermediate_size self.self_attn = MllamaVisionSdpaAttention(config) - self.mlp = CLIPMLP(config) + self.mlp = CLIPMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") self.input_layernorm = nn.LayerNorm(self.hidden_size, eps=config.norm_eps) @@ -427,16 +433,23 @@ def forward( class MllamaVisionEncoder(nn.Module): - def __init__(self, - config: config_mllama.MllamaVisionConfig, - num_layers=32, - is_gated=False, - output_hidden_states=None): + def __init__( + self, + config: config_mllama.MllamaVisionConfig, + quant_config: Optional[QuantizationConfig], + num_layers: int = 32, + is_gated: bool = False, + output_hidden_states=None, + prefix: str = "", + ) -> None: super().__init__() self.config = config self.layers = nn.ModuleList([ - MllamaVisionEncoderLayer(config, is_gated) - for _ in range(num_layers) + MllamaVisionEncoderLayer(config, + quant_config=quant_config, + is_gated=is_gated, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_layers) ]) self.output_hidden_states = output_hidden_states or [] @@ -463,8 +476,14 @@ def forward( class MllamaVisionModel(nn.Module): - def __init__(self, config: config_mllama.MllamaVisionConfig): + def __init__( + self, + config: config_mllama.MllamaVisionConfig, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() + self.image_size = config.image_size self.patch_size = config.patch_size self.max_num_tiles = config.max_num_tiles @@ -500,12 +519,19 @@ def __init__(self, config: config_mllama.MllamaVisionConfig): # encoders self.transformer = MllamaVisionEncoder( config, + quant_config, config.num_hidden_layers, is_gated=False, - output_hidden_states=config.intermediate_layers_indices) - self.global_transformer = MllamaVisionEncoder(config, - config.num_global_layers, - is_gated=True) + output_hidden_states=config.intermediate_layers_indices, + prefix=f"{prefix}.transformer", + ) + self.global_transformer = MllamaVisionEncoder( + config, + quant_config, + config.num_global_layers, + is_gated=True, + prefix=f"{prefix}.global_transformer", + ) def apply_class_embedding(self, hidden_state: torch.Tensor) -> torch.Tensor: @@ -648,6 +674,7 @@ def __init__( config: Optional[config_mllama.MllamaTextConfig] = None, layer_idx: Optional[int] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ): super().__init__() self.config = config @@ -673,6 +700,7 @@ def __init__( self.num_key_value_heads, bias=False, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.o_proj = RowParallelLinear( self.num_heads * self.head_dim, @@ -680,6 +708,7 @@ def __init__( bias=False, input_is_parallel=True, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) # vllm.model_executor.layers.layernorm.RMSNorm has precision issue, # use huggingface's instead @@ -692,6 +721,7 @@ def __init__( self.head_dim, self.scaling, self.num_local_key_value_heads, + prefix=f"{prefix}.attn", ) def forward( @@ -791,15 +821,21 @@ class MllamaCrossAttentionDecoderLayer(torch.nn.Module): """Cross-attention transformer block with tanh-gated attention and feedforward.""" - def __init__(self, config: config_mllama.MllamaTextConfig, layer_idx: int, - quant_config: Optional[QuantizationConfig]) \ - -> None: + def __init__( + self, + 
config: config_mllama.MllamaTextConfig, + layer_idx: int, + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() + self.layer_idx = layer_idx self.cross_attn = MllamaTextCrossAttention( config=config, layer_idx=layer_idx, quant_config=quant_config, + prefix=f"{prefix}.cross_attn", ) self.input_layernorm = RMSNorm(config.hidden_size, @@ -811,6 +847,7 @@ def __init__(self, config: config_mllama.MllamaTextConfig, layer_idx: int, intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -854,10 +891,15 @@ class MllamaTextModel(nn.Module): config_class = config_mllama.MllamaTextConfig base_model_prefix = "model" - def __init__(self, config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig]): + def __init__( + self, + config: config_mllama.MllamaTextConfig, + cache_config: Optional[CacheConfig], + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size + 8, @@ -869,13 +911,20 @@ def __init__(self, config: config_mllama.MllamaTextConfig, if layer_idx in self.cross_attention_layers: layers.append( MllamaCrossAttentionDecoderLayer( - config, layer_idx, quant_config=quant_config)) + config, + layer_idx, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + )) else: # TODO: force LlamaDecoderLayer to config.attention_bias=False layers.append( - LlamaDecoderLayer(config, - cache_config=cache_config, - quant_config=quant_config)) + LlamaDecoderLayer( + config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + )) self.layers = nn.ModuleList(layers) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) @@ -932,12 +981,19 @@ class MllamaForCausalLM(nn.Module): "MllamaCrossAttentionDecoderLayer", "MllamaSelfAttentionDecoderLayer" ] - def __init__(self, config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig]): + def __init__( + self, + config: config_mllama.MllamaTextConfig, + cache_config: Optional[CacheConfig], + quant_config: Optional[QuantizationConfig], + prefix: str = "", + ) -> None: super().__init__() self.vocab_size = config.vocab_size - self.model = MllamaTextModel(config, cache_config, quant_config) + self.model = MllamaTextModel(config, + cache_config, + quant_config, + prefix=f"{prefix}.model") self.lm_head = ParallelLMHead( config.vocab_size, config.hidden_size, @@ -994,11 +1050,13 @@ def __init__(self, config.pad_token_id if config.pad_token_id is not None else -1 self.image_size = config.vision_config.image_size - self.vision_model = MllamaVisionModel(config.vision_config) + self.vision_model = MllamaVisionModel(config.vision_config, + quant_config) self.language_model = MllamaForCausalLM( config.text_config, cache_config=cache_config, quant_config=quant_config, + prefix="language_model", ) self.multi_modal_projector = nn.Linear( config.vision_config.vision_output_dim, diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index a52e3cb6039be..3e3c3b05879fb 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -4,10 +4,13 @@ # 
Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- +from typing import Optional + import torch.nn as nn from transformers import PretrainedConfig from vllm.inputs import INPUT_REGISTRY +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY from .intern_vit import InternVisionModel @@ -56,9 +59,11 @@ def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: ) def _init_vision_model(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], num_hidden_layers: int): # We added additional dummy heads to the original num of heads to make # the number of heads divisible by 8. return InternVisionModel(config.vision_config, + quant_config=quant_config, num_hidden_layers_override=num_hidden_layers, num_dummy_heads=7) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 7806cd6ab4608..7a62a098a4525 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -142,7 +142,8 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config - self.vision_tower = SiglipVisionModel(config.vision_config) + self.vision_tower = SiglipVisionModel(config.vision_config, + quant_config) self.multi_modal_projector = PaliGemmaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, projection_dim=config.vision_config.projection_dim) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 9a1083520efd2..855a9b17585a4 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -70,7 +70,8 @@ projection_dim=768) -def _init_img_processor(hf_config: PretrainedConfig): +def _init_img_processor(hf_config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]): clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG layer_idx = hf_config.img_processor.get('layer_idx', -2) @@ -82,7 +83,10 @@ def _init_img_processor(hf_config: PretrainedConfig): num_hidden_layers = layer_idx + 1 img_processor = CLIPVisionModel( - clip_config, num_hidden_layers_override=num_hidden_layers) + clip_config, + quant_config, + num_hidden_layers_override=num_hidden_layers, + ) return img_processor @@ -148,14 +152,15 @@ def get_img_features(self, class Phi3HDImageEmbedding(Phi3ImageEmbeddingBase): """Phi3 Image embedding with HD transform.""" - def __init__(self, config: PretrainedConfig) -> None: + def __init__(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]) -> None: super().__init__() # n_embed or hidden_size hidden_size = config.n_embd if hasattr( config, 'n_embd') else config.hidden_size - self.img_processor = _init_img_processor(config) + self.img_processor = _init_img_processor(config, quant_config) image_dim_out = config.img_processor['image_dim_out'] self.num_img_tokens = config.img_processor['num_img_tokens'] @@ -535,7 +540,7 @@ def __init__(self, ) # TODO: Optionally initializes this for supporting input embeddings. 
- self.vision_embed_tokens = Phi3HDImageEmbedding(config) + self.vision_embed_tokens = Phi3HDImageEmbedding(config, quant_config) self.language_model = LlamaForCausalLM(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index f33871c0d5acc..18dbee94e10b0 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -767,9 +767,17 @@ def input_processor_for_pixtral_hf( class PixtralHFMLP(nn.Module): - def __init__(self, config: PixtralVisionConfig): + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: super().__init__() + assert config.intermediate_size is not None + # TODO: Use quant_config and prefix after optimizing this self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) @@ -787,8 +795,15 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class PixtralHFAttention(nn.Module): - def __init__(self, config: PixtralVisionConfig): + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: super().__init__() + self.config = config assert not config.hidden_size % config.num_attention_heads self.n_heads = config.num_attention_heads @@ -796,6 +811,7 @@ def __init__(self, config: PixtralVisionConfig): self.scale = self.head_dim**-0.5 + # TODO: Use quant_config and prefix after optimizing this self.q_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=False) @@ -840,11 +856,22 @@ def forward( class PixtralHFTransformerBlock(nn.Module): - def __init__(self, config: PixtralVisionConfig): + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + prefix: str = "", + ) -> None: super().__init__() + self.attention_norm = RMSNorm(config.hidden_size, eps=1e-5) - self.attention = PixtralHFAttention(config) - self.feed_forward = PixtralHFMLP(config) + self.attention = PixtralHFAttention(config, + quant_config=quant_config, + prefix=f"{prefix}.attention") + self.feed_forward = PixtralHFMLP(config, + quant_config=quant_config, + prefix=f"{prefix}.feed_forward") self.ffn_norm = RMSNorm(config.hidden_size, eps=1e-5) def forward( @@ -864,11 +891,27 @@ def forward( class PixtralHFTransformer(nn.Module): - def __init__(self, config: PixtralVisionConfig): + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + prefix: str = "", + ) -> None: super().__init__() - self.layers = torch.nn.ModuleList() - for _ in range(config.num_hidden_layers): - self.layers.append(PixtralHFTransformerBlock(config)) + + if num_hidden_layers_override is None: + num_hidden_layers = config.num_hidden_layers + else: + num_hidden_layers = num_hidden_layers_override + + self.layers = nn.ModuleList([ + PixtralHFTransformerBlock(config=config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) + ]) def forward( self, @@ -883,7 +926,15 @@ def forward( class PixtralHFVisionModel(nn.Module): - def __init__(self, config: PixtralVisionConfig): + def __init__( + self, + config: PixtralVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + *, + num_hidden_layers_override: Optional[int] = None, + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: 
super().__init__() self.config = config @@ -895,7 +946,24 @@ def __init__(self, config: PixtralVisionConfig): bias=False, ) self.ln_pre = RMSNorm(config.hidden_size, eps=1e-5) - self.transformer = PixtralHFTransformer(config) + self.transformer = PixtralHFTransformer( + config, + quant_config, + num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.transformer", + ) + + num_hidden_layers = config.num_hidden_layers + if len(self.transformer.layers) > config.num_hidden_layers: + raise ValueError( + f"The original encoder only has {num_hidden_layers} " + f"layers, but you requested {len(self.transformer.layers)} " + "layers.") + + if require_post_norm is True: + msg = "PixtralHFVisionModel does not have post-layernorm" + raise ValueError(msg) + self.dtype = next(self.parameters()).dtype self.device = next(self.parameters()).device self.patch_positional_embedding = PixtralRotaryEmbedding( diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index e717ab108c77b..91277b0ccd145 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -248,8 +248,10 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads @@ -266,12 +268,14 @@ def __init__( head_size=self.head_dim, total_num_heads=self.num_heads, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.out_proj = RowParallelLinear( input_size=self.embed_dim, output_size=self.embed_dim, quant_config=quant_config, + prefix=f"{prefix}.out_proj", ) self.tp_size = get_tensor_model_parallel_world_size() @@ -314,8 +318,10 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.config = config self.activation_fn = get_act_fn(config.hidden_act) @@ -326,11 +332,13 @@ def __init__( config.hidden_size, config.intermediate_size, quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc1", ) self.fc2 = RowParallelLinear( config.intermediate_size, config.hidden_size, quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc2", ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -346,15 +354,20 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.embed_dim = config.hidden_size num_heads = config.num_attention_heads tp_size = get_tensor_model_parallel_world_size() if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.self_attn = SiglipParallelAttention(config, - quant_config=quant_config) + self.self_attn = SiglipParallelAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) else: self.self_attn = SiglipSdpaAttention(config) @@ -363,6 +376,7 @@ def __init__( self.mlp = SiglipMLP( config, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) @@ -392,8 +406,10 @@ def __init__( config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, num_hidden_layers_override: Optional[int] = None, - ): + prefix: str = "", + ) -> None: super().__init__() + self.config = config if num_hidden_layers_override is None: @@ -402,8 +418,10 @@ def __init__( num_hidden_layers = 
num_hidden_layers_override self.layers = nn.ModuleList([ - SiglipEncoderLayer(config, quant_config=quant_config) - for _ in range(num_hidden_layers) + SiglipEncoderLayer(config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}") + for layer_idx in range(num_hidden_layers) ]) def forward( @@ -424,7 +442,8 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, - ): + prefix: str = "", + ) -> None: super().__init__() self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) @@ -433,7 +452,9 @@ def __init__( config.hidden_size, config.num_attention_heads, batch_first=True) self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = SiglipMLP(config=config, quant_config=quant_config) + self.mlp = SiglipMLP(config=config, + quant_config=quant_config, + prefix=f"{prefix}.mlp") def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: batch_size = hidden_state.shape[0] @@ -454,9 +475,13 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, + *, num_hidden_layers_override: Optional[int] = None, - ): + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() + self.config = config embed_dim = config.hidden_size @@ -465,26 +490,34 @@ def __init__( config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, + prefix=f"{prefix}.encoder", ) + num_hidden_layers = config.num_hidden_layers if len(self.encoder.layers) > config.num_hidden_layers: raise ValueError( - f"The original encoder only has {config.num_hidden_layers} " + f"The original encoder only has {num_hidden_layers} " f"layers, but you requested {len(self.encoder.layers)} layers." ) - elif len(self.encoder.layers) == config.num_hidden_layers: + + # If possible, skip post_layernorm to conserve memory + if require_post_norm is None: + require_post_norm = len(self.encoder.layers) == num_hidden_layers + + if require_post_norm: self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) else: - # post_layernorm is unused when we extract intermediate features - # In this case, we can skip it to conserve memory self.post_layernorm = None self.use_head = (True if not hasattr(config, "vision_use_head") else config.vision_use_head) if self.use_head: self.head = SiglipMultiheadAttentionPoolingHead( - config=config, quant_config=quant_config) + config=config, + quant_config=quant_config, + prefix=f"{prefix}.head", + ) def forward( self, @@ -517,8 +550,11 @@ def __init__( self, config: SiglipVisionConfig, quant_config: Optional[QuantizationConfig] = None, + *, num_hidden_layers_override: Optional[int] = None, - ): + require_post_norm: Optional[bool] = None, + prefix: str = "", + ) -> None: super().__init__() num_heads = config.num_attention_heads @@ -529,6 +565,8 @@ def __init__( config, quant_config, num_hidden_layers_override=num_hidden_layers_override, + require_post_norm=require_post_norm, + prefix=f"{prefix}.vision_model", ) def get_input_embeddings(self) -> nn.Module: From 31a08f5bd231c2ac547e9bb6b6490282d2e76f83 Mon Sep 17 00:00:00 2001 From: Alex Brooks Date: Wed, 23 Oct 2024 08:05:18 -0600 Subject: [PATCH 14/15] [Model] Add min_pixels / max_pixels to Qwen2VL as mm_processor_kwargs (#9612) Signed-off-by: Alex-Brooks --- examples/offline_inference_vision_language.py | 5 + .../vision_language/test_qwen2_vl.py | 160 ++++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 89 ++++++++-- 3 files changed, 236 
insertions(+), 18 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_qwen2_vl.py diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 610cc31db9c4e..83d2548a506e4 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -267,6 +267,11 @@ def run_qwen2_vl(question: str, modality: str): model=model_name, max_model_len=8192, max_num_seqs=5, + # Note - mm_processor_kwargs can also be passed to generate/chat calls + mm_processor_kwargs={ + "min_pixels": 28 * 28, + "max_pixels": 1280 * 28 * 28, + }, ) prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py new file mode 100644 index 0000000000000..d3de5fb26d4b8 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -0,0 +1,160 @@ +from typing import Any, Dict, Tuple + +import pytest +import torch +from PIL.Image import Image +from transformers import AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalRegistry + +from ....conftest import _ImageAssets +from ...utils import build_model_context + +MODEL = "Qwen/Qwen2-VL-2B-Instruct" +MIN_PIXELS = "min_pixels" +MAX_PIXELS = "max_pixels" + + +# Fixtures lazy import to avoid initializing CUDA during test collection +# NOTE: Qwen2vl supports multiple input modalities, so it registers multiple +# input mappers. +@pytest.fixture() +def image_input_mapper_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import ( + image_input_mapper_for_qwen2_vl) + return image_input_mapper_for_qwen2_vl + + +@pytest.fixture() +def input_processor_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import ( + input_processor_for_qwen2_vl) + return input_processor_for_qwen2_vl + + +@pytest.fixture() +def qwen2_vl_context() -> InputContext: + return build_model_context(model_name=MODEL) + + +@pytest.fixture() +def get_max_qwen2_vl_image_tokens(): + from vllm.model_executor.models.qwen2_vl import ( + get_max_qwen2_vl_image_tokens) + return get_max_qwen2_vl_image_tokens + + +@pytest.fixture() +def dummy_data_for_qwen2_vl(): + from vllm.model_executor.models.qwen2_vl import dummy_data_for_qwen2_vl + return dummy_data_for_qwen2_vl + + +@pytest.mark.parametrize("mm_processor_kwargs,expected_max_tokens", [ + ({}, 1225), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 324), +]) +def test_qwen2_vl_max_image_tokens(get_max_qwen2_vl_image_tokens, + qwen2_vl_context: InputContext, + mm_processor_kwargs: Dict[str, Any], + expected_max_tokens: int): + """Ensure that the max token calc handles min/max pixels properly.""" + actual_max_tokens = get_max_qwen2_vl_image_tokens(qwen2_vl_context, + **mm_processor_kwargs) + assert actual_max_tokens == expected_max_tokens + + +@pytest.mark.parametrize("mm_processor_kwargs,token_count,img_size", [ + [{}, 1225, (980, 980)], + [{ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 324, (504, 504)], +]) +def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl, + qwen2_vl_context: InputContext, + mm_processor_kwargs: Dict[str, Any], + token_count: int, img_size: Tuple[int, int]): + """Ensure that the dummy data handles min/max pixels properly.""" + seq_len = 3000 + hf_config = qwen2_vl_context.get_hf_config() + image_token_id = hf_config.image_token_id + + # NOTE: video value is required, but isn't actually used + # 
when making the dummy data except for error handling currently + seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len, { + "image": 1, + "video": 0 + }, **mm_processor_kwargs) + + # Ensure we have the right number of placeholders for min/max pixel values + assert seq_data.get_token_ids().count(image_token_id) == token_count + + # Ensure the images were resized correctly + image = mm_data["image"] + assert isinstance(image, Image) + assert image.size == img_size + + +@pytest.mark.parametrize("mm_processor_kwargs,num_placeholders", [ + ({}, 1426), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, 330), +]) +def test_input_processor(input_processor_for_qwen2_vl, + qwen2_vl_context: InputContext, + image_assets: _ImageAssets, num_placeholders: int, + mm_processor_kwargs: Dict[str, Any]): + """Ensure that the image processor handles min/max pixels properly.""" + tokenizer = AutoTokenizer.from_pretrained(MODEL) + prompt = "<|vision_start|><|image_pad|><|vision_end|>" + + image = image_assets[0].pil_image + hf_config = qwen2_vl_context.get_hf_config() + image_token_id = hf_config.image_token_id + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": [image]}) + + processed_inputs = input_processor_for_qwen2_vl(qwen2_vl_context, inputs, + **mm_processor_kwargs) + assert processed_inputs["prompt_token_ids"].count( + image_token_id) == num_placeholders + assert len(processed_inputs["multi_modal_data"]["image"]) == 1 + + +@pytest.mark.parametrize("mm_processor_kwargs,pixels_shape", [ + ({}, [5704, 1176]), + ({ + MIN_PIXELS: 64**2, + MAX_PIXELS: 512**2 + }, [1320, 1176]), +]) +def test_image_mapper_override(qwen2_vl_context: InputContext, + image_assets: _ImageAssets, + mm_processor_kwargs: Dict[str, Any], + pixels_shape: Tuple[int, int]): + """Ensure that the image mapper handles min/max pixels properly.""" + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(qwen2_vl_context.model_config) + + image = image_assets[0].pil_image + + mapped_output = mm_registry.map_input( + qwen2_vl_context.model_config, + {"image": image}, + mm_processor_kwargs=mm_processor_kwargs, + ) + + # Dimension 0 of pixel values should match the product of image_grid_thw + actual_pixels_shape = mapped_output["pixel_values"].shape + assert list(actual_pixels_shape) == pixels_shape + assert actual_pixels_shape[0] == torch.prod( + mapped_output["image_grid_thw"]) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 9cca6b65e3277..3dc955b12ba0e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -549,6 +549,9 @@ def mm_input_mapper_for_qwen2_vl( ctx: InputContext, data: MultiModalData[object], data_type_key: str, + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, ) -> MultiModalInputs: """Input mapper for Qwen2-VL.""" if data_type_key == "image" and isinstance(data, dict): @@ -557,8 +560,19 @@ def mm_input_mapper_for_qwen2_vl( "image_grid_thw": data.get("image_grid_thw"), }) model_config = ctx.model_config + # Handle mm processor kwargs; we pass these at creation time + # because preprocess() in transformers doesn't expose them + mm_processor_kwargs = {} + if min_pixels: + mm_processor_kwargs["min_pixels"] = min_pixels + if max_pixels: + mm_processor_kwargs["max_pixels"] = max_pixels + image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) + 
model_config.model, + trust_remote_code=model_config.trust_remote_code, + **mm_processor_kwargs, + ) if image_processor is None: raise RuntimeError("No HuggingFace processor is available " "to process the image object") @@ -631,25 +645,36 @@ def _get_max_image_info( image_processor, data_type_key: str = "image", mm_count: int = 1, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, ): + # Limit min / max pixels unless they're explicitly provided + if min_pixels is None: + min_pixels = max(image_processor.min_pixels, 28 * 28) + if max_pixels is None: + max_pixels = min(image_processor.max_pixels, 1280 * 28 * 28) + return _get_vision_info( image_processor, height=9999999, width=9999999, - - # Limit min / max pixels. - min_pixels=max(image_processor.min_pixels, 28 * 28), - max_pixels=min(image_processor.max_pixels, 1280 * 28 * 28), + min_pixels=min_pixels, + max_pixels=max_pixels, data_type_key=data_type_key, mm_count=mm_count, ) -def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int: +def get_max_qwen2_vl_mm_tokens(ctx: InputContext, + data_type_key: str, + *, + min_pixels=None, + max_pixels=None) -> int: image_processor = cached_get_image_processor(ctx.model_config.model) max_resized_height, max_resized_width, max_llm_image_tokens = \ _get_max_image_info(image_processor, data_type_key=data_type_key, - mm_count=1) + mm_count=1, min_pixels=min_pixels, + max_pixels=max_pixels) return max_llm_image_tokens @@ -660,14 +685,20 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext, data_type_key: str) -> int: def dummy_data_for_qwen2_vl( - ctx: InputContext, seq_len: int, mm_counts: Mapping[str, int] + ctx: InputContext, + seq_len: int, + mm_counts: Mapping[str, int], + *, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None ) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: image_processor = cached_get_image_processor(ctx.model_config.model) num_images = mm_counts["image"] max_resized_height, max_resized_width, max_llm_image_tokens = \ _get_max_image_info(image_processor, data_type_key="image", - mm_count=num_images) + mm_count=num_images, min_pixels=min_pixels, + max_pixels=max_pixels) if seq_len - max_llm_image_tokens - 2 < 0: raise RuntimeError( f"Qwen2-VL cannot process {num_images} images in a prompt, " @@ -678,10 +709,11 @@ def dummy_data_for_qwen2_vl( num_videos = mm_counts["video"] max_resized_height, max_resized_width, max_llm_video_tokens = \ _get_max_image_info(image_processor, data_type_key="video", - mm_count=num_videos) + mm_count=num_videos, min_pixels=min_pixels, + max_pixels=max_pixels) if seq_len - max_llm_video_tokens - 2 < 0: raise RuntimeError( - f"Qwen2-VL cannot process {num_images} videos in a prompt, " + f"Qwen2-VL cannot process {num_videos} videos in a prompt, " "please increase max_model_len or reduce video limit by " "--limit-mm-per-prompt.") @@ -706,6 +738,8 @@ def _get_llm_num_vision_tokens( mm_inputs: list, data_type_key: str, image_processor, + min_pixels: int, + max_pixels: int, ): """Get number of vision tokens of multimodal inputs. 
@@ -715,12 +749,13 @@ def _get_llm_num_vision_tokens(
     image = to_numpy_array(mm_inputs[0])
     input_data_format = infer_channel_dimension_format(image)
     height, width = get_image_size(image, channel_dim=input_data_format)
+
     _, _, llm_num_vision_tokens = _get_vision_info(
         image_processor,
         height=height,
         width=width,
-        min_pixels=image_processor.min_pixels,
-        max_pixels=image_processor.max_pixels,
+        min_pixels=min_pixels,
+        max_pixels=max_pixels,
         do_resize=image_processor.do_resize,
         data_type_key=data_type_key,
         mm_count=len(mm_inputs),
@@ -730,7 +765,8 @@ def _get_llm_num_vision_tokens(
 
 def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable,
                        data_type_key: str, image_processor: Any,
-                       prompt_token_ids: List[int]) -> List[int]:
+                       prompt_token_ids: List[int], min_pixels: Optional[int],
+                       max_pixels: Optional[int]) -> List[int]:
     """
     Expand pad tokens for multi-modal inputs (e.g., images or videos).
 
@@ -741,6 +777,8 @@ def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable,
         data_type_key (str): The type of the multi-modal input.
         image_processor (Any): The image processor used to process the inputs.
         prompt_token_ids (List[int]): The list of token IDs in the prompt.
+        min_pixels (int): min pixels to be used for img processing
+        max_pixels (int): max pixels to be used for img processing
 
     Returns:
         List[int]: The list of token IDs for the multi-modal inputs.
@@ -757,6 +795,8 @@ def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable,
             [data] if data_type_key == "image" else data,
             data_type_key=data_type_key,
             image_processor=image_processor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
         )
         if cnt == 0:
             end_idx = indices[cnt]
@@ -773,6 +813,9 @@ def _expand_pad_tokens(inputs: list, token_id: int, make_batched_fn: Callable,
 def input_processor_for_qwen2_vl(
     ctx: InputContext,
     inputs: DecoderOnlyInputs,
+    *,
+    min_pixels: Optional[int] = None,
+    max_pixels: Optional[int] = None,
 ) -> DecoderOnlyInputs:
     multi_modal_data = inputs.get("multi_modal_data", None)
     if multi_modal_data is None:
@@ -783,6 +826,10 @@ def input_processor_for_qwen2_vl(
     processor = cached_get_processor(ctx.model_config.model)
     image_processor = processor.image_processor
 
+    # Apply processor kwarg overrides for image processor options
+    min_pixels = min_pixels if min_pixels else image_processor.min_pixels
+    max_pixels = max_pixels if max_pixels else image_processor.max_pixels
+
    hf_config = ctx.get_hf_config(Qwen2VLConfig)
 
     # To avoid redundant processing of vision objects (resize, rescale, etc.),
@@ -830,16 +877,22 @@ def input_processor_for_qwen2_vl(
         else:
             prompt_token_ids = _expand_pad_tokens(image_inputs,
                                                   hf_config.image_token_id,
-                                                  make_batched_images, "image",
+                                                  make_batched_images,
+                                                  "image",
                                                   image_processor,
-                                                  prompt_token_ids)
+                                                  prompt_token_ids,
+                                                  min_pixels=min_pixels,
+                                                  max_pixels=max_pixels)
 
     if video_inputs is not None:
         prompt_token_ids = _expand_pad_tokens(video_inputs,
                                               hf_config.video_token_id,
-                                              make_batched_videos, "video",
+                                              make_batched_videos,
+                                              "video",
                                               image_processor,
-                                              prompt_token_ids)
+                                              prompt_token_ids,
+                                              min_pixels=min_pixels,
+                                              max_pixels=max_pixels)
 
     return token_inputs(
         prompt_token_ids=prompt_token_ids,

From e7116c017c86cb547f4d1888edaf13a9be2a4562 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 23 Oct 2024 22:09:04 +0800
Subject: [PATCH 15/15] [Bugfix] Fix `_init_vision_model` in NVLM_D model
 (#9611)

Co-authored-by: Isotr0py <2037008807@qq.com>
---
 vllm/model_executor/models/nvlm_d.py | 37 +++++++++++++++++++++-------
 1 file changed, 28 
insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 3e3c3b05879fb..df4fd0a3256e9 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -58,12 +58,31 @@ def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: nn.Linear(llm_intermediate_size, llm_hidden_size, bias=False), ) - def _init_vision_model(self, config: PretrainedConfig, - quant_config: Optional[QuantizationConfig], - num_hidden_layers: int): - # We added additional dummy heads to the original num of heads to make - # the number of heads divisible by 8. - return InternVisionModel(config.vision_config, - quant_config=quant_config, - num_hidden_layers_override=num_hidden_layers, - num_dummy_heads=7) + def _init_vision_model( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + *, + is_mono: bool, + prefix: str, + ): + if not is_mono: + vision_feature_layer = config.select_layer + if vision_feature_layer < 0: + num_hidden_layers = config.vision_config.num_hidden_layers \ + + vision_feature_layer + 1 + else: + num_hidden_layers = vision_feature_layer + 1 + + # We added additional dummy heads to the original num of heads to + # make the number of heads divisible by 8. + return InternVisionModel( + config.vision_config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers, + num_dummy_heads=7, + prefix=prefix, + ) + else: + msg = "Monolith mode is not applicable to NVLM_D" + raise NotImplementedError(msg)
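
A note on the `prefix` plumbing threaded through the vision-model patch above: quantization configs decide per-module whether to apply a quantized linear method by inspecting the module's dotted path, so `modules_to_not_convert` can only take effect once every submodule is constructed with its full path. The sketch below is illustrative only; `get_quant_method` and its return values are simplified assumptions for demonstration, not the real vLLM API:

```python
from typing import List, Optional

# Simplified sketch (hypothetical API): an AWQ-style config that leaves
# modules unquantized when their dotted path matches an excluded prefix.
def get_quant_method(prefix: str,
                     modules_to_not_convert: List[str]) -> Optional[str]:
    if any(prefix.startswith(module) for module in modules_to_not_convert):
        return None  # keep this module in full precision
    return "awq_linear"  # apply the quantized linear method

# With the patch, a CLIP submodule is built with its full path:
print(get_quant_method("vision_model.encoder.layers.0.mlp.fc1",
                       ["vision_model"]))  # -> None (skipped)
print(get_quant_method("model.layers.0.mlp.gate_up_proj",
                       ["vision_model"]))  # -> "awq_linear" (quantized)
```

This is also why `_patch_quant_config` in `internvl.py` appends `"vision_model"` to `modules_to_not_convert` for OpenGVLab's AWQ checkpoints: only the language model was quantized, so the vision tower must be excluded by prefix.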
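
The layer-count arithmetic shared by `init_vision_tower_for_llava` and the reworked `_init_vision_model` in NVLM_D is easy to misread, so here is a small worked example mirroring the formula in the diffs above (the concrete numbers, 24 CLIP layers and feature layer -2, are the usual LLaVA defaults, used purely for illustration):

```python
# Mirrors the vision_feature_layer -> num_hidden_layers_override formula.
def num_layers_to_build(total_layers: int, vision_feature_layer: int) -> int:
    if vision_feature_layer < 0:
        # A negative index counts back from the end of the encoder.
        return total_layers + vision_feature_layer + 1
    return vision_feature_layer + 1

# CLIP ViT-L/14 has 24 encoder layers; with feature layer -2, only the
# first 23 layers are instantiated:
assert num_layers_to_build(24, -2) == 23

# require_post_norm defaults to "was the full stack built?". With 23 of 24
# layers, post_layernorm would never run, so it is skipped to save memory:
require_post_norm = num_layers_to_build(24, -2) == 24
assert require_post_norm is False
```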
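
Finally, a rough sketch of how `min_pixels` / `max_pixels` bound the Qwen2-VL image-token count. The real resizing happens inside the HF image processor; the function below only approximates that behavior, assuming each output token covers a 28x28 pixel region (14-pixel patches with 2x2 spatial merging), but it reproduces the expected values in `test_qwen2_vl_max_image_tokens` above:

```python
import math

# Approximate token count after clamping an image to the pixel budget.
# This imitates (but is not) the HF processor's resize logic.
def approx_image_tokens(height: int, width: int,
                        min_pixels: int, max_pixels: int) -> int:
    if height * width > max_pixels:
        scale = math.sqrt(height * width / max_pixels)
        height = math.floor(height / scale / 28) * 28
        width = math.floor(width / scale / 28) * 28
    elif height * width < min_pixels:
        scale = math.sqrt(min_pixels / (height * width))
        height = math.ceil(height * scale / 28) * 28
        width = math.ceil(width * scale / 28) * 28
    return (height // 28) * (width // 28)

# The default budget caps a huge image at 1225 tokens; tightening it to
# 64**2..512**2 brings that down to 324, matching the new tests:
assert approx_image_tokens(9999999, 9999999, 28 * 28, 1280 * 28 * 28) == 1225
assert approx_image_tokens(9999999, 9999999, 64**2, 512**2) == 324
```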