From 176fcb1c71655d825d2363e5f1468fa248fe783b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Wed, 13 Nov 2024 00:36:51 +0800 Subject: [PATCH 01/20] [Bugfix] Fix QwenModel argument (#10262) Signed-off-by: Jie Fu --- vllm/model_executor/models/qwen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index cc70099361dd2..5acd87146c54e 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1068,7 +1068,7 @@ def __new__( config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): - return QWenVL(vllm_config) + return QWenVL(vllm_config=vllm_config) # Initialize LLM else: - return QWenLLM(vllm_config) + return QWenLLM(vllm_config=vllm_config) From 47db6ec8310129699a62567b61d8ed380636b053 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Tue, 12 Nov 2024 08:42:28 -0800 Subject: [PATCH 02/20] [Frontend] Add per-request number of cached token stats (#10174) --- tests/prefix_caching/test_prefix_caching.py | 24 ++++++++++++-- vllm/entrypoints/openai/api_server.py | 1 + vllm/entrypoints/openai/cli_args.py | 5 +++ vllm/entrypoints/openai/protocol.py | 5 +++ vllm/entrypoints/openai/run_batch.py | 6 ++++ vllm/entrypoints/openai/serving_chat.py | 35 +++++++++++++-------- vllm/outputs.py | 19 +++++++---- vllm/sequence.py | 14 +++++++-- vllm/worker/model_runner.py | 3 ++ 9 files changed, 89 insertions(+), 23 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index fd6564bbfe630..50723dbb610ac 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -27,6 +27,7 @@ @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("cached_position", [0, 1]) +@pytest.mark.parametrize("block_size", [16]) def test_mixed_requests( hf_runner, vllm_runner, @@ -36,11 +37,12 @@ def test_mixed_requests( dtype: str, max_tokens: int, cached_position: int, + block_size: int, monkeypatch, ) -> None: """ Test the case when some sequences have the prefix cache hit - and the others don't. The cached position determines where + and the others don't. The cached position determines where the sequence is at among the batch of prefills. 
""" override_backend_env_variable(monkeypatch, backend) @@ -53,12 +55,30 @@ def test_mixed_requests( model, dtype=dtype, enable_prefix_caching=True, + block_size=block_size, ) as vllm_model: # Run the first prompt so the cache is populated vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) # Run all the promopts - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) + req_outputs = vllm_model.model.generate(example_prompts, greedy_params) + + # Verify number of cached tokens + for i in range(len(req_outputs)): + if i == cached_position: + expected_num_cached_tokens = ( + len(req_outputs[i].prompt_token_ids) // + block_size) * block_size + else: + expected_num_cached_tokens = 0 + assert req_outputs[ + i].num_cached_tokens == expected_num_cached_tokens + + vllm_outputs = [ + (output.prompt_token_ids + list(output.outputs[0].token_ids), + output.prompt + output.outputs[0].text) for output in req_outputs + ] check_outputs_equal( outputs_0_lst=hf_outputs, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3e4070a25cf90..6a24cdbc6a18f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -540,6 +540,7 @@ def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.task == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 74ea41344bece..eb08a89293370 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -228,6 +228,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint" ) + parser.add_argument( + "--enable-prompt-tokens-details", + action='store_true', + default=False, + help="If set to True, enable prompt_tokens_details in usage.") return parser diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0e0bb66c057df..820aefd8800d9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -99,10 +99,15 @@ class ModelList(OpenAIBaseModel): data: List[ModelCard] = Field(default_factory=list) +class PromptTokenUsageInfo(OpenAIBaseModel): + cached_tokens: Optional[int] = None + + class UsageInfo(OpenAIBaseModel): prompt_tokens: int = 0 total_tokens: int = 0 completion_tokens: Optional[int] = 0 + prompt_tokens_details: Optional[PromptTokenUsageInfo] = None class RequestResponseMetadata(BaseModel): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 0d016d949d22b..1b422a93263b2 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -78,6 +78,11 @@ def parse_args(): help="Port number for the Prometheus metrics server " "(only needed if enable-metrics is set).", ) + parser.add_argument( + "--enable-prompt-tokens-details", + action='store_true', + default=False, + help="If set to True, enable prompt_tokens_details in usage.") return parser.parse_args() @@ -217,6 +222,7 @@ async def main(args): prompt_adapters=None, request_logger=request_logger, chat_template=None, + 
enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.task == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( engine, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9551b4f2091dd..74867d8de8843 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -18,8 +18,8 @@ ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, - DeltaToolCall, ErrorResponse, FunctionCall, RequestResponseMetadata, - ToolCall, UsageInfo) + DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, + RequestResponseMetadata, ToolCall, UsageInfo) from vllm.entrypoints.openai.serving_engine import (BaseModelPath, LoRAModulePath, OpenAIServing, @@ -49,7 +49,8 @@ def __init__(self, chat_template: Optional[str], return_tokens_as_token_ids: bool = False, enable_auto_tools: bool = False, - tool_parser: Optional[str] = None): + tool_parser: Optional[str] = None, + enable_prompt_tokens_details: bool = False): super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -80,6 +81,8 @@ def __init__(self, f"tool_parser:'{tool_parser}' which has not " "been registered") from e + self.enable_prompt_tokens_details = enable_prompt_tokens_details + async def create_chat_completion( self, request: ChatCompletionRequest, @@ -252,6 +255,7 @@ async def chat_completion_stream_generator( previous_num_tokens = [0] * num_choices finish_reason_sent = [False] * num_choices num_prompt_tokens = 0 + num_cached_tokens = None if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam): tool_choice_function_name = request.tool_choice.function.name @@ -305,6 +309,7 @@ async def chat_completion_stream_generator( # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). 
if first_iteration: + num_cached_tokens = res.num_cached_tokens # Send first response for each request.n (index) with # the role role = self.get_chat_request_role(request) @@ -530,11 +535,13 @@ async def chat_completion_stream_generator( # is sent, send the usage if include_usage: completion_tokens = sum(previous_num_tokens) - final_usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + completion_tokens, - ) + final_usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + + completion_tokens) + if self.enable_prompt_tokens_details and num_cached_tokens: + final_usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=num_cached_tokens) final_usage_chunk = ChatCompletionStreamResponse( id=request_id, @@ -702,11 +709,13 @@ async def chat_completion_full_generator( num_prompt_tokens += len(final_res.encoder_prompt_token_ids) num_generated_tokens = sum( len(output.token_ids) for output in final_res.outputs) - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, - total_tokens=num_prompt_tokens + num_generated_tokens, - ) + usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + + num_generated_tokens) + if self.enable_prompt_tokens_details and final_res.num_cached_tokens: + usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=final_res.num_cached_tokens) request_metadata.final_usage_info = usage diff --git a/vllm/outputs.py b/vllm/outputs.py index abfdb7d328126..badf50d0602d6 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -83,10 +83,11 @@ class RequestOutput: finished: Whether the whole request is finished. metrics: Metrics associated with the request. lora_request: The LoRA request that was used to generate the output. - encoder_prompt: The encoder prompt string of the request; - None if decoder-only - encoder_prompt_token_ids: The token IDs of the encoder prompt; - None if decoder-only + encoder_prompt: The encoder prompt string of the request. + None if decoder-only. + encoder_prompt_token_ids: The token IDs of the encoder prompt. + None if decoder-only. + num_cached_tokens: The number of tokens with prefix cache hit. 
""" def __init__( @@ -101,6 +102,7 @@ def __init__( lora_request: Optional[LoRARequest] = None, encoder_prompt: Optional[str] = None, encoder_prompt_token_ids: Optional[List[int]] = None, + num_cached_tokens: Optional[int] = None, ) -> None: self.request_id = request_id self.prompt = prompt @@ -112,6 +114,7 @@ def __init__( self.lora_request = lora_request self.encoder_prompt = encoder_prompt self.encoder_prompt_token_ids = encoder_prompt_token_ids + self.num_cached_tokens = num_cached_tokens @classmethod def new( @@ -192,6 +195,8 @@ def from_seq_group( outputs = [] include_prompt = True + # num_cached_tokens should be the same for all the sequences + num_cached_tokens = None for i, seq in enumerate(top_n_seqs): output_text = seq.get_output_text_to_return( text_buffer_length, delta) @@ -199,6 +204,7 @@ def from_seq_group( output_token_ids = seq.get_output_token_ids_to_return(delta) num_output_tokens = 1 if isinstance(output_token_ids, int) else len(output_token_ids) + num_cached_tokens = seq.data.get_num_cached_tokens() output_logprobs = seq.output_logprobs if include_logprobs else None @@ -272,7 +278,7 @@ def from_seq_group( init_args = (seq_group.request_id, prompt, prompt_token_ids, prompt_logprobs, outputs, finished, seq_group.metrics, seq_group.lora_request, encoder_prompt, - encoder_prompt_token_ids) + encoder_prompt_token_ids, num_cached_tokens) if use_cache: request_output = seq_group.cached_request_output @@ -293,7 +299,8 @@ def __repr__(self) -> str: f"outputs={self.outputs}, " f"finished={self.finished}, " f"metrics={self.metrics}, " - f"lora_request={self.lora_request})") + f"lora_request={self.lora_request}, " + f"num_cached_tokens={self.num_cached_tokens})") class EmbeddingRequestOutput: diff --git a/vllm/sequence.py b/vllm/sequence.py index 7d7ddc7ec4447..1370cb5c4f9d2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -167,6 +167,8 @@ class SequenceData(msgspec.Struct, ...] = msgspec.field(default_factory=tuple) # The number of tokens that are computed (that run against the model). _num_computed_tokens: int = 0 + # The number of tokens with prefix cache hit. + _num_cached_tokens: int = 0 _stage: SequenceStage = SequenceStage.PREFILL _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) @@ -323,6 +325,14 @@ def update_num_computed_tokens(self, num_new_computed_tokens: int): if self.get_num_uncomputed_tokens() == 0: self._stage = SequenceStage.DECODE + def get_num_cached_tokens(self) -> int: + """Return the number of tokens with prefix cache hit.""" + return self._num_cached_tokens + + def update_num_cached_tokens(self, num_cached_tokens: int): + """Update the number of tokens with prefix cache hit.""" + self._num_cached_tokens = num_cached_tokens + def reset_state_for_recompute(self) -> None: """Reset the number of computed tokens from this sequence. It is supposed to be called when a sequence needs to be started from @@ -379,7 +389,7 @@ def __repr__(self) -> str: class Sequence: """Stores the data, status, and block information of a sequence. - + The sequence is constructed from the :data:`DecoderOnlyInputs` (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder) instance passed in through the :code:`inputs` constructor argument. @@ -906,7 +916,7 @@ class SequenceGroupMetadata( multi_modal_data: Multi modal data. mm_processor_kwargs: Multimodal input processor / mapper overrides. encoder_seq_data: Optional sequence data for encoder prompt - (SequenceGroup.encoder_seq). Should be None + (SequenceGroup.encoder_seq). 
Should be None unless you are working with an encoder/decoder model. cross_block_table: Optional cross-attention block table associated diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e1446192ce3d6..2da02f21f8342 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -542,6 +542,9 @@ def _compute_for_prefix_cache_hit( # this may be larger than the sequence length if chunked # prefill is enabled. prefix_cache_len = len(computed_block_nums) * self.block_size + seq_group_metadata.seq_data[inter_data.seq_ids[ + seq_idx]].update_num_cached_tokens(prefix_cache_len) + # The number of so far computed prompt tokens in this sequence. context_len = inter_data.context_lens[seq_idx] # The total number of prompt tokens in this sequence. From 7c65527918cd16286961a2a779e15743ca41ab0e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 08:57:14 -0800 Subject: [PATCH 03/20] [V1] Use pickle for serializing EngineCoreRequest & Add multimodal inputs to EngineCoreRequest (#10245) Signed-off-by: Woosuk Kwon --- vllm/v1/engine/__init__.py | 9 +++++++-- vllm/v1/engine/core.py | 3 ++- vllm/v1/engine/core_client.py | 3 ++- vllm/v1/engine/processor.py | 5 ++++- vllm/v1/serial_utils.py | 10 ++++++++++ 5 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 vllm/v1/serial_utils.py diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 8bc16651faf97..edfb8bd7c2fc1 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,10 +1,11 @@ import enum from dataclasses import dataclass -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union import msgspec from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind, SamplingParams @@ -22,7 +23,8 @@ class DetokenizerRequest: include_stop_str_in_output: bool -class EngineCoreRequest(msgspec.Struct, omit_defaults=True): +@dataclass +class EngineCoreRequest: # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput, # but this object is currently not playing well with msgspec @@ -33,6 +35,9 @@ class EngineCoreRequest(msgspec.Struct, omit_defaults=True): # always be tokenized? prompt: Optional[str] prompt_token_ids: List[int] + mm_data: Optional[MultiModalDataDict] + mm_placeholders: Optional[MultiModalPlaceholderDict] + mm_processor_kwargs: Optional[Dict[str, Any]] sampling_params: SamplingParams eos_token_id: Optional[int] arrival_time: float diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f9d3473d0131c..808c3936b6c35 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -19,6 +19,7 @@ EngineCoreRequest, EngineCoreRequestType) from vllm.v1.executor.gpu_executor import GPUExecutor from vllm.v1.request import Request, RequestStatus +from vllm.v1.serial_utils import PickleEncoder from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -315,7 +316,7 @@ def process_input_socket(self, input_path: str): """Input socket IO thread.""" # Msgpack serialization decoding. 
- decoder_add_req = msgpack.Decoder(EngineCoreRequest) + decoder_add_req = PickleEncoder() decoder_abort_req = msgpack.Decoder(list[str]) with self.make_socket(input_path, zmq.constants.PULL) as socket: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index f9e4677fb8c59..09801e20e16ca 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -11,6 +11,7 @@ from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType) from vllm.v1.engine.core import EngineCore, EngineCoreProc +from vllm.v1.serial_utils import PickleEncoder logger = init_logger(__name__) @@ -115,7 +116,7 @@ def __init__( **kwargs, ): # Serialization setup. - self.encoder = msgspec.msgpack.Encoder() + self.encoder = PickleEncoder() self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) # ZMQ setup. diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index d92e622810389..5f13cbf2e4036 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -91,7 +91,10 @@ def process_inputs( # Make Request for EngineCore. engine_core_request = EngineCoreRequest( request_id, processed_inputs.get("prompt"), - processed_inputs.get("prompt_token_ids"), sampling_params, + processed_inputs.get("prompt_token_ids"), + processed_inputs.get("multi_modal_data"), + processed_inputs.get("multi_modal_placeholders"), + processed_inputs.get("mm_processor_kwargs"), sampling_params, eos_token_id, arrival_time, lora_request) return detokenizer_request, engine_core_request diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py new file mode 100644 index 0000000000000..b1cd5c11834f8 --- /dev/null +++ b/vllm/v1/serial_utils.py @@ -0,0 +1,10 @@ +import pickle + + +class PickleEncoder: + + def encode(self, obj): + return pickle.dumps(obj) + + def decode(self, data): + return pickle.loads(data) From b41fb9d3b10dcf187ac0501ca80ede96d387617f Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:53:57 -0800 Subject: [PATCH 04/20] [Encoder Decoder] Update Mllama to run with both FlashAttention and XFormers (#9982) Signed-off-by: Sourashis Roy --- tests/encoder_decoder/test_e2e_correctness.py | 9 +- .../vision_language/test_mllama.py | 100 +++++++++++------- tests/test_config.py | 2 + vllm/model_executor/models/mllama.py | 52 ++++++--- vllm/worker/enc_dec_model_runner.py | 34 ++---- 5 files changed, 117 insertions(+), 80 deletions(-) diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index f2d7e9fd78cf3..fa5d6a69a9bc8 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -7,7 +7,7 @@ import pytest from transformers import AutoModelForSeq2SeqLM -from vllm.attention.selector import (_Backend, +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs @@ -34,6 +34,13 @@ def vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs +@pytest.fixture(autouse=True) +def clear_cache(): + """Fixture to clear backend cache before each test.""" + _cached_get_attn_backend.cache_clear() # Clear the cache + yield # This allows the test to run + + @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) 
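For reference, the clear_cache fixture added above is needed because attention-backend selection is memoized, so a backend forced by one parametrized test would otherwise leak into the next one. A minimal sketch of that failure mode follows, assuming _cached_get_attn_backend is a functools.lru_cache-style wrapper (which its cache_clear() method implies); the names below are illustrative stand-ins, not vLLM APIs:

    from functools import lru_cache

    FORCED_BACKEND = "XFORMERS"            # stand-in for the global override set by the context manager

    @lru_cache(maxsize=None)               # assumption: the real selector is memoized like this
    def pick_backend(head_size: int) -> str:
        # the override is read inside the body, so it is not part of the cache key
        return FORCED_BACKEND

    assert pick_backend(64) == "XFORMERS"
    FORCED_BACKEND = "FLASH_ATTN"          # a later test forces a different backend
    assert pick_backend(64) == "XFORMERS"  # stale: the memoized result wins
    pick_backend.cache_clear()             # what the clear_cache fixture does
    assert pick_backend(64) == "FLASH_ATTN"
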
diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 7f82347841cdb..a3b1c0950d9a2 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -4,6 +4,8 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, + global_force_attn_backend_context_manager) from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs @@ -14,6 +16,8 @@ _LIMIT_IMAGE_PER_PROMPT = 3 +LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] + HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": "<|image|><|begin_of_text|>The meaning of the image is", @@ -221,6 +225,13 @@ def process(hf_inputs: BatchEncoding, **kwargs): ) +@pytest.fixture(autouse=True) +def clear_cache(): + """Fixture to clear backend cache before each test.""" + _cached_get_attn_backend.cache_clear() # Clear the cache + yield # This allows the test to run + + @large_gpu_test(min_gb=48) @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( @@ -244,20 +255,26 @@ def process(hf_inputs: BatchEncoding, **kwargs): @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, model, sizes, dtype, max_tokens, - num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - sizes=sizes, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + num_logprobs, + attn_backend: _Backend) -> None: + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + sizes=sizes, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) @large_gpu_test(min_gb=48) @@ -265,9 +282,10 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: + model, dtype, max_tokens, num_logprobs, + attn_backend: _Backend) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image @@ -291,17 +309,20 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, cherry_blossom.resize((512, 1024)), ], ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + _run_test( + hf_runner, + vllm_runner, + inputs, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) @large_gpu_test(min_gb=48) @@ -309,8 
+330,10 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, - dtype, max_tokens, num_logprobs) -> None: + dtype, max_tokens, num_logprobs, + attn_backend: _Backend) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image @@ -325,14 +348,17 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, [stop_sign], [stop_sign, cherry_blossom], ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + _run_test( + hf_runner, + vllm_runner, + inputs, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/test_config.py b/tests/test_config.py index 36c426d6c51f6..df382d22d83ec 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -243,6 +243,8 @@ def test_rope_customization(): assert longchat_model_config.max_model_len == 4096 +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Encoder Decoder models not supported on ROCm.") @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ ("facebook/opt-125m", False), ("facebook/bart-base", True), diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index e5c1d28e6e7ea..db7ee7b2d8537 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -32,6 +32,8 @@ import vllm.distributed.parallel_state as ps from vllm.attention import Attention, AttentionMetadata, AttentionType +from vllm.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.attention.backends.xformers import XFormersMetadata from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size @@ -799,12 +801,13 @@ def forward( q = self.q_norm(q) if attention_mask is not None: - output = self.attention_with_mask(q, k, v, kv_cache, - attention_mask, - kv_range_for_decode, - attn_metadata) + output = self._attention_with_mask(q, k, v, kv_cache, + attention_mask, + kv_range_for_decode, + attn_metadata) else: - output = self.attn(q, + output = self.attn(q.view(-1, + self.num_local_heads * self.head_dim), k, v, kv_cache, @@ -813,7 +816,7 @@ def forward( out, _ = self.o_proj(output) return out - def attention_with_mask( + def _attention_with_mask( self, q: torch.Tensor, k: torch.Tensor, @@ -824,14 +827,35 @@ def attention_with_mask( attn_metadata: AttentionMetadata, ) -> torch.Tensor: # Skip writing kv-cache for the initial profiling run. 
- if len(kv_cache.shape) == 3: - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_local_key_value_heads, self.head_dim) - cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) - cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) - PagedAttention.write_to_paged_cache( - cached_k, cached_v, key_cache, value_cache, - attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0) + if len(kv_cache.shape) > 1: + if isinstance(attn_metadata, FlashAttentionMetadata): + cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) + cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) + torch.ops._C_cache_ops.reshape_and_cache_flash( + cached_k, + cached_v, + kv_cache[0], + kv_cache[1], + attn_metadata. + cross_slot_mapping, # type: ignore[union-attr] + "auto", + 1.0, + 1.0, + ) + elif isinstance(attn_metadata, XFormersMetadata): + key_cache, value_cache = PagedAttention.split_kv_cache( + kv_cache, self.num_local_key_value_heads, self.head_dim) + cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) + cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) + PagedAttention.write_to_paged_cache( + cached_k, cached_v, key_cache, value_cache, + attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0) + else: + raise ValueError( + f"Unsupported AttentionMetadata {type(attn_metadata)} " + f"class found. Expected the AttentionMetadata to " + f"be either XFormersMetadata or FlashAttentionMetadata.") + # We have to call torch.sdpa for prefill when using a # custom cross-attention mask. Because the mask is not a # standard causal mask, neither a block diagonal mask which diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 008e0c9745994..82824faa6629a 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -9,15 +9,13 @@ AttentionMetadata) from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.attention.selector import (_Backend, get_env_variable_attn_backend, - get_global_forced_attn_backend, - global_force_attn_backend) -from vllm.config import ModelConfig, VllmConfig + get_global_forced_attn_backend) +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.utils import get_architecture_class_name from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.sampling_params import SamplingParams @@ -35,11 +33,6 @@ logger = init_logger(__name__) -# The Mllama model has PagedAttention specific logic because of which it -# can only be run with the XFORMERS backend -# TODO Make Mllama model work with Flash Attention backend. -_XFORMERS_ONLY_ENCODER_DECODER_ARCHS = ["MllamaForConditionalGeneration"] - @dataclasses.dataclass(frozen=True) class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): @@ -97,7 +90,7 @@ def __init__( models) but these arguments are present here for compatibility with the base-class constructor. 
''' - self._maybe_force_supported_attention_backend(vllm_config.model_config) + self._maybe_force_supported_attention_backend() super().__init__( vllm_config=vllm_config, @@ -108,12 +101,7 @@ def __init__( # Crash for unsupported encoder/scenarios assert_enc_dec_mr_supported_scenario(self) - def _is_xformers_only_encoder_decoder_model(self, - model: ModelConfig) -> bool: - return get_architecture_class_name( - model) in _XFORMERS_ONLY_ENCODER_DECODER_ARCHS - - def _maybe_force_supported_attention_backend(self, model: ModelConfig): + def _maybe_force_supported_attention_backend(self): ''' Force vLLM to use the XFormers attention backend, which is currently the only supported option. @@ -128,23 +116,13 @@ def raise_backend_err(): maybe_global_forced_backend = get_global_forced_attn_backend() is_forced_by_global = maybe_global_forced_backend is not None is_forced_by_env_var = maybe_env_var_forced_backend is not None - - if not (is_forced_by_global or is_forced_by_env_var) \ - and self._is_xformers_only_encoder_decoder_model(model): - # The user has not already specified an attention backend - # override - logger.info( - "Encoder-Decoder Model Architecture %s requires XFormers " - "backend; overriding backend auto-selection and " - "forcing XFormers.", get_architecture_class_name(model)) - global_force_attn_backend(_Backend.XFORMERS) - elif is_forced_by_global: + if is_forced_by_global: # noqa: SIM102 # Backend override enforced by global variable takes # precedence over vLLM backend environment variable. if maybe_global_forced_backend not in\ [_Backend.XFORMERS, _Backend.FLASH_ATTN]: raise_backend_err() - elif is_forced_by_env_var: + elif is_forced_by_env_var: # noqa: SIM102 # Backend override enforced by vLLM backend # environment variable if maybe_env_var_forced_backend not in\ From 8a06428c70657b3310a317b3caf3c562b0e042ae Mon Sep 17 00:00:00 2001 From: Umesh Date: Tue, 12 Nov 2024 11:08:40 -0800 Subject: [PATCH 05/20] [LoRA] Adds support for bias in LoRA (#5733) Signed-off-by: Umesh Deshpande Co-authored-by: Umesh Deshpande --- tests/lora/conftest.py | 5 + tests/lora/test_lora_bias_e2e.py | 52 ++++++ tests/lora/test_utils.py | 14 +- vllm/config.py | 1 + vllm/engine/arg_utils.py | 5 + vllm/lora/fully_sharded_layers.py | 33 ++++ vllm/lora/layers.py | 296 +++++++++++++++++++++++++++++- vllm/lora/lora.py | 17 +- vllm/lora/models.py | 36 +++- vllm/lora/utils.py | 17 +- 10 files changed, 456 insertions(+), 20 deletions(-) create mode 100644 tests/lora/test_lora_bias_e2e.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 816d3986fe333..29ecf37808205 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -152,6 +152,11 @@ def sql_lora_files(sql_lora_huggingface_id): return snapshot_download(repo_id=sql_lora_huggingface_id) +@pytest.fixture(scope="session") +def lora_bias_files(): + return snapshot_download(repo_id="followumesh/granite-3b-lora8-bias") + + @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py new file mode 100644 index 0000000000000..c2520c847d873 --- /dev/null +++ b/tests/lora/test_lora_bias_e2e.py @@ -0,0 +1,52 @@ +from typing import List + +import pytest + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "ibm-granite/granite-3b-code-base" + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question 
based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + generated_texts: List[str] = [] + for output in outputs: + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + return generated_texts + + +@pytest.mark.parametrize("lora_bias", [True]) +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_lora_rank=8, + max_loras=1, + enable_lora_bias=lora_bias, + tensor_parallel_size=1, + fully_sharded_loras=fully_sharded) + + print("lora adapter created") + output1 = do_sample(llm, lora_bias_files, lora_id=0) + + print("lora") + output2 = do_sample(llm, lora_bias_files, lora_id=1) + + if lora_bias: + assert output1 != output2 + else: + assert output1 == output2 diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index db02bacdb6439..85110b8fa8cd2 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -12,36 +12,40 @@ def test_parse_fine_tuned_lora_name_valid(): fixture = { - ("base_model.model.lm_head.lora_A.weight", "lm_head", True), - ("base_model.model.lm_head.lora_B.weight", "lm_head", False), + ("base_model.model.lm_head.lora_A.weight", "lm_head", True, False), + ("base_model.model.lm_head.lora_B.weight", "lm_head", False, False), ( "base_model.model.model.embed_tokens.lora_embedding_A", "model.embed_tokens", True, + False, ), ( "base_model.model.model.embed_tokens.lora_embedding_B", "model.embed_tokens", False, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight", "model.layers.9.mlp.down_proj", True, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight", "model.layers.9.mlp.down_proj", False, + False, ), } - for name, module_name, is_lora_a in fixture: - assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name) + for name, module_name, is_lora_a, is_bias in fixture: + assert (module_name, is_lora_a, + is_bias) == parse_fine_tuned_lora_name(name) def test_parse_fine_tuned_lora_name_invalid(): fixture = { - "weight", "base_model.weight", "base_model.model.weight", } diff --git a/vllm/config.py b/vllm/config.py index b354fb61d7b7e..5ba1c41fcaac1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1687,6 +1687,7 @@ class LoRAConfig: # This is a constant. 
lora_vocab_padding_size: ClassVar[int] = 256 long_lora_scaling_factors: Optional[Tuple[float]] = None + bias_enabled: bool = False def __post_init__(self): # Setting the maximum rank to 256 should be able to satisfy the vast diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1591059a89f92..27f62b0008578 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -143,6 +143,7 @@ class EngineArgs: limit_mm_per_prompt: Optional[Mapping[str, int]] = None mm_processor_kwargs: Optional[Dict[str, Any]] = None enable_lora: bool = False + enable_lora_bias: bool = False max_loras: int = 1 max_lora_rank: int = 16 enable_prompt_adapter: bool = False @@ -584,6 +585,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--enable-lora', action='store_true', help='If True, enable handling of LoRA adapters.') + parser.add_argument('--enable-lora-bias', + action='store_true', + help='If True, enable bias for LoRA adapters.') parser.add_argument('--max-loras', type=int, default=EngineArgs.max_loras, @@ -1148,6 +1152,7 @@ def create_engine_config(self) -> VllmConfig: and parallel_config.use_ray), policy=self.scheduling_policy) lora_config = LoRAConfig( + bias_enabled=self.enable_lora_bias, max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, fully_sharded_loras=self.fully_sharded_loras, diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index a7887a048746a..04fc635828d4d 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -70,6 +70,14 @@ def apply(self, x: torch.Tensor, self.lora_b_stacked, add_input=True) # now have column partitioned output + + if self.bias_stacked is not None: + self.bias_stacked = self.bias_stacked.view( + -1, self.bias_stacked.shape[-1]) + self.bias_stacked = self.bias_stacked[ + self.punica_wrapper.token_lora_indices] + output += self.bias_stacked + output = output.view(*out_orig_shape) return output @@ -121,6 +129,15 @@ def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] + + if layer.bias_stacked is not None: + bias = layer.bias_stacked[idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[layer.punica_wrapper.token_lora_indices] + bias[layer.punica_wrapper.token_lora_indices == -1] = 0 + output[:, left_offset:left_offset + shard_size] += bias + layer.punica_wrapper.add_expand_slice( output, buffers[idx], @@ -295,6 +312,15 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = lora_b[:, start_idx:end_idx] return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + if bias is None: + return bias + shard_size = self.bias_stacked.shape[2] + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) @@ -318,6 +344,13 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size + + if self.bias_stacked is not None: + bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1]) + bias = bias[self.punica_wrapper.token_lora_indices] + bias[self.punica_wrapper.token_lora_indices == -1] = 0 + output += bias + self.punica_wrapper.add_expand_slice(output, buffer, self.lora_b_stacked, start_idx, shard_size) 
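The per-token bias handling above in fully_sharded_layers.py (and the apply_bias helpers added to vllm/lora/layers.py below) all follow the same gather-then-mask pattern: pick one bias row per token via token_lora_indices, zero the rows for tokens with no active adapter (index -1), and add the result to the base layer output. A minimal standalone sketch with toy shapes, not the vLLM implementation itself:

    import torch

    num_loras, hidden_dim, num_tokens = 2, 4, 3
    bias_stacked = torch.randn(num_loras, hidden_dim)   # one bias row per adapter slot
    token_lora_indices = torch.tensor([0, 1, -1])       # adapter id per token; -1 means no LoRA
    output = torch.zeros(num_tokens, hidden_dim)        # stand-in for the base layer's output

    bias = bias_stacked[token_lora_indices]             # gather one row per token (-1 picks the last row)
    bias[token_lora_indices == -1] = 0                  # then zero it out for the LoRA-less tokens
    output += bias                                      # toy version of what apply_bias does
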
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 6254c67596e65..7429c60e0222d 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -67,6 +67,63 @@ def dec(*args, **kwargs): return dec +def apply_bias( + indices: torch.Tensor, + output: torch.Tensor, + bias_stacked: torch.Tensor, +): + """Applies bias to output + + Input shapes: + bias_stacked: (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, output_dim) + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) + bias_stacked = bias_stacked[indices] + bias_stacked[indices == -1] = 0 + output += bias_stacked + + return output.view_as(org_output) + + +def apply_bias_packed_nslice( + indices: torch.Tensor, + output: torch.Tensor, + output_slices: Tuple[int, ...], + bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], +): + """Applies bias to output + + Input shapes: + bias_stacked: 3 element tuple of (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, q_slice_size + 2*kv_slice_size) + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + offset_left = 0 + for slice_idx, slice in enumerate(output_slices): + bias = bias_stacked[slice_idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[indices] + bias[indices == -1] = 0 + output[:, offset_left:offset_left + slice] += bias + + offset_left += slice + + return output.view_as(org_output) + + @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -105,6 +162,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): """Overwrites lora tensors at index.""" ... 
@@ -203,6 +261,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( @@ -299,10 +358,22 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def set_lora( self, @@ -310,6 +381,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) @@ -319,10 +391,21 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -401,11 +484,25 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) + + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None + self.output_dim = self.lora_b_stacked.shape[2] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -418,18 +515,30 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = lora_b[:, start_idx:end_idx] return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + if bias is None: + return bias + tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.output_dim + start_idx = tensor_model_parallel_rank * shard_size + end_idx = (tensor_model_parallel_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -437,10 +546,21 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + 
self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -534,6 +654,17 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) for _ in range(n_slices)) + if lora_config.bias_enabled: + self.bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + self.output_size // 2, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(n_slices)) + else: + self.bias_stacked = None self.output_dim = self.lora_b_stacked[0].shape[2] @@ -542,6 +673,9 @@ def reset_lora(self, index: int): self.lora_a_stacked[1][index] = 0 self.lora_b_stacked[0][index] = 0 self.lora_b_stacked[1][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[0][index] = 0 + self.bias_stacked[1][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -562,18 +696,32 @@ def slice_lora_b( ] return lora_b + def slice_bias( + self, bias: List[Union[torch.Tensor, + None]]) -> List[Union[torch.Tensor, None]]: + if bias[0] is None or bias[1] is None: + return bias + shard_size = self.output_dim + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + bias = [bias[0][start_idx:end_idx], bias[1][start_idx:end_idx]] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) if lora_a[0] is not None: self.lora_a_stacked[0][ @@ -582,6 +730,10 @@ def set_lora( self.lora_b_stacked[0][ index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( lora_b[0].T, non_blocking=True) + if bias is not None and bias[0] is not None: + self.bias_stacked[0][index, + 0, :bias[0].shape[0]].copy_(bias[0].T, + non_blocking=True) if lora_a[1] is not None: self.lora_a_stacked[1][ index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( @@ -589,10 +741,22 @@ def set_lora( self.lora_b_stacked[1][ index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( lora_b[1].T, non_blocking=True) + if bias is not None and bias[1] is not None: + self.bias_stacked[1][index, + 0, :bias[1].shape[0]].copy_(bias[1].T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias_packed_nslice( + self.indices, + output, + (self.output_dim, self.output_dim), + self.bias_stacked, + ) self.punica_wrapper.add_lora_packed_nslice( output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, (self.output_dim, self.output_dim)) @@ -654,17 +818,35 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + bias_q = bias[self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + k_offset = self.q_proj_total_size + bias_k = bias[k_offset + + self.kv_proj_shard_size * self.kv_shard_id:k_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + v_offset = k_offset + self.kv_proj_total_size + bias_v = bias[v_offset + + self.kv_proj_shard_size * self.kv_shard_id:v_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + bias 
= torch.cat([bias_q, bias_k, bias_v], dim=1) + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -672,6 +854,10 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -768,6 +954,32 @@ def create_lora_weights( device=self.device, ), ) + if lora_config.bias_enabled: + self.bias_stacked = ( + torch.zeros( + max_loras, + 1, + self.q_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + ) + else: + self.bias_stacked = None self.output_slices = ( self.q_proj_shard_size, @@ -787,6 +999,10 @@ def reset_lora(self, index: int): self.lora_b_stacked[1][index] = 0 self.lora_a_stacked[2][index] = 0 self.lora_b_stacked[2][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[0][index] = 0 + self.bias_stacked[1][index] = 0 + self.bias_stacked[2][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -812,18 +1028,40 @@ def slice_lora_b( lora_b = [lora_b_q, lora_b_k, lora_b_v] return lora_b + def slice_bias( + self, bias: List[Union[torch.Tensor, + None]]) -> List[Union[torch.Tensor, None]]: + bias_q, bias_k, bias_v = bias + if bias_q is not None: + bias_q = bias_q[self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + if bias_k is not None: + bias_k = bias_k[self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + if bias_v is not None: + bias_v = bias_v[self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + bias = [bias_q, bias_k, bias_v] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) if lora_b[0] is not None: lora_b_q = lora_b[0] @@ -854,9 +1092,28 @@ def set_lora( index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( lora_a[2].T, non_blocking=True) + if bias is not None: + if bias[0] is not None: + self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( + bias[0].T, non_blocking=True) + if bias[1] is not None: + self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_( + bias[1].T, non_blocking=True) + if bias[2] is not None: + self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( + bias[2].T, non_blocking=True) + def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias_packed_nslice( + self.indices, + 
output, + self.output_slices, + self.bias_stacked, + ) self.punica_wrapper.add_lora_packed_nslice(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, @@ -919,9 +1176,27 @@ def create_lora_weights( device=self.device, ) + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + ( + max_loras, + 1, + self.output_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None + # Lazily initialized + self.indices: torch.Tensor + self.indices_len: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tensor_model_parallel_rank = get_tensor_model_parallel_rank() @@ -934,18 +1209,24 @@ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.base_layer.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -953,9 +1234,20 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -1132,6 +1424,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) self.lora_a_stacked[index, @@ -1199,7 +1492,7 @@ def _get_logits( neginf=float("-inf"))) logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + - lora_logits.shape[1], ] = lora_logits + lora_logits.shape[1]] = lora_logits # LogitsProcessorWithLoRA always using bgmv self.punica_wrapper.add_lora_logits(logits, hidden_states, @@ -1276,6 +1569,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): ... 
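The name-parsing change to vllm/lora/utils.py further down in this patch makes parse_fine_tuned_lora_name return a (module_name, is_lora_a, is_bias) triple instead of a pair. A small standalone re-implementation of that contract is sketched below; the weight key comes from the updated test fixture, while the bias key is a hypothetical example inferred from the parts[2:-2] stripping, not a name taken from this series:

    def parse_name(name: str):
        # mirrors the (module_name, is_lora_a, is_bias) contract of the patched helper
        parts = name.split(".")
        if parts[-1] == "weight" and parts[-2] in ("lora_A", "lora_B"):
            return ".".join(parts[2:-2]), parts[-2] == "lora_A", False
        if parts[-1] in ("lora_embedding_A", "lora_embedding_B"):
            return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A", False
        if parts[-1] == "bias":
            return ".".join(parts[2:-2]), False, True
        raise ValueError(f"{name} is unsupported LoRA weight")

    # key taken from the updated tests/lora/test_utils.py fixture:
    assert parse_name("base_model.model.lm_head.lora_A.weight") == ("lm_head", True, False)
    # hypothetical bias key (format inferred, not taken from this series):
    assert parse_name("base_model.model.lm_head.lora_B.bias") == ("lm_head", False, True)
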
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 14081b5ba441c..b648312ba76ec 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -17,6 +17,7 @@ def __init__( lora_alpha: int, lora_a: torch.Tensor, lora_b: torch.Tensor, + bias: Optional[torch.Tensor] = None, embeddings_tensor: Optional[torch.Tensor] = None, scaling: Optional[float] = None, ) -> None: @@ -25,6 +26,7 @@ def __init__( self.lora_alpha = lora_alpha self.lora_a = lora_a self.lora_b = lora_b + self.bias = bias self.embeddings_tensor = embeddings_tensor if scaling is None: @@ -66,7 +68,8 @@ def create_dummy_lora_weights( rank: int, dtype: torch.dtype, device: torch.types.Device, - embeddings_tensor_dim: Optional[int] = None) -> "LoRALayerWeights": + embeddings_tensor_dim: Optional[int] = None, + bias_enabled: Optional[bool] = False) -> "LoRALayerWeights": pin_memory = str(device) == "cpu" and is_pin_memory_available() lora_a = torch.zeros([input_dim, rank], dtype=dtype, @@ -76,6 +79,14 @@ def create_dummy_lora_weights( dtype=dtype, device=device, pin_memory=pin_memory) + if bias_enabled: + bias = torch.zeros([output_dim], + dtype=dtype, + device=device, + pin_memory=pin_memory) + else: + bias = None + embeddings_tensor = torch.rand( 10, embeddings_tensor_dim, @@ -88,6 +99,7 @@ def create_dummy_lora_weights( lora_alpha=1, lora_a=lora_a, lora_b=lora_b, + bias=bias, embeddings_tensor=embeddings_tensor, ) @@ -102,6 +114,7 @@ def __init__( lora_alphas: List[Optional[int]], lora_a: List[Optional[torch.Tensor]], lora_b: List[Optional[torch.Tensor]], + bias: Optional[List[Optional[torch.Tensor]]] = None, scaling: Optional[List[float]] = None, ) -> None: super().__init__( @@ -110,6 +123,7 @@ def __init__( lora_alpha=0, lora_a=lora_a, lora_b=lora_b, + bias=bias, scaling=scaling, # type: ignore embeddings_tensor=None, ) @@ -141,6 +155,7 @@ def pack( [lora.lora_alpha if lora is not None else None for lora in loras], [lora.lora_a if lora is not None else None for lora in loras], [lora.lora_b if lora is not None else None for lora in loras], + [lora.bias if lora is not None else None for lora in loras], scaling=[ 1 if lora is not None else None # type: ignore for lora in loras diff --git a/vllm/lora/models.py b/vllm/lora/models.py index eafc3a43a2846..2ffefe61427e3 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Type +from typing import Any, Callable, Dict, List, Optional, Sequence, Type import safetensors.torch import torch @@ -119,7 +119,8 @@ def from_lora_tensors( pin_memory = str(device) == "cpu" and is_pin_memory_available() loras: Dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): - module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name) + module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name( + tensor_name) if module_name not in loras: lora_embeddings_tensor = None if embeddings: @@ -136,8 +137,16 @@ def from_lora_tensors( lora_embeddings_tensor.pin_memory()) loras[module_name] = LoRALayerWeights(module_name, rank, lora_alpha, None, None, + None, lora_embeddings_tensor) - if is_lora_a: + if is_bias: + loras[module_name].bias = tensor.to(device=device, + dtype=dtype).t() + bias = tensor.to(device=device, dtype=dtype).t() + if pin_memory: + bias = bias.pin_memory() + loras[module_name].bias = bias + elif is_lora_a: loras[module_name].lora_a = tensor.to(device=device, dtype=dtype).t() if pin_memory: @@ -215,7 +224,7 @@ def 
from_local_checkpoint( with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore for lora_module in f.keys(): # noqa - module_name, _ = parse_fine_tuned_lora_name(lora_module) + module_name, _, _ = parse_fine_tuned_lora_name(lora_module) part_name = module_name.split(".")[-1] if part_name not in expected_lora_modules: unexpected_modules.append(module_name) @@ -386,8 +395,19 @@ def activate_adapter( module_lora = lora_model.get_lora(module_name) if module_lora: module_lora.optimize() + # Bias is not explicitly enabled with the flag enable_lora_bias. + bias = module_lora.bias + if ((torch.is_tensor(bias) or + (isinstance(bias, Sequence) and any(b is not None + for b in bias))) + and not self.lora_config.bias_enabled): + module_lora.bias = None + raise ValueError( + f"Adapter bias cannot be used for {module_name}" + " without --enable-lora-bias.") module.set_lora(index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor) + module_lora.embeddings_tensor, + module_lora.bias) else: module.reset_lora(index) return True @@ -509,6 +529,7 @@ def create_dummy_lora( """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): + bias_enabled = self.lora_config.bias_enabled if (not self._match_target_modules(module_name) or not isinstance(module, BaseLayerWithLoRA) or isinstance(module, LinearScalingRotaryEmbeddingWithLora) @@ -536,7 +557,8 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", - embeddings_tensor_dim=embeddings_tensor_dim) + embeddings_tensor_dim=embeddings_tensor_dim, + bias_enabled=bias_enabled) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, @@ -545,6 +567,7 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", + bias_enabled=bias_enabled, ) lora.optimize() else: @@ -559,6 +582,7 @@ def create_dummy_lora( rank, module.lora_a_stacked[i].dtype, "cpu", + bias_enabled=bias_enabled, ) lora.optimize() subloras.append(lora) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index a780429f413d3..5876494ce2824 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -91,7 +91,7 @@ def replace_submodule(model: nn.Module, module_name: str, return new_module -def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: +def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool, bool]: """Parse the name of lora weights. args: @@ -101,15 +101,18 @@ def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: Tuple(module_name, is_lora_a): module_name: the name of the module, e.g. model.dense1, is_lora_a whether the tensor is lora_a or lora_b. + is_bias whether the tensor is lora bias. 
""" parts = name.split(".") + if parts[-1] == "weight" and (parts[-2] == "lora_A" + or parts[-2] == "lora_B"): + return ".".join(parts[2:-2]), parts[-2] == "lora_A", False - if len(parts) >= 2 and parts[0] == "base_model" and parts[1] == "model": - if parts[-1] == "weight": - if parts[-2] == "lora_A" or parts[-2] == "lora_B": - return ".".join(parts[2:-2]), parts[-2] == "lora_A" - elif parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": - return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A" + if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": + return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A", False + + if parts[-1] == "bias": + return ".".join(parts[2:-2]), False, True raise ValueError(f"{name} is unsupported LoRA weight") From 1f55e0571350f3dd2c04638e13e52d8ed557d93e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 13:39:56 -0800 Subject: [PATCH 06/20] [V1] Enable Inductor when using piecewise CUDA graphs (#10268) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2c40853742ac9..db676e2819bf4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -404,14 +404,17 @@ def execute_model( def load_model(self) -> None: if self.use_cuda_graph: - # FIXME(woosuk): Currently, we do not use inductor to reduce the - # compilation time and any potential issues with the inductor. - os.environ["VLLM_CUSTOM_OPS"] = "all" + # NOTE(woosuk): Currently, we use inductor because the piecewise + # CUDA graphs do not work properly with the custom CUDA kernels. + # FIXME(woosuk): Disable inductor to reduce the compilation time + # and avoid any potential issues with the inductor. + os.environ["VLLM_CUSTOM_OPS"] = "none" set_compilation_config( CompilationConfig( use_cudagraph=True, non_cudagraph_ops=["vllm.unified_v1_flash_attention"], - use_inductor=False, + use_inductor=True, + enable_fusion=False, )) logger.info("Starting to load model %s...", self.model_config.model) From 96ae0eaeb270be8741abb30f2251670b4554e886 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 14:34:39 -0800 Subject: [PATCH 07/20] [doc] fix location of runllm widget (#10266) Signed-off-by: youkaichao --- docs/source/_static/custom.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index dac40ca2cfe75..18b502c786e1d 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -8,7 +8,9 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("version", "stable"); script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 
script.setAttribute("runllm-name", "vLLM"); - script.setAttribute("runllm-position", "TOP_RIGHT"); + script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-position-y", "20%"); + script.setAttribute("runllm-position-x", "3%"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; From 18081451f9f5dd3ae476ff1e217d5573832b2604 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 14:43:52 -0800 Subject: [PATCH 08/20] [doc] improve debugging doc (#10270) Signed-off-by: youkaichao --- docs/source/getting_started/debugging.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 060599680be25..77bf550601346 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -20,6 +20,10 @@ Hangs loading a model from disk If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. +.. note:: + + To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. + Model is too large ---------------------------------------- If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. 
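The "--load-format dummy" tip added to the debugging guide above can also be exercised from the offline API. The following is a minimal sketch, assuming the LLM entrypoint forwards load_format the same way the server flag does; the model name is only a placeholder for whichever checkpoint is slow to start.

from vllm import LLM, SamplingParams

# "dummy" skips reading weights from disk and fills tensors with random
# values, so a fast startup here points at weight loading (disk or
# network filesystem) as the real bottleneck rather than engine setup.
llm = LLM(model="facebook/opt-125m", load_format="dummy")

# Outputs are meaningless with random weights; only the timing matters.
print(llm.generate(["Hello"], SamplingParams(max_tokens=8)))

If startup remains slow even with dummy weights, the delay is likely elsewhere, for example in CUDA graph capture or KV-cache profiling, rather than in reading the checkpoint.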
From 377b74fe877c7eb4632c2ca0778b9da9a5db8ae6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 15:06:48 -0800 Subject: [PATCH 09/20] Revert "[ci][build] limit cmake version" (#10271) --- Dockerfile.neuron | 2 +- Dockerfile.ppc64le | 2 +- docs/source/getting_started/cpu-installation.rst | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements-tpu.txt | 2 +- requirements-xpu.txt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 47e40e015239a..2143315d2a078 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -31,7 +31,7 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-neuron.txt ENV VLLM_TARGET_DEVICE neuron diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index c2a40000aab4b..b19c6ddec7948 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 6bf170b164fb8..69530fd778c55 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -62,7 +62,7 @@ Build from source .. 
code-block:: console $ pip install --upgrade pip - $ pip install cmake>=3.26,<=3.30 wheel packaging ninja "setuptools-scm>=8" numpy + $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - Finally, build and install vLLM CPU backend: diff --git a/pyproject.toml b/pyproject.toml index 3be401daa44c7..3c8c46cc8621e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] # Should be mirrored in requirements-build.txt requires = [ - "cmake>=3.26,<=3.30", + "cmake>=3.26", "ninja", "packaging", "setuptools>=61", diff --git a/requirements-build.txt b/requirements-build.txt index 64b92861df25d..fec01caaf25ef 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,5 +1,5 @@ # Should be mirrored in pyproject.toml -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools>=61 diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 94a3225dcf479..f9a0770804e55 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for TPU -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools-scm>=8 diff --git a/requirements-xpu.txt b/requirements-xpu.txt index 479cb4bb18484..e41295792283f 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt ray >= 2.9 -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools-scm>=8 From 112fa0bbe5e5354f592a42913a4e6d72e0407b93 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 16:17:20 -0800 Subject: [PATCH 10/20] [V1] Fix CI tests on V1 engine (#10272) Signed-off-by: Woosuk Kwon --- tests/v1/engine/test_engine_core.py | 3 +++ tests/v1/engine/test_engine_core_client.py | 3 +++ vllm/v1/engine/core.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 8451aac33acc4..b3692b594326a 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -27,6 +27,9 @@ def make_request() -> EngineCoreRequest: request_id=uuid.uuid4(), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, sampling_params=SamplingParams(), eos_token_id=None, arrival_time=time.time(), diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index d582101a1164f..7b241bf836a0e 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -29,6 +29,9 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: request_id=str(uuid.uuid4()), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, sampling_params=params, eos_token_id=None, arrival_time=time.time(), diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 808c3936b6c35..428483bdb29cb 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -317,7 +317,7 @@ def process_input_socket(self, input_path: str): # Msgpack serialization decoding. 
decoder_add_req = PickleEncoder() - decoder_abort_req = msgpack.Decoder(list[str]) + decoder_abort_req = PickleEncoder() with self.make_socket(input_path, zmq.constants.PULL) as socket: while True: From 0d4ea3fb5c8c499b70cea8b1deee3e34a147cff1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 17:36:08 -0800 Subject: [PATCH 11/20] [core][distributed] use tcp store directly (#10275) Signed-off-by: youkaichao --- tests/distributed/test_utils.py | 26 ++++++++++++++++---------- vllm/distributed/utils.py | 28 +++++++++++++--------------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 5d77d8abb4718..50444d3abfaf2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -43,12 +43,15 @@ def test_cuda_device_count_stateless(): def cpu_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) if rank <= 2: - pg2 = StatelessProcessGroup.create( - init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3) + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + rank=rank, + world_size=3) data = torch.tensor([rank]) data = pg1.broadcast_obj(data, src=2) assert data.item() == 2 @@ -62,14 +65,17 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2): def gpu_worker(rank, WORLD_SIZE, port1, port2): torch.cuda.set_device(rank) - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) pynccl1 = PyNcclCommunicator(pg1, device=rank) pynccl1.disabled = False if rank <= 2: - pg2 = StatelessProcessGroup.create( - init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3) + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + rank=rank, + world_size=3) pynccl2 = PyNcclCommunicator(pg2, device=rank) pynccl2.disabled = False data = torch.tensor([rank]).cuda() @@ -89,7 +95,8 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2): def broadcast_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) if rank == 2: @@ -101,7 +108,8 @@ def broadcast_worker(rank, WORLD_SIZE, port1, port2): def allgather_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) data = pg1.all_gather_obj(rank) @@ -109,8 +117,6 @@ def allgather_worker(rank, WORLD_SIZE, port1, port2): pg1.barrier() -# TODO: investigate why this test is flaky. It hangs during initialization. 
-@pytest.mark.skip("Skip the test because it is flaky.") @multi_gpu_test(num_gpus=4) @pytest.mark.parametrize( "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index a77b41322f376..dcfcb848cbe06 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -9,7 +9,7 @@ from typing import Any, Deque, Dict, Optional, Sequence, Tuple import torch -from torch.distributed.rendezvous import rendezvous +from torch.distributed import TCPStore import vllm.envs as envs from vllm.logger import init_logger @@ -97,7 +97,6 @@ class StatelessProcessGroup: group. Only use it to communicate metadata between processes. For data-plane communication, create NCCL-related objects. """ - prefix: str rank: int world_size: int store: torch._C._distributed_c10d.Store @@ -127,7 +126,7 @@ def __post_init__(self): def send_obj(self, obj: Any, dst: int): """Send an object to a destination rank.""" self.expire_data() - key = f"{self.prefix}/send_to/{dst}/{self.send_dst_counter[dst]}" + key = f"send_to/{dst}/{self.send_dst_counter[dst]}" self.store.set(key, pickle.dumps(obj)) self.send_dst_counter[dst] += 1 self.entries.append((key, time.time())) @@ -147,8 +146,7 @@ def recv_obj(self, src: int) -> Any: """Receive an object from a source rank.""" obj = pickle.loads( self.store.get( - f"{self.prefix}/send_to/{self.rank}/{self.recv_src_counter[src]}" - )) + f"send_to/{self.rank}/{self.recv_src_counter[src]}")) self.recv_src_counter[src] += 1 return obj @@ -159,14 +157,14 @@ def broadcast_obj(self, obj: Optional[Any], src: int) -> Any: """ if self.rank == src: self.expire_data() - key = (f"{self.prefix}/broadcast_from/{src}/" + key = (f"broadcast_from/{src}/" f"{self.broadcast_send_counter}") self.store.set(key, pickle.dumps(obj)) self.broadcast_send_counter += 1 self.entries.append((key, time.time())) return obj else: - key = (f"{self.prefix}/broadcast_from/{src}/" + key = (f"broadcast_from/{src}/" f"{self.broadcast_recv_src_counter[src]}") recv_obj = pickle.loads(self.store.get(key)) self.broadcast_recv_src_counter[src] += 1 @@ -194,7 +192,8 @@ def barrier(self): @staticmethod def create( - init_method: str, + host: str, + port: int, rank: int, world_size: int, data_expiration_seconds: int = 3600, @@ -214,15 +213,14 @@ def create( can call `StatelessProcessGroup.create` to form a group, and then process A, B, C, and D can call `StatelessProcessGroup.create` to form another group. 
""" # noqa - from torch._C._distributed_c10d import _DEFAULT_PG_TIMEOUT - timeout = _DEFAULT_PG_TIMEOUT - - store, rank, world_size = next( - rendezvous(init_method, rank, world_size, timeout=timeout)) - store.set_timeout(timeout) + store = TCPStore( + host_name=host, + port=port, + world_size=world_size, + is_master=(rank == 0), + ) return StatelessProcessGroup( - prefix=init_method, rank=rank, world_size=world_size, store=store, From bbd3e86926f15e59e4c62246b4b3185e71fe7ff2 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 20:53:13 -0800 Subject: [PATCH 12/20] [V1] Support VLMs with fine-grained scheduling (#9871) Signed-off-by: Woosuk Kwon Co-authored-by: Roger Wang --- vllm/model_executor/models/gpt2.py | 11 +- vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/models/llava.py | 46 +++--- vllm/model_executor/models/opt.py | 7 +- vllm/model_executor/models/phi3v.py | 63 +++++--- vllm/model_executor/models/qwen2.py | 7 +- vllm/v1/core/encoder_cache_manager.py | 48 ++++++ vllm/v1/core/scheduler.py | 205 +++++++++++++++++++++++--- vllm/v1/engine/core.py | 10 ++ vllm/v1/engine/mm_input_mapper.py | 39 +++++ vllm/v1/request.py | 41 +++++- vllm/v1/worker/gpu_model_runner.py | 154 ++++++++++++++++--- 12 files changed, 542 insertions(+), 96 deletions(-) create mode 100644 vllm/v1/core/encoder_cache_manager.py create mode 100644 vllm/v1/engine/mm_input_mapper.py diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index fcff7ec2e01eb..adf2a7a51f737 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -216,9 +216,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor], ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - inputs_embeds = self.wte(input_ids) + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds else: @@ -263,6 +265,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -270,9 +275,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2472128976d88..8aed0fead18f9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -538,6 +538,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): normalize=False, softmax=False) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -545,9 +548,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> 
Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index ca963fa1c52ea..af712bf8f9506 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -448,6 +449,25 @@ def _process_image_input(self, image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) + def process_mm_inputs(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.config.image_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -455,6 +475,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LLaVA-1.5. 
@@ -494,24 +515,13 @@ def forward( """ if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - else: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + elif inputs_embeds is None: + vision_embeddings = self.process_mm_inputs(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 58b6107eba347..997fe642439e6 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -360,6 +360,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -367,9 +370,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4b5dc944bce4b..de03d28638cda 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -39,6 +39,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of @@ -500,15 +501,20 @@ def input_processor_for_phi3v(ctx: InputContext, # TODO: Move this to utils or integrate with clip. 
new_token_ids: List[int] = [] + placeholder_ranges: List[PlaceholderRange] = [] placeholder_idx = 0 while merged_token_ids: token_id = merged_token_ids.pop(0) if token_id == _IMAGE_TOKEN_ID: - new_token_ids.extend( - repeat_and_pad_token( - _IMAGE_TOKEN_ID, - repeat_count=image_feature_size[placeholder_idx], - )) + replacement_ids = repeat_and_pad_token( + _IMAGE_TOKEN_ID, + repeat_count=image_feature_size[placeholder_idx], + ) + placeholder_ranges.append({ + "offset": len(new_token_ids), + "length": len(replacement_ids) + }) + new_token_ids.extend(replacement_ids) placeholder_idx += 1 else: new_token_ids.append(token_id) @@ -516,7 +522,8 @@ def input_processor_for_phi3v(ctx: InputContext, # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) @MULTIMODAL_REGISTRY.register_image_input_mapper() @@ -669,32 +676,42 @@ def _process_image_input( return image_embeds + def process_mm_inputs(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.embed_tokens(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.image_token_id) + return inputs_embeds + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object): if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.embed_tokens(input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - else: - inputs_embeds = self.language_model.model.embed_tokens( - input_ids) - - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + elif inputs_embeds is None: + vision_embeddings = self.process_mm_inputs(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 2195ce49aa9a7..b623c576bb673 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -441,6 +441,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -448,9 +451,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, 
intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py new file mode 100644 index 0000000000000..845bd5ea05e3c --- /dev/null +++ b/vllm/v1/core/encoder_cache_manager.py @@ -0,0 +1,48 @@ +from typing import Dict, List, Set, Tuple + +from vllm.v1.request import Request + + +class EncoderCacheManager: + + def __init__(self, cache_size: int): + self.cache_size = cache_size + self.num_free_slots = cache_size + # req_id -> cached input ids + self.cached: Dict[str, Set[int]] = {} + # List of [req_id, input_id] + self.freed: List[Tuple[str, int]] = [] + + def has_cache(self, request: Request, input_id: int) -> bool: + req_id = request.request_id + return req_id in self.cached and input_id in self.cached[req_id] + + def can_allocate(self, request: Request, input_id: int) -> bool: + num_tokens = request.get_num_encoder_tokens(input_id) + return num_tokens <= self.num_free_slots + + def allocate(self, request: Request, input_id: int) -> None: + req_id = request.request_id + if req_id not in self.cached: + self.cached[req_id] = set() + self.cached[req_id].add(input_id) + self.num_free_slots -= request.get_num_encoder_tokens(input_id) + + def get_cached_input_ids(self, request: Request) -> Set[int]: + return self.cached.get(request.request_id, set()) + + def free(self, request: Request, input_id: int) -> None: + req_id = request.request_id + if req_id not in self.cached: + return + + self.cached[req_id].discard(input_id) + if len(self.cached[req_id]) == 0: + del self.cached[req_id] + self.num_free_slots += request.get_num_encoder_tokens(input_id) + self.freed.append((req_id, input_id)) + + def get_freed_ids(self) -> List[Tuple[str, int]]: + freed = self.freed + self.freed = [] + return freed diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index ee860e792281d..ba50a9786d805 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -1,16 +1,21 @@ from collections import deque from dataclasses import dataclass -from typing import Deque, Dict, Iterable, List, Optional, Set, Union +from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, + Tuple, Union) from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger -from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import SamplingParams +from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.engine import EngineCoreOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus +if TYPE_CHECKING: + from vllm.multimodal import MultiModalKwargs + from vllm.multimodal.base import PlaceholderRange + logger = init_logger(__name__) @@ -61,12 +66,20 @@ def __init__( # Request id -> RunningRequestData self.running_reqs_data: Dict[str, RunningRequestData] = {} - def schedule(self) -> "SchedulerOutput": - scheduled_new_reqs: List[Request] = [] - scheduled_resumed_reqs: List[Request] = [] - scheduled_running_reqs: List[Request] = [] - preempted_reqs: List[Request] = [] + # Encoder-related. 
+ # NOTE(woosuk): Here, "encoder" includes the vision encoder (and + # projector if needed). Currently, we assume that the encoder also + # has the Transformer architecture (e.g., ViT). + # FIXME(woosuk): Below are placeholder values. We need to calculate the + # actual values from the configurations. + self.max_num_encoder_input_tokens = 2048 + # NOTE(woosuk): For the models without encoder (e.g., text-only models), + # the encoder cache will not be initialized and used, regardless of + # the cache size. This is because the memory space for the encoder cache + # is preallocated in the profiling run. + self.encoder_cache_manager = EncoderCacheManager(cache_size=2048) + def schedule(self) -> "SchedulerOutput": # NOTE(woosuk) on the scheduling algorithm: # There's no "decoding phase" nor "prefill phase" in the scheduler. # Each request just has the num_computed_tokens and num_tokens, @@ -74,23 +87,45 @@ def schedule(self) -> "SchedulerOutput": # At each step, the scheduler tries to assign tokens to the requests # so that each request's num_computed_tokens can catch up its # num_tokens. This is general enough to cover chunked prefills, - # prefix caching, and the "jump forward" optimization in the future. + # prefix caching, and the "jump decoding" optimization in the future. + + scheduled_new_reqs: List[Request] = [] + scheduled_resumed_reqs: List[Request] = [] + scheduled_running_reqs: List[Request] = [] + preempted_reqs: List[Request] = [] req_to_new_block_ids: Dict[str, List[int]] = {} num_scheduled_tokens: Dict[str, int] = {} token_budget = self.max_num_scheduled_tokens + # Encoder-related. + scheduled_encoder_inputs: Dict[str, List[int]] = {} + encoder_budget = self.max_num_encoder_input_tokens # First, schedule the RUNNING requests. + # NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to be + # in the "partial" state, where the request has some tokens computed + # but not all. The constraint is due to the persistent batch in the + # V1 model runner. + # TODO(woosuk): Remove this constraint after refactoring model runner. + has_partial_request = False req_index = 0 while req_index < len(self.running): - if token_budget == 0: - break - + # Only the last request in the RUNNING queue can be "partial". + assert not has_partial_request + assert token_budget > 0 request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 + # Schedule encoder inputs. + encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = ( + self._try_schedule_encoder_inputs(request, + request.num_computed_tokens, + num_new_tokens, + encoder_budget)) + assert num_new_tokens > 0 + while True: new_blocks = self.kv_cache_manager.append_slots( request, num_new_tokens) @@ -106,22 +141,40 @@ def schedule(self) -> "SchedulerOutput": preempted_reqs.append(preempted_req) if preempted_req == request: # No more request to preempt. + can_schedule = False break else: # The request can be scheduled. - scheduled_running_reqs.append(request) - - req_to_new_block_ids[request.request_id] = [ - b.block_id for b in new_blocks - ] - num_scheduled_tokens[request.request_id] = num_new_tokens - token_budget -= num_new_tokens - req_index += 1 + can_schedule = True break + if not can_schedule: + break + + # Schedule the request. 
+ scheduled_running_reqs.append(request) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in new_blocks + ] + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + req_index += 1 + has_partial_request = (request.num_computed_tokens + num_new_tokens + < request.num_tokens) + + # Encoder-related. + if encoder_inputs_to_schedule: + scheduled_encoder_inputs[request.request_id] = ( + encoder_inputs_to_schedule) + # Allocate the encoder cache. + for i in encoder_inputs_to_schedule: + self.encoder_cache_manager.allocate(request, i) + encoder_budget = new_encoder_budget # Next, schedule the WAITING requests. if not preempted_reqs: while self.waiting: + if has_partial_request: + break if len(self.running) == self.max_num_running_reqs: break if token_budget == 0: @@ -149,12 +202,21 @@ def schedule(self) -> "SchedulerOutput": computed_blocks.pop() num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 + + # Schedule encoder inputs. + (encoder_inputs_to_schedule, num_new_tokens, + new_encoder_budget) = self._try_schedule_encoder_inputs( + request, num_computed_tokens, num_new_tokens, + encoder_budget) + if num_new_tokens == 0: + # The request cannot be scheduled. + break + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens, computed_blocks) if new_blocks is None: # The request cannot be scheduled. break - request.num_computed_tokens = num_computed_tokens self.waiting.popleft() self.running.append(request) @@ -172,6 +234,18 @@ def schedule(self) -> "SchedulerOutput": num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING + request.num_computed_tokens = num_computed_tokens + has_partial_request = (num_computed_tokens + num_new_tokens < + request.num_tokens) + + # Encoder-related. + if encoder_inputs_to_schedule: + scheduled_encoder_inputs[request.request_id] = ( + encoder_inputs_to_schedule) + # Allocate the encoder cache. + for i in encoder_inputs_to_schedule: + self.encoder_cache_manager.allocate(request, i) + encoder_budget = new_encoder_budget # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) @@ -205,12 +279,14 @@ def schedule(self) -> "SchedulerOutput": scheduled_running_reqs=running_reqs_data, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, + scheduled_encoder_inputs=scheduled_encoder_inputs, preempted_req_ids=preempted_req_ids, # finished_req_ids is an existing state in the scheduler, # instead of being newly scheduled in this step. # It contains the request IDs that are finished in between # the previous and the current steps. finished_req_ids=self.finished_req_ids, + free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(), ) self.finished_req_ids = set() @@ -234,6 +310,72 @@ def _make_running_request_data( self.running_reqs_data[request.request_id] = req_data return req_data + def _try_schedule_encoder_inputs( + self, + request: Request, + num_computed_tokens: int, + num_new_tokens: int, + encoder_budget: int, + ) -> Tuple[List[int], int, int]: + """ + Determine which encoder inputs need to be scheduled in the current step, + and update `num_new_tokens` and encoder token budget accordingly. + + An encoder input will be scheduled if: + - Its output tokens overlap with the range of tokens being computed + in this step, i.e., + [num_computed_tokens, num_computed_tokens + num_new_tokens). 
+ - It is not already computed and stored in the encoder cache. + - There is sufficient encoder token budget to process it. + - The encoder cache has space to store it. + + If an encoder input cannot be scheduled due to cache or budget + limitations, the method adjusts `num_new_tokens` to schedule only the + decoder tokens up to just before the unschedulable encoder input. + """ + if not request.has_encoder_inputs(): + return [], num_new_tokens, encoder_budget + + encoder_inputs_to_schedule: List[int] = [] + mm_positions = request.mm_positions + assert mm_positions is not None + assert len(mm_positions) > 0 + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, num_computed_tokens + num_new_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_new_tokens: + # The encoder input is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder input is already computed and stored + # in the decoder's KV cache. + continue + + if self.encoder_cache_manager.has_cache(request, i): + # The encoder input is already computed and cached. + continue + if not self.encoder_cache_manager.can_allocate(request, i): + # The encoder cache is full. We can only schedule the decoder + # tokens just before the encoder input. + num_new_tokens = start_pos - num_computed_tokens + break + if num_encoder_tokens > encoder_budget: + # The encoder budget is exhausted. We can only schedule the + # decoder tokens up until the encoder input. + # NOTE(woosuk): We assume that the encoder tokens should be + # processed altogether, as the encoder usually uses + # bidirectional attention. + num_new_tokens = start_pos - num_computed_tokens + break + + encoder_budget -= num_encoder_tokens + encoder_inputs_to_schedule.append(i) + return encoder_inputs_to_schedule, num_new_tokens, encoder_budget + def update_from_output( self, scheduler_output: "SchedulerOutput", @@ -251,6 +393,17 @@ def update_from_output( # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. assert request.num_computed_tokens <= request.num_tokens + + cached_encoder_input_ids = ( + self.encoder_cache_manager.get_cached_input_ids(request)) + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. 
+ self.encoder_cache_manager.free(request, input_id) + if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request @@ -355,7 +508,8 @@ class NewRequestData: req_id: str prompt_token_ids: List[int] prompt: Optional[str] - multi_modal_data: Optional[MultiModalDataDict] + mm_inputs: List["MultiModalKwargs"] + mm_positions: List["PlaceholderRange"] sampling_params: SamplingParams block_ids: List[int] num_computed_tokens: int @@ -369,9 +523,10 @@ def from_request( ) -> "NewRequestData": return cls( req_id=request.request_id, - prompt_token_ids=request.inputs["prompt_token_ids"], - prompt=request.inputs.get("prompt"), - multi_modal_data=request.inputs.get("multi_modal_data"), + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + mm_inputs=request.mm_inputs, + mm_positions=request.mm_positions, sampling_params=request.sampling_params, block_ids=block_ids, num_computed_tokens=num_computed_tokens, @@ -429,6 +584,8 @@ class SchedulerOutput: num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int + scheduled_encoder_inputs: Dict[str, List[int]] preempted_req_ids: Set[str] finished_req_ids: Set[str] + free_encoder_input_ids: List[Tuple[str, int]] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 428483bdb29cb..35ed131d50de9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -17,6 +17,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType) +from vllm.v1.engine.mm_input_mapper import MMInputMapper from vllm.v1.executor.gpu_executor import GPUExecutor from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder @@ -65,6 +66,9 @@ def __init__( vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks + # Set up multimodal input mapper (e.g., convert PIL images to tensors). + self.mm_input_mapper = MMInputMapper(vllm_config.model_config) + # Setup scheduler. self.scheduler = Scheduler(vllm_config.scheduler_config, vllm_config.cache_config, @@ -93,6 +97,12 @@ def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" req = Request.from_engine_core_request(request) + # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may + # take 10-50 ms, which can cause a spike in the latency. We should + # consider moving this to a separate thread. 
+ if req.mm_data: + req.mm_inputs = self.mm_input_mapper.process_inputs( + req.mm_data, req.mm_processor_kwargs) self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py new file mode 100644 index 0000000000000..594c973678235 --- /dev/null +++ b/vllm/v1/engine/mm_input_mapper.py @@ -0,0 +1,39 @@ +from typing import Any, Dict, List, Optional + +from vllm.config import ModelConfig +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalKwargs, MultiModalRegistry) + + +class MMInputMapper: + + def __init__( + self, + model_config: ModelConfig, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + ): + self.mm_registry = mm_registry + self.multi_modal_input_mapper = mm_registry.create_input_mapper( + model_config) + self.mm_registry.init_mm_limits_per_prompt(model_config) + + def process_inputs( + self, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Dict[str, Any]], + ) -> List[MultiModalKwargs]: + image_inputs = mm_data["image"] + if not isinstance(image_inputs, list): + image_inputs = [image_inputs] + + # Process each image input separately so that later we can schedule + # them in a fine-grained manner. + mm_inputs: List[MultiModalKwargs] = [] + num_images = len(image_inputs) + for i in range(num_images): + mm_input = self.multi_modal_input_mapper( + {"image": [image_inputs[i]]}, + mm_processor_kwargs=mm_processor_kwargs, + ) + mm_inputs.append(mm_input) + return mm_inputs diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 00e5aea92a8df..f35cf738c89bf 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -3,6 +3,7 @@ from vllm.inputs.data import DecoderOnlyInputs from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics from vllm.v1.engine import EngineCoreRequest @@ -47,14 +48,30 @@ def __init__( self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.num_computed_tokens = 0 + # Raw multimodal data before the mm input mapper (e.g., PIL images). + self.mm_data = inputs.get("multi_modal_data") + self.mm_processor_kwargs = inputs.get("mm_processor_kwargs") + mm_positions = inputs.get("multi_modal_placeholders") + if mm_positions: + # FIXME(woosuk): Support other modalities. + self.mm_positions = mm_positions.get("image", []) + else: + self.mm_positions = [] + # Output of the mm input mapper (e.g., image tensors). 
+ self.mm_inputs: List[MultiModalKwargs] = [] + @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": - return cls( request_id=request.request_id, - inputs=DecoderOnlyInputs(type="token", - prompt_token_ids=request.prompt_token_ids, - prompt=request.prompt), + inputs=DecoderOnlyInputs( + type="token", + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + multi_modal_data=request.mm_data, + multi_modal_placeholders=request.mm_placeholders, + mm_processor_kwargs=request.mm_processor_kwargs, + ), sampling_params=request.sampling_params, eos_token_id=request.eos_token_id, arrival_time=request.arrival_time, @@ -96,9 +113,21 @@ def is_finished(self) -> bool: def get_finished_reason(self) -> Union[str, None]: return RequestStatus.get_finished_reason(self.status) + def has_encoder_inputs(self) -> bool: + return self.mm_data is not None + + @property + def num_encoder_inputs(self) -> int: + return len(self.mm_positions) + + def get_num_encoder_tokens(self, input_id: int) -> int: + assert input_id < len(self.mm_positions) + num_tokens = self.mm_positions[input_id]["length"] + return num_tokens + class RequestStatus(enum.IntEnum): - """Status of a sequence.""" + """Status of a request.""" WAITING = 0 RUNNING = 1 PREEMPTED = 2 @@ -119,7 +148,7 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]: # Mapping of finished statuses to their finish reasons. -# NOTE: The ignored sequences are the sequences whose prompt lengths +# NOTE: The ignored requests are the requests whose prompt lengths # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. _FINISHED_REASON_MAP = { diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index db676e2819bf4..81480786a09e1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,7 +1,7 @@ import os import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple import numpy as np import torch @@ -14,9 +14,10 @@ from vllm.compilation.levels import CompilationLevel from vllm.config import VllmConfig from vllm.forward_context import set_forward_context +from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model -from vllm.multimodal import MultiModalDataDict +from vllm.multimodal import MultiModalKwargs from vllm.plugins import set_compilation_config from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, @@ -27,6 +28,7 @@ from vllm.v1.sample.metadata import SamplingMetadata if TYPE_CHECKING: + from vllm.multimodal.base import PlaceholderRange from vllm.v1.core.scheduler import SchedulerOutput logger = init_logger(__name__) @@ -37,8 +39,8 @@ class GPUModelRunner: def __init__( self, vllm_config: VllmConfig, + input_registry: InputRegistry = INPUT_REGISTRY, ): - # TODO: use ModelRunnerBase.__init__(self, vllm_config=vllm_config) self.vllm_config = vllm_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -75,10 +77,16 @@ def __init__( parallel_config) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() + self.hidden_size = model_config.get_hidden_size() + + # Multi-modal data support + 
self.input_registry = input_registry # Lazy initialization # self.model: nn.Module # Set after load_model self.kv_caches: List[torch.Tensor] = [] + # req_id -> (input_id -> encoder_output) + self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} # Request states. self.requests: Dict[str, CachedRequestState] = {} @@ -96,18 +104,28 @@ def __init__( and not self.model_config.enforce_eager) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] - self.input_ids = torch.zeros(self.max_num_tokens, - dtype=torch.int32, - device=self.device) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) + self.inputs_embeds = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=self.device) def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. # Keep the states of the pre-empted requests. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) + self.encoder_cache.pop(req_id, None) + + # Free the cached encoder outputs. + for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) # Remove the requests from the persistent batch. stopped_req_ids = set().union( @@ -156,7 +174,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_id=req_id, prompt_token_ids=req_data.prompt_token_ids, prompt=req_data.prompt, - multi_modal_data=req_data.multi_modal_data, + mm_inputs=req_data.mm_inputs, + mm_positions=req_data.mm_positions, sampling_params=sampling_params, generator=generator, block_ids=req_data.block_ids, @@ -285,11 +304,9 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, - non_blocking=True) + input_ids = input_ids.to(self.device, non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) - query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() @@ -308,7 +325,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. logits_indices = query_start_loc[1:] - 1 - return attn_metadata, logits_indices + return input_ids, attn_metadata, logits_indices def _prepare_sampling( self, @@ -325,13 +342,91 @@ def _prepare_sampling( sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) return sampling_metadata + def _execute_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the multi-modal inputs. 
+ mm_inputs: List[MultiModalKwargs] = [] + req_input_ids: List[Tuple[int, int]] = [] + for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): + req_state = self.requests[req_id] + for input_id in encoder_input_ids: + mm_inputs.append(req_state.mm_inputs[input_id]) + req_input_ids.append((req_id, input_id)) + batched_mm_inputs = MultiModalKwargs.batch(mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `encoder_outputs` is either of the following: + # 1. A tensor of shape [num_images, feature_size, hidden_size] + # in case when feature_size is fixed across all images. + # 2. A list (length: num_images) of tensors, each of shape + # [feature_size, hidden_size] in case when the feature size is + # dynamic depending on input images. + encoder_outputs = self.model.process_mm_inputs(**batched_mm_inputs) + + # Cache the encoder outputs. + for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): + if req_id not in self.encoder_cache: + self.encoder_cache[req_id] = {} + self.encoder_cache[req_id][input_id] = output + + def _gather_encoder_outputs( + self, + scheduler_output: "SchedulerOutput", + ) -> List[torch.Tensor]: + encoder_outputs: List[torch.Tensor] = [] + num_reqs = self.input_batch.num_reqs + for req_id in self.input_batch.req_ids[:num_reqs]: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = req_state.num_computed_tokens + mm_positions = req_state.mm_positions + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + continue + + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens) + assert start_idx < end_idx + assert req_id in self.encoder_cache + assert i in self.encoder_cache[req_id] + encoder_output = self.encoder_cache[req_id][i] + encoder_outputs.append(encoder_output[start_idx:end_idx]) + return encoder_outputs + @torch.inference_mode() def execute_model( self, scheduler_output: "SchedulerOutput", ) -> ModelRunnerOutput: self._update_states(scheduler_output) - attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + + # Run the encoder. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + + # Prepare the decoder inputs. + input_ids, attn_metadata, logits_indices = self._prepare_inputs( + scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -343,12 +438,26 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + # Get the inputs embeds. 
+ if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings), + # always use embeddings (rather than token ids) as input to the model. + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + + # Run the decoder. + # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata): hidden_states = self.model( - input_ids=self.input_ids[:num_input_tokens], + input_ids=None, positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_input_tokens], ) hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] @@ -440,13 +549,16 @@ def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: with set_forward_context(None): # noqa: SIM117 with set_compile_context(self.cudagraph_batch_sizes): # Trigger compilation for general shape. - model(self.input_ids, - self.positions, - dummy_kv_caches, - attn_metadata=None) + model(input_ids=None, + positions=self.positions, + kv_caches=dummy_kv_caches, + attn_metadata=None, + inputs_embeds=self.inputs_embeds) @torch.inference_mode() def profile_run(self) -> None: + # TODO(woosuk): Profile the max memory usage of the encoder and + # the encoder cache. self._dummy_run(self.model, self.max_num_tokens) torch.cuda.synchronize() @@ -468,10 +580,11 @@ def capture_model(self) -> None: # can reuse the memory pool allocated for the large shapes. for num_tokens in reversed(self.cudagraph_batch_sizes): self.model( - self.input_ids[:num_tokens], - self.positions[:num_tokens], + input_ids=None, + positions=self.positions[:num_tokens], kv_caches=self.kv_caches, attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_tokens], ) end_time = time.perf_counter() @@ -506,7 +619,8 @@ class CachedRequestState: req_id: str prompt_token_ids: List[int] prompt: Optional[str] - multi_modal_data: Optional["MultiModalDataDict"] + mm_inputs: List[MultiModalKwargs] + mm_positions: List["PlaceholderRange"] sampling_params: SamplingParams generator: Optional[torch.Generator] From 56a955e7748e497d8c24c79a76c75f3f982fab4a Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 13 Nov 2024 00:54:10 -0500 Subject: [PATCH 13/20] Bump to compressed-tensors v0.8.0 (#10279) Signed-off-by: Dipika --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index ef5ed8b645158..acb766d25a2d9 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -31,4 +31,4 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.7.1 # required for compressed-tensors +compressed-tensors == 0.8.0 # required for compressed-tensors \ No newline at end of file From 032fcf16ae9d924cc98a083c3c8464173f87a49e Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Tue, 12 Nov 2024 21:54:52 -0800 Subject: [PATCH 14/20] [Doc] Fix typo in arg_utils.py (#10264) Signed-off-by: Xin Yang --- vllm/engine/arg_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 27f62b0008578..31aa8c5908719 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -626,8 +626,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=EngineArgs.max_cpu_loras, help=('Maximum number of LoRAs to store in CPU memory. ' - 'Must be >= than max_num_seqs. ' - 'Defaults to max_num_seqs.')) + 'Must be >= than max_loras. ' + 'Defaults to max_loras.')) parser.add_argument( '--fully-sharded-loras', action='store_true', From 3945c82346dae3129213607663bfd17edd905fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=B5=E8=84=91=E6=98=9F=E4=BA=BA?= Date: Wed, 13 Nov 2024 15:07:22 +0800 Subject: [PATCH 15/20] [Model] Add support for Qwen2-VL video embeddings input & multiple image embeddings input with varied resolutions (#10221) Signed-off-by: imkero --- docs/source/models/supported_models.rst | 2 +- .../vision_language/test_qwen2_vl.py | 428 ++++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 180 ++++++-- 3 files changed, 578 insertions(+), 32 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_qwen2_vl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 5a474043078db..ca894819f2c26 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -538,7 +538,7 @@ Text Generation - ✅︎ * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - - T + I\ :sup:`E+` + V\ :sup:`+` + - T + I\ :sup:`E+` + V\ :sup:`E+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. 
- ✅︎ - ✅︎ diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py new file mode 100644 index 0000000000000..718c675b86fb4 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -0,0 +1,428 @@ +from typing import Any, List, Optional, Tuple, Type, TypedDict, Union + +import numpy.typing as npt +import pytest +import torch +from PIL import Image + +from vllm.entrypoints.llm import LLM +from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, + sample_frames_from_video) + +from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, + PromptVideoInput, VllmRunner) +from ...utils import check_logprobs_close + +models = ["Qwen/Qwen2-VL-2B-Instruct"] +target_dtype = "half" + +IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>" +VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" + + +def qwen2_vl_chat_template(*query): + return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501 + + +IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the biggest text's content in this image?", + ), + "cherry_blossom": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the season shown in this image? ", + "Reply with a short sentence (no more than 20 words)", + ), +}) + +VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ + "sample_demo_1": + qwen2_vl_chat_template( + VIDEO_PLACEHOLDER, + "Describe this video with a short sentence ", + "(no more than 20 words)", + ), +}) + +MULTIIMAGE_PROMPT = qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + IMAGE_PLACEHOLDER, + "Describe these two images separately. ", + "For each image, reply with a short sentence ", + "(no more than 10 words).", +) + + +class Qwen2VLPromptImageEmbeddingInput(TypedDict): + image_embeds: torch.Tensor + image_grid_thw: torch.Tensor + + +class Qwen2VLPromptVideoEmbeddingInput(TypedDict): + video_embeds: torch.Tensor + video_grid_thw: torch.Tensor + + +def batch_make_image_embeddings( + image_batches: List[Union[Image.Image, List[Image.Image]]], processor, + llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + """batched image embeddings for Qwen2-VL + + This will infer all images' embeddings in a single batch, + and split the result according to input batches. + + image_batches: + - Single-image batches: `List[Image.Image]` + - Multiple-image batches: `List[List[Image.Image]]]` + + returns: `List[Qwen2VLPromptImageEmbeddingInput]` + """ + + image_batches_: List[Any] = image_batches[:] + + # convert single-image batches to multiple-image batches + for idx in range(len(image_batches_)): + if not isinstance(image_batches_[idx], list): + image_batches_[idx] = [image_batches_[idx]] + + assert isinstance(image_batches_[idx], list) + + # append all images into a list (as a batch) + images: List[Image.Image] = [] + for image_batch in image_batches_: + images += image_batch + + # image to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=images, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values"] + image_grid_thw = preprocess_result["image_grid_thw"] + + # pixel values to embeddinds & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker. 
\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + image_embeds = visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptImageEmbeddingInput] = [] + image_counter = 0 + embed_counter = 0 + for image_batch in image_batches_: + cur_batch_image_count = len(image_batch) + merge_size = image_processor.merge_size + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in image_grid_thw[image_counter:image_counter + + cur_batch_image_count] + ]) + + result.append({ + "image_embeds": + image_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "image_grid_thw": + image_grid_thw[image_counter:image_counter + + cur_batch_image_count], + }) + + embed_counter += cur_batch_embed_len + image_counter += cur_batch_image_count + + # ensure we don't lost any images or embeddings + assert embed_counter == image_embeds.size(0) + assert image_counter == image_grid_thw.size(0) + assert len(image_batches) == len(result) + + return result + + +def batch_make_video_embeddings( + video_batches: PromptVideoInput, processor, + llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + """batched video embeddings for Qwen2-VL + + A NDArray represents a single video's all frames. + + This will infer all videos' embeddings in a single batch, + and split the result according to input batches. + + video_batches: + - Single-video batches: `List[NDArray]` + - Multiple-video batches: `List[List[NDArray]]` + """ + + video_batches_: List[Any] = video_batches[:] + + for idx in range(len(video_batches_)): + if not isinstance(video_batches_[idx], list): + single_video_batch: List[npt.NDArray] = [video_batches_[idx]] + video_batches_[idx] = single_video_batch + + assert isinstance(video_batches_[idx], list) + + # append all videos into a list (as a batch) + videos: List[npt.NDArray] = [] + for video_batch in video_batches_: + videos += video_batch + + # video to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=None, videos=videos, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values_videos"] + video_grid_thw = preprocess_result["video_grid_thw"] + + # pixel values to embeddinds & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker.\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + video_embeds = visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptVideoEmbeddingInput] = [] + video_counter = 0 + embed_counter = 0 + for video_batch in video_batches_: + cur_batch_video_count = len(video_batch) + merge_size = image_processor.merge_size + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in video_grid_thw[video_counter:video_counter + + cur_batch_video_count] + ]) + + result.append({ + "video_embeds": + video_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "video_grid_thw": + video_grid_thw[video_counter:video_counter + + cur_batch_video_count], + }) + + embed_counter += cur_batch_embed_len + video_counter += cur_batch_video_count + + # ensure we don't lost any videos or 
embeddings + assert embed_counter == video_embeds.size(0) + assert video_counter == video_grid_thw.size(0) + assert len(video_batches) == len(result) + + return result + + +def run_test( + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between + original image/video input and image/video embeddings input. + """ + from transformers import AutoProcessor # noqa: F401 + + processor = AutoProcessor.from_pretrained(model) + + # NOTE: + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + task="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + + outputs_per_case_for_original_input = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None) + for prompts, images, videos in inputs + ] + + outputs_per_case_for_embeddings_input = [ + vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=batch_make_image_embeddings( + images, processor, vllm_model.model) if images else None, + videos=batch_make_video_embeddings( + videos, processor, vllm_model.model) if videos else None) + for prompts, images, videos in inputs + ] + + for outputs_for_original_input, \ + outputs_for_embeddings_input \ + in zip(outputs_per_case_for_original_input, + outputs_per_case_for_embeddings_input): + check_logprobs_close( + outputs_0_lst=outputs_for_original_input, + outputs_1_lst=outputs_for_embeddings_input, + name_0="original_input", + name_1="embeddings_input", + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + [], + ) for image, prompt in zip(images, IMAGE_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + [], + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, + model, size_factors, + dtype: str, max_tokens: int, + num_logprobs: int) -> None: + images 
= [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[List[str], PromptImageInput, + PromptVideoInput]] = [( + [MULTIIMAGE_PROMPT for _ in size_factors], + [[ + rescale_image_size(image, factor) + for image in images + ] for factor in size_factors], + [], + )] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=2, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.25, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + num_frames = 4 + sampled_vids = [ + sample_frames_from_video(asset.np_ndarrays, num_frames) + for asset in video_assets + ] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [], + [rescale_video_size(video, factor) for factor in size_factors], + ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 13109758767df..1b162e7df8578 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -79,7 +79,7 @@ class Qwen2VLImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: torch.Tensor + pixel_values: torch.Tensor """Shape: `(num_patches, num_channels * patch_size * patch_size)` """ @@ -92,9 +92,22 @@ class Qwen2VLImagePixelInputs(TypedDict): class Qwen2VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` - `hidden_size` must match the hidden size of language model backbone. + image_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all images' features. + Each tensor holds an image's features. + - `torch.Tensor`: A tensor holding all images' features + (concatenation of all images' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the images. + - `hidden_size` must match the hidden size of language model backbone. + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. """ @@ -102,7 +115,8 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): Qwen2VLImageEmbeddingInputs] -class Qwen2VLVideoInputs(TypedDict): +class Qwen2VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] pixel_values_videos: torch.Tensor """Shape: `(num_patches, @@ -116,6 +130,30 @@ class Qwen2VLVideoInputs(TypedDict): """ +class Qwen2VLVideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + video_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all videos' features. + Each tensor holds an video's features. 
+ - `torch.Tensor`: A tensor holding all videos' features + (concatenation of all videos' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the videos. + - `hidden_size` must match the hidden size of language model backbone. + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2VLVideoInputs = Union[Qwen2VLVideoPixelInputs, + Qwen2VLVideoEmbeddingInputs] + # === Vision Encoder === # @@ -585,6 +623,12 @@ def mm_input_mapper_for_qwen2_vl( "image_embeds": data.get("image_embeds"), "image_grid_thw": data.get("image_grid_thw"), }) + if data_type_key == "video" and isinstance(data, dict): + return MultiModalKwargs({ + "video_embeds": data.get("video_embeds"), + "video_grid_thw": data.get("video_grid_thw"), + }) + model_config = ctx.model_config # Handle mm processor kwargs; we pass these at creation time # because preprocess() in transformers doesn't expose them @@ -890,16 +934,33 @@ def input_processor_for_qwen2_vl( idx for idx, token in enumerate(prompt_token_ids) if token == hf_config.image_token_id ] - image_cnt = len(image_indices) - embed_dim = image_inputs.get('image_embeds').size(0) - assert embed_dim % image_cnt == 0 - num_pad_tokens = embed_dim // image_cnt + + # ensure all image tokens have grid_thw + assert \ + len(image_indices) == image_inputs["image_grid_thw"].size(0), \ + "image token num does not match image_grid_thw.shape" + + image_counter = 0 + pad_token_counter = 0 for idx, token in enumerate(prompt_token_ids): if idx in image_indices: + grid_thw = image_inputs["image_grid_thw"][image_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) prompt_token_ids_with_image.extend([token] * num_pad_tokens) + image_counter += 1 + pad_token_counter += num_pad_tokens else: prompt_token_ids_with_image.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == image_inputs["image_embeds"].size(0), \ + "image_embeds.shape does not match image_grid_thw" + prompt_token_ids = prompt_token_ids_with_image else: prompt_token_ids = _expand_pad_tokens(image_inputs, @@ -912,14 +973,49 @@ def input_processor_for_qwen2_vl( max_pixels=max_pixels) if video_inputs is not None: - prompt_token_ids = _expand_pad_tokens(video_inputs, - hf_config.video_token_id, - make_batched_videos, - "video", - image_processor, - prompt_token_ids, - min_pixels=min_pixels, - max_pixels=max_pixels) + if isinstance(video_inputs, dict): + prompt_token_ids_with_video = [] + video_indices = [ + idx for idx, token in enumerate(prompt_token_ids) + if token == hf_config.video_token_id + ] + + # ensure all video tokens have grid_thw + assert \ + len(video_indices) == video_inputs["video_grid_thw"].size(0), \ + "video token num does not match video_grid_thw.shape" + + video_counter = 0 + pad_token_counter = 0 + for idx, token in enumerate(prompt_token_ids): + if idx in video_indices: + grid_thw = video_inputs["video_grid_thw"][video_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) + prompt_token_ids_with_video.extend([token] * + num_pad_tokens) + video_counter += 1 + pad_token_counter += num_pad_tokens + else: + prompt_token_ids_with_video.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == 
video_inputs["video_embeds"].size(0), \ + "video_embeds.shape does not match video_grid_thw" + + prompt_token_ids = prompt_token_ids_with_video + else: + prompt_token_ids = _expand_pad_tokens(video_inputs, + hf_config.video_token_id, + make_batched_videos, + "video", + image_processor, + prompt_token_ids, + min_pixels=min_pixels, + max_pixels=max_pixels) prompt = inputs.get("prompt") if prompt is None: @@ -1051,49 +1147,71 @@ def _parse_and_validate_image_input( f"Got type: {type(pixel_values)}") return Qwen2VLImagePixelInputs(type="pixel_values", - data=pixel_values, + pixel_values=pixel_values, image_grid_thw=image_grid_thw) if image_embeds is not None: image_embeds = self._validate_and_reshape_mm_tensor( image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") if not isinstance(image_embeds, torch.Tensor): raise ValueError("Incorrect type of image embeddings. " f"Got type: {type(image_embeds)}") return Qwen2VLImageEmbeddingInputs(type="image_embeds", - data=image_embeds) + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]: pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) video_grid_thw = kwargs.pop("video_grid_thw", None) - if pixel_values_videos is None: + if pixel_values_videos is None and video_embeds is None: return None - pixel_values_videos = self._validate_and_reshape_mm_tensor( - pixel_values_videos, "video pixel values") - video_grid_thw = self._validate_and_reshape_mm_tensor( - video_grid_thw, "video grid_thw") - - return Qwen2VLVideoInputs( - pixel_values_videos=pixel_values_videos, - video_grid_thw=video_grid_thw, - ) + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, "video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + if not isinstance(video_embeds, torch.Tensor): + raise ValueError("Incorrect type of video embeddings. 
" + f"Got type: {type(video_embeds)}") + return Qwen2VLVideoEmbeddingInputs(type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw) def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor: if image_input["type"] == "image_embeds": - return image_input["data"].type(self.visual.dtype) + return image_input["image_embeds"].type(self.visual.dtype) - pixel_values = image_input["data"].type(self.visual.dtype) + pixel_values = image_input["pixel_values"].type(self.visual.dtype) image_embeds = self.visual(pixel_values, grid_thw=image_input["image_grid_thw"]) return image_embeds def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor: + if video_input["type"] == "video_embeds": + return video_input["video_embeds"].type(self.visual.dtype) + pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) video_embeds = self.visual(pixel_values_videos, From 1b886aa104248a95720fda7be9f979fc665b3d02 Mon Sep 17 00:00:00 2001 From: Austin Veselka <50646302+FurtherAI@users.noreply.github.com> Date: Wed, 13 Nov 2024 02:28:13 -0600 Subject: [PATCH 16/20] [Model] Adding Support for Qwen2VL as an Embedding Model. Using MrLight/dse-qwen2-2b-mrl-v1 (#9944) Signed-off-by: FurtherAI Co-authored-by: FurtherAI --- docs/source/models/supported_models.rst | 6 + docs/source/models/vlm.rst | 17 ++ ...ai_chat_embedding_client_for_multimodal.py | 123 +++++++++-- examples/template_dse_qwen2_vl.jinja | 7 + tests/conftest.py | 3 + .../vision_language/test_dse_qwen2_vl.py | 209 ++++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 17 +- vllm/model_executor/models/registry.py | 1 + 8 files changed, 364 insertions(+), 19 deletions(-) create mode 100644 examples/template_dse_qwen2_vl.jinja create mode 100644 tests/models/embedding/vision_language/test_dse_qwen2_vl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index ca894819f2c26..58ec3acc6aea5 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -584,6 +584,12 @@ Multimodal Embedding - :code:`TIGER-Lab/VLM2Vec-Full` - 🚧 - ✅︎ + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - :code:`MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ .. important:: Some model architectures support both generation and embedding tasks. diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 112e9db6a41de..bcbe50a25fa09 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -310,4 +310,21 @@ Since the request schema is not defined by OpenAI client, we post a request to t response_json = response.json() print("Embedding output:", response_json["data"][0]["embedding"]) +Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. + +.. code-block:: bash + + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja + +.. important:: + + Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, + which is handled by the jinja template. + +.. important:: + + Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code + example below for details. + A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. 
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py index effb588e1387f..fff82020d9a30 100644 --- a/examples/openai_chat_embedding_client_for_multimodal.py +++ b/examples/openai_chat_embedding_client_for_multimodal.py @@ -1,33 +1,120 @@ +import argparse +import base64 +import io + import requests +from PIL import Image image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": - "TIGER-Lab/VLM2Vec-Full", - "messages": [{ + +def vlm2vec(): + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": + "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." + }, + ], + }], + "encoding_format": + "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +def dse_qwen2_vl(inp: dict): + # Embedding an Image + if inp["dtype"] == "image": + messages = [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": inp["image_url"], + } + }, { + "type": "text", + "text": "What is shown in this image?" + }] + }] + # Embedding a Text Query + else: + # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image + # of the minimum input size + buffer = io.BytesIO() + image_placeholder = Image.new("RGB", (56, 56)) + image_placeholder.save(buffer, "png") + buffer.seek(0) + image_placeholder = base64.b64encode(buffer.read()).decode('utf-8') + messages = [{ "role": "user", "content": [ { "type": "image_url", "image_url": { - "url": image_url + "url": f"data:image/jpeg;base64,{image_placeholder}", } }, { "type": "text", - "text": "Represent the given image." + "text": f"Query: {inp['content']}" }, - ], - }], - "encoding_format": - "float", - }, -) -response.raise_for_status() -response_json = response.json() - -print("Embedding output:", response_json["data"][0]["embedding"]) + ] + }] + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "MrLight/dse-qwen2-2b-mrl-v1", + "messages": messages, + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + "Script to call a specified VLM through the API. 
Make sure to serve " + "the model with --task embedding before running this.") + parser.add_argument("model", + type=str, + choices=["vlm2vec", "dse_qwen2_vl"], + required=True, + help="Which model to call.") + args = parser.parse_args() + + if args.model == "vlm2vec": + vlm2vec() + elif args.model == "dse_qwen2_vl": + dse_qwen2_vl({ + "dtye": "image", + "image_url": image_url, + }) + dse_qwen2_vl({ + "dtype": "text", + "content": "What is the weather like today?", + }) diff --git a/examples/template_dse_qwen2_vl.jinja b/examples/template_dse_qwen2_vl.jinja new file mode 100644 index 0000000000000..e7b93fae31770 --- /dev/null +++ b/examples/template_dse_qwen2_vl.jinja @@ -0,0 +1,7 @@ +{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system +You are a helpful assistant.<|im_end|> +{% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %} +{% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %} +{% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %} +{% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %} +{% endraw %}{% endif %}<|endoftext|> \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 6cf791dc62ce5..0dc1cc6e83c18 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -243,6 +243,9 @@ def video_assets() -> _VideoAssets: class HfRunner: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + if x is None or isinstance(x, (bool, )): + return x + if device is None: device = "cpu" if current_platform.is_cpu() else "cuda" diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py new file mode 100644 index 0000000000000..3dd8cb729f8a6 --- /dev/null +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -0,0 +1,209 @@ +from functools import partial +from typing import Callable, Dict, List, Type + +import pytest +import torch +from PIL import Image +from transformers import BatchEncoding, Qwen2VLForConditionalGeneration + +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test +from ..utils import check_embeddings_close + +HF_TEXT_PROMPTS = [ + # T -> X + ( + "Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501, + Image.new("RGB", (56, 56))), + # T -> X + ("Query: Retrieve an image of this caption: cherry blossom", + Image.new("RGB", (56, 56))), +] + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "What is shown in this image?", + "cherry_blossom": + "What is shown in this image?" 
+}) + +MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"] + + +def get_messages(image: Image.Image, text: str, embed_text: bool): + # assert False, 'remember to use outer [] as required' + if embed_text: + messages = [{ + "role": + "user", + "content": [ + { + "type": "image", + "image": Image.new("RGB", (56, 56)), + "resized_height": 1, + "resized_width": 1 + }, # need a dummy image here for an easier process. + { + "type": "text", + "text": text + }, + ] + }] + else: + messages = [{ + "role": + "user", + "content": [{ + "type": "image", + "image": image + }, { + "type": "text", + "text": text + }] + }] + return messages + + +def apply_chat_template_and_add_eos( + messages: List[Dict], + apply_chat_template_fn: Callable, +): + prompt = apply_chat_template_fn( + messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>" + return prompt + + +def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs): + return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs) + + +def _run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + input_texts: List[str], + input_images: PromptImageInput, + embed_texts: List[bool], + model: str, + *, + dtype: str, +) -> None: + '''SET PYTHONPATH''' + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + with vllm_runner(model, + task="embedding", + dtype=dtype, + enforce_eager=True, + max_model_len=8192) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + texts = [ + # this is necessary because vllm_model.encode will not apply any + # templating to the prompt, and therefore lacks an image_pad + # token unless one is inserted beforehand (the (28,28) image + # above is converted to an image pad token by the chat template). + apply_chat_template_and_add_eos( + get_messages(image, text, False), + apply_chat_template_fn=tokenizer.apply_chat_template, + ) for text, image in zip(input_texts, input_images) + # vllm will replace the pad token with the actual image, + # which may be a placeholder image, later. 
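+            # Note that for the text-query prompts the paired entry in
+            # input_images is already the small blank placeholder image from
+            # HF_TEXT_PROMPTS, so the encode() call below serves both the
+            # text-embedding and image-embedding cases.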
+ ] + vllm_outputs = vllm_model.encode(texts, images=input_images) + + hf_outputs = [] + with hf_runner(model, + dtype=dtype, + auto_cls=Qwen2VLForConditionalGeneration) as hf_model: + hf_model.postprocess_inputs = partial( + postprocess_inputs, + hf_model, + cache_position=torch.arange( + 0, + 1, # 1 for batch size + requires_grad=False), + use_cache=False) + for text, image, embed_text in zip(input_texts, input_images, + embed_texts): + # dse requires non-standard input processing + # because it needs an image_pad token + messages = get_messages(image, text, embed_text) + prompt = apply_chat_template_and_add_eos( + messages, hf_model.processor.apply_chat_template) + inputs = hf_model.get_inputs( + prompts=[[prompt]], + images=[[image]], + ) + with torch.no_grad(): + outputs = hf_model.model( + **hf_model.wrap_device(inputs[0], + device=hf_model.model.device.type), + return_dict=True, + output_hidden_states=True, + ) + pooled_output = torch.nn.functional.normalize( + outputs.hidden_states[-1][0, -1], p=2, dim=-1) + hf_outputs.append(pooled_output.tolist()) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, image_placeholder) + for text, image_placeholder in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [True] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + embed_texts, + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [False] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + embed_texts, + model, + dtype=dtype, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 1b162e7df8578..9a19ccbca3f1e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -51,6 +51,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import (GPTQConfig, GPTQMarlinConfig, QuantizationConfig) @@ -58,12 +59,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalKwargs) from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer -from 
vllm.sequence import IntermediateTensors, SequenceData +from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import cached_get_processor @@ -1067,6 +1069,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config assert not cache_config.enable_prefix_caching, \ "Qwen2-VL currently does not support prefix caching" @@ -1098,6 +1101,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False) self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) @@ -1318,6 +1326,13 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 32750602b988c..f172c06c4a26a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -109,6 +109,7 @@ # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration") # noqa: E501, } _MULTIMODAL_MODELS = { From b6dde330198848a4a9903c1f0f97c3235fba0ba0 Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Wed, 13 Nov 2024 00:29:32 -0800 Subject: [PATCH 17/20] [Core] Flashinfer - Remove advance step size restriction (#10282) --- csrc/prepare_inputs/advance_step.cu | 66 +++++++++++++++++------------ 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index 46fef79f439fb..bd184ee22682e 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -88,6 +88,7 @@ inline void verify_tensor(std::string const& name, torch::Tensor const& t, } } +/// each thread processes a block per query __global__ void advance_step_flashinfer_kernel( int num_threads, int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, long const* sampled_token_ids_ptr, @@ -134,8 +135,10 @@ __global__ void advance_step_flashinfer_indptr_kernel( int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { int idx = blockIdx.x * num_threads + threadIdx.x; - // Update paged_kv_indptr + if (idx == 0) { + paged_kv_indptr_ptr[idx] = 0; + } if (idx < num_queries) { int sum = 0; for (int i = 0; i <= idx; ++i) { @@ -146,20 +149,33 @@ __global__ void advance_step_flashinfer_indptr_kernel( } __global__ void advance_step_flashinfer_indices_kernel( - int num_threads, int num_seqs, int num_queries, int const* block_tables_ptr, 
- int64_t const block_tables_stride, int* paged_kv_indices_ptr, + int num_seqs, int num_queries, int const* block_tables_ptr, + int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { - int idx = blockIdx.x * num_threads + threadIdx.x; - int row = idx / block_tables_stride; - int col = idx % block_tables_stride; - - if (row < num_queries && col < block_table_bound_ptr[row]) { - paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] = - block_tables_ptr[row * block_tables_stride + col]; + // note: max_num_blocks_per_seq = block_tables.stride(0) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // when cuda graphs are enabled, paged_kv_indptr tensor + // has to be updated for the padded queries + // tid represents a query# for paged_kv_indptr tensor + if (num_queries < tid && tid <= num_seqs) { + paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; } - // if cudagraph, fill padded seqs with the last valid seq's indptr - if (num_queries < row && row <= num_seqs) { - paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries]; + + // each thread processes a block_ptr in block_tables + // block_tables shape: [num_queries, max_num_blocks_per_seq] + // paged_kv_indices is flattened block_tables. + for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); + idx += (gridDim.x * blockDim.x)) { + // block_tables-row = paged_kv_indptr[queryNum] + int queryNum = idx / max_num_blocks_per_seq; + int col = idx % max_num_blocks_per_seq; + if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { + int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; + int block_tables_idx = queryNum * max_num_blocks_per_seq + col; + paged_kv_indices_ptr[indices_arr_idx] = + block_tables_ptr[block_tables_idx]; + } } } @@ -247,22 +263,16 @@ void advance_step_flashinfer( int threads; cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - if (logging) { - printf("launching kernel with %d blocks\n", blocks); - } - // TODO(will): support arbitrary block_tables stride - if ((blocks * threads) / block_tables.stride(0) < num_queries) { - TORCH_CHECK(false, - "multi-step: not enough threads to map block_table to" - "FlashInfer's paged_kv_indices on GPU. 
Try reducing the number " - "of seqs,", - " increasing the block size or take smaller steps.", - " num_queries = ", num_queries, - " block_tables.stride(0) = ", block_tables.stride(0), - " blocks = ", blocks, " max_threads = ", threads); + int block_tables_stride = block_tables.stride(0); + TORCH_CHECK((blocks * threads > num_queries), + "multi-step: not enough threads to map to num_queries = ", + num_queries, " block_tables.stride(0) = ", block_tables.stride(0), + " blocks = ", blocks, " max_threads = ", threads); + if (logging) { + printf("launching kernels with %d blocks and %d threads\n", blocks, + threads); } - advance_step_flashinfer_kernel<<>>( threads, num_seqs, num_queries, block_size, reinterpret_cast(input_tokens.data_ptr()), @@ -281,7 +291,7 @@ void advance_step_flashinfer( reinterpret_cast(block_table_bound.data_ptr())); advance_step_flashinfer_indices_kernel<<>>( - threads, num_seqs, num_queries, + num_seqs, num_queries, reinterpret_cast(block_tables.data_ptr()), block_tables.stride(0), reinterpret_cast(paged_kv_indices.data_ptr()), From d909acf9fe17b7db42d7de61903c0058c8b9b344 Mon Sep 17 00:00:00 2001 From: B-201 Date: Wed, 13 Nov 2024 17:25:59 +0800 Subject: [PATCH 18/20] [Model][LoRA]LoRA support added for idefics3 (#10281) Signed-off-by: B-201 --- docs/source/models/supported_models.rst | 2 +- vllm/model_executor/models/idefics3.py | 55 +++++++++++++++++++++---- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 58ec3acc6aea5..161733c049bbe 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -450,7 +450,7 @@ Text Generation - Idefics3 - T + I - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. 
- - + - ✅︎ - * - :code:`InternVLChatModel` - InternVL2 diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 8845b2f58af07..85f23a1da533b 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -33,6 +33,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor @@ -44,7 +45,7 @@ from .idefics2_vision_model import ( Idefics2VisionTransformer as Idefics3VisionTransformer) # yapf: enable -from .interfaces import SupportsMultiModal +from .interfaces import SupportsLoRA, SupportsMultiModal from .llama import LlamaModel from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) @@ -58,8 +59,6 @@ class Idefics3ImagePixelInputs(TypedDict): """ Shape: `(batch_size * num_images, num_channels, height, width)` """ - rows: List[int] - cols: List[int] pixel_attention_mask: Optional[torch.BoolTensor] @@ -356,8 +355,15 @@ def dummy_data_for_idefics3( image_seq_len = processor.image_seq_len max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images + if seq_len - max_llm_image_tokens < 0: + raise RuntimeError( + f"Idefics3 cannot process {num_images} images in a prompt, " + "please increase max_model_len or reduce image limit by " + "--limit-mm-per-prompt.") + seq_data = SequenceData.from_prompt_token_counts( - (hf_config.image_token_id, max_llm_image_tokens), (0, seq_len)) + (hf_config.image_token_id, max_llm_image_tokens), + (0, seq_len - max_llm_image_tokens)) width = height = hf_config.vision_config.image_size image = Image.new("RGB", (width, height), color=0) @@ -463,8 +469,6 @@ def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[ImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_embeds = kwargs.pop("image_embeds", None) - rows = kwargs.pop("rows", None) - cols = kwargs.pop("cols", None) pixel_attention_mask = kwargs.pop("pixel_attention_mask", None) if pixel_values is None and image_embeds is None: @@ -489,8 +493,6 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values( flatten_bn(pixel_values, concat=True)), - rows=rows, - cols=cols, pixel_attention_mask=flatten_bn( pixel_attention_mask, concat=True)) @@ -610,7 +612,33 @@ def forward( @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3) @INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) -class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal): +class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + # LoRA specific attributes + supported_lora_modules = [ + # vision_model + "fc1", + "fc2", + "out_proj", + # text_model + "qkv_proj", # same name with vision encoder + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
super().__init__() @@ -672,3 +700,12 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="model.text_model", + connector="model.connector", + tower_model="model.vision_model") From bb7991aa291054a30f408e626273caa6769a07eb Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 13 Nov 2024 03:02:56 -0800 Subject: [PATCH 19/20] [V1] Add missing tokenizer options for `Detokenizer` (#10288) Signed-off-by: Roger Wang --- vllm/v1/engine/detokenizer.py | 11 +++++++++-- vllm/v1/engine/llm_engine.py | 7 ++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 1dbf8e75ec478..6249d60199a62 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -192,10 +192,17 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: class Detokenizer: - def __init__(self, tokenizer_name: str): + def __init__(self, + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None): # TODO: once we support LoRA, we should should pass the tokenizer # here. We currently have two copies (this + in the LLMEngine). - self.tokenizer = get_tokenizer(tokenizer_name) + self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + revision=revision) # Request id -> IncrementalDetokenizer self.request_states: Dict[str, IncrementalDetokenizer] = {} diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f37db92e8ea6b..5b45615a1b85b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -53,7 +53,12 @@ def __init__( input_registry) # Detokenizer (converts EngineCoreOutputs --> RequestOutput) - self.detokenizer = Detokenizer(vllm_config.model_config.tokenizer) + self.detokenizer = Detokenizer( + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, + ) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( From 0b8bb86bf19d68950b4d92a99350e07a26ec0d2c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 13 Nov 2024 20:39:03 +0800 Subject: [PATCH 20/20] [1/N] Initial prototype for multi-modal processor (#10044) Signed-off-by: DarkLight1337 --- .../models/enabling_multimodal_inputs.rst | 2 +- .../mm_processor_kwargs/test_qwen.py | 2 +- .../{test_base.py => test_inputs.py} | 2 +- tests/multimodal/test_processor_kwargs.py | 37 ++- tests/v1/core/test_prefix_caching.py | 4 +- vllm/config.py | 2 +- vllm/engine/async_llm_engine.py | 4 + vllm/engine/llm_engine.py | 16 +- vllm/engine/multiprocessing/client.py | 6 + vllm/engine/protocol.py | 16 +- vllm/entrypoints/openai/serving_chat.py | 1 - vllm/entrypoints/openai/serving_completion.py | 1 - vllm/inputs/__init__.py | 12 +- vllm/inputs/data.py | 99 ++++++- vllm/inputs/preprocess.py | 143 +++++++-- vllm/inputs/registry.py | 56 +++- vllm/model_executor/models/chatglm.py | 4 +- vllm/model_executor/models/fuyu.py | 3 +- vllm/model_executor/models/h2ovl.py | 3 +- 
vllm/model_executor/models/internvl.py | 3 +- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/minicpmv.py | 3 +- vllm/model_executor/models/phi3v.py | 2 +- vllm/model_executor/models/pixtral.py | 3 +- vllm/model_executor/models/qwen.py | 3 +- vllm/model_executor/models/qwen2_vl.py | 6 +- vllm/model_executor/models/utils.py | 2 +- vllm/multimodal/__init__.py | 10 +- vllm/multimodal/audio.py | 12 +- vllm/multimodal/base.py | 188 ++---------- vllm/multimodal/image.py | 10 +- vllm/multimodal/inputs.py | 225 +++++++++++++++ vllm/multimodal/processing.py | 273 ++++++++++++++++++ vllm/multimodal/registry.py | 84 +++++- vllm/multimodal/utils.py | 3 +- vllm/multimodal/video.py | 20 +- vllm/sequence.py | 68 ++--- vllm/v1/engine/async_llm.py | 4 + vllm/v1/engine/llm_engine.py | 4 +- vllm/v1/engine/processor.py | 73 +++-- vllm/v1/request.py | 26 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/worker/cpu_model_runner.py | 41 ++- vllm/worker/hpu_model_runner.py | 6 +- vllm/worker/model_runner.py | 25 +- vllm/worker/neuron_model_runner.py | 22 +- vllm/worker/openvino_model_runner.py | 21 +- vllm/worker/xpu_model_runner.py | 16 +- 48 files changed, 1133 insertions(+), 437 deletions(-) rename tests/multimodal/{test_base.py => test_inputs.py} (97%) create mode 100644 vllm/multimodal/inputs.py create mode 100644 vllm/multimodal/processing.py diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst index 3d0d1aec69845..49b5285c45590 100644 --- a/docs/source/models/enabling_multimodal_inputs.rst +++ b/docs/source/models/enabling_multimodal_inputs.rst @@ -66,7 +66,7 @@ A default mapper is available for each modality in the core vLLM library. This i 3. Register maximum number of multi-modal tokens ------------------------------------------------ -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data instance +For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item and register it via :meth:`INPUT_REGISTRY.register_dummy_data `. .. 
code-block:: diff diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index e6ed87fc8ea08..163220c91a27d 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -6,7 +6,7 @@ from PIL.Image import Image from vllm.inputs import InputContext, token_inputs -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import IMAGE_ASSETS diff --git a/tests/multimodal/test_base.py b/tests/multimodal/test_inputs.py similarity index 97% rename from tests/multimodal/test_base.py rename to tests/multimodal/test_inputs.py index bfaf2cdeaa8d4..678bbb52b8c2f 100644 --- a/tests/multimodal/test_base.py +++ b/tests/multimodal/test_inputs.py @@ -1,6 +1,6 @@ import torch -from vllm.multimodal.base import MultiModalKwargs, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors def assert_nested_tensors_equal(expected: NestedTensors, diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py index 4d3bbd805c152..e6c8793989e13 100644 --- a/tests/multimodal/test_processor_kwargs.py +++ b/tests/multimodal/test_processor_kwargs.py @@ -1,12 +1,12 @@ from array import array -from typing import Mapping +from typing import Callable, Dict, Mapping, Optional from unittest.mock import patch import pytest import torch from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext, - InputRegistry, token_inputs) + InputRegistry, ProcessorInputs, token_inputs) from vllm.multimodal import MultiModalRegistry from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData @@ -34,10 +34,9 @@ def custom_processor(ctx: InputContext, inputs: DecoderOnlyInputs, *, num_crops=DEFAULT_NUM_CROPS): - # For testing purposes, we don't worry about the llm inputs / return - # type validation, and just return the value of the kwarg that we - # clobber. 
- return num_crops + # For testing purposes, we don't worry about the prompt + return token_inputs(prompt_token_ids=[], + mm_processor_kwargs={"num_crops": num_crops}) with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor", return_value=custom_processor): @@ -109,6 +108,21 @@ def _get_num_crops_info(init_num_crops: int, inference_num_crops: int): return init_kwargs, inference_kwargs, expected_seq_count +def _get_processed_num_crops( + processor: Callable[[ProcessorInputs], ProcessorInputs], + inference_kwargs: Optional[Dict[str, int]], +) -> int: + processed_inputs = processor( + token_inputs(prompt_token_ids=[], + prompt="", + mm_processor_kwargs=inference_kwargs)) + + assert "type" in processed_inputs + assert processed_inputs["type"] == "token" + assert "mm_processor_kwargs" in processed_inputs + return processed_inputs["mm_processor_kwargs"]["num_crops"] + + @pytest.mark.parametrize("init_num_crops,inference_num_crops", [ (None, None), (NUM_CROPS_OVERRIDE, None), @@ -124,10 +138,8 @@ def test_input_processor_kwargs(use_processor_mock, init_num_crops, ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs) processor = dummy_registry.create_input_processor(ctx.model_config) - num_crops_val = processor( - token_inputs(prompt_token_ids=[], - prompt="", - mm_processor_kwargs=inference_kwargs)) + num_crops_val = _get_processed_num_crops(processor, inference_kwargs) + assert num_crops_val == expected_seq_count @@ -153,10 +165,7 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock, processor = dummy_registry.create_input_processor(ctx.model_config) # Should filter out the inference time kwargs - num_crops_val = processor( - token_inputs(prompt_token_ids=[], - prompt="", - mm_processor_kwargs=mm_processor_kwargs)) + num_crops_val = _get_processed_num_crops(processor, mm_processor_kwargs) assert num_crops_val == DEFAULT_NUM_CROPS diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index e5a3b62258dd8..d614d3e67460f 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1,5 +1,5 @@ """Compare the with and without prefix caching.""" -from vllm.inputs import DecoderOnlyInputs +from vllm.inputs import token_inputs from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_manager import KVCacheManager, Request from vllm.v1.core.kv_cache_utils import hash_block_tokens @@ -8,7 +8,7 @@ def make_request(request_id, prompt_token_ids): return Request( request_id=request_id, - inputs=DecoderOnlyInputs(prompt_token_ids=prompt_token_ids), + inputs=token_inputs(prompt_token_ids=prompt_token_ids), sampling_params=SamplingParams(max_tokens=17), eos_token_id=100, arrival_time=0, diff --git a/vllm/config.py b/vllm/config.py index 5ba1c41fcaac1..002adb4316969 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -107,7 +107,7 @@ class ModelConfig: matches the model name exposed via the APIs. If multiple model names provided, the first name will be used. If not specified, the model name will be the same as `model`. - limit_mm_per_prompt: Maximum number of data instances per modality + limit_mm_per_prompt: Maximum number of data items per modality per prompt. Only applicable for multimodal models. 
override_neuron_config: Initialize non default neuron config or override default neuron config that are specific to Neuron devices, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1a371b52bb64b..5a5388708b1c6 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -19,6 +19,7 @@ from vllm.executor.gpu_executor import GPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding import ( @@ -729,6 +730,9 @@ def _error_callback(self, exc: Exception) -> None: self.set_errored(exc) self._request_tracker.propagate_exception(exc) + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.engine.input_preprocessor + async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 69ed6e6bd59d2..f5299746d845d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -30,7 +30,7 @@ from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, - PromptType) + PromptType, SingletonInputsAdapter) from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger @@ -39,6 +39,7 @@ from vllm.model_executor.guided_decoding import ( get_local_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams @@ -226,6 +227,7 @@ def __init__( usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, ) -> None: @@ -335,7 +337,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: model_config) self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) + self.tokenizer, + mm_registry) self.input_registry = input_registry self.input_processor = input_registry.create_input_processor( @@ -851,13 +854,6 @@ def add_request( ) processed_inputs = self.input_processor(preprocessed_inputs) - # This is a bit of a hack - copy the mm_processor_kwargs that were - # used in the input processor to the processed output, since these - # kwargs are presumed to be immutable and the values should be aligned - # between the input processor (here) and the input mapper. 
- processed_inputs["mm_processor_kwargs"] = preprocessed_inputs.get( - "mm_processor_kwargs") - self._add_processed_request( request_id=request_id, processed_inputs=processed_inputs, @@ -2019,7 +2015,7 @@ def _validate_model_inputs(self, inputs: ProcessorInputs, else: prompt_inputs = inputs - prompt_ids = prompt_inputs.get("prompt_token_ids") + prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 882742c2fc61b..fe21c58c775fe 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -31,6 +31,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.inputs import PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput @@ -94,6 +95,8 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, parallel_config=engine_config.parallel_config, enable_lora=bool(engine_config.lora_config), ) + self.input_preprocessor = InputPreprocessor(self.model_config, + self.tokenizer) # Send RPCGenerateRequest to the MQLLMEngine. self.input_socket: Socket = self.context.socket(zmq.constants.PUSH) @@ -345,6 +348,9 @@ async def _check_success(error_message: str, socket: Socket): or response != VLLM_RPC_SUCCESS_STR): raise ValueError(error_message) + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.input_preprocessor + async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): return await self.tokenizer.get_lora_tokenizer_async(lora_request) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index e0b59d94cfdc3..e15395d75c91f 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -62,7 +62,6 @@ def generate( async def beam_search( self, prompt: PromptType, - model_config: ModelConfig, request_id: str, params: BeamSearchParams, ) -> AsyncGenerator[RequestOutput, None]: @@ -74,13 +73,14 @@ async def beam_search( length_penalty = params.length_penalty include_stop_str_in_output = params.include_stop_str_in_output - tokenizer = await self.get_tokenizer() - input_preprocessor = InputPreprocessor(model_config, tokenizer) + preprocessor = await self.get_input_preprocessor() + tokenizer_group = preprocessor.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async() if is_explicit_encoder_decoder_prompt(prompt): raise NotImplementedError else: - processed_inputs = input_preprocessor._prompt_to_llm_inputs( + processed_inputs = preprocessor._prompt_to_llm_inputs( prompt, request_id=request_id, ) @@ -220,6 +220,7 @@ async def abort(self, request_id: str) -> None: Args: request_id: The unique id of the request. """ + ... @abstractmethod async def get_model_config(self) -> ModelConfig: @@ -228,8 +229,13 @@ async def get_model_config(self) -> ModelConfig: @abstractmethod async def get_decoding_config(self) -> DecodingConfig: - ... """Get the decoding configuration of the vLLM engine.""" + ... + + @abstractmethod + async def get_input_preprocessor(self) -> InputPreprocessor: + """Get the input processor of the vLLM engine.""" + ... 
@abstractmethod async def get_tokenizer( diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 74867d8de8843..09edaf98f7d17 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -190,7 +190,6 @@ async def create_chat_completion( if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( prompt=engine_prompt, - model_config=self.model_config, request_id=request_id, params=sampling_params, ) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index db31b1153d97e..936aae8f1c267 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -140,7 +140,6 @@ async def create_completion( if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( prompt=engine_prompt, - model_config=self.model_config, request_id=request_id, params=sampling_params, ) diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 68ac50a2c5a16..54fbd7a321a6f 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,9 +1,11 @@ from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, - SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, - TokensPrompt, build_explicit_enc_dec_prompt, - to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) -from .registry import DummyData, InputContext, InputRegistry + SingletonInputs, SingletonInputsAdapter, SingletonPrompt, + TextPrompt, TokenInputs, TokensPrompt, + build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, + token_inputs, zip_enc_dec_prompts) +from .registry import (DummyData, InputContext, InputProcessingContext, + InputRegistry) INPUT_REGISTRY = InputRegistry() """ @@ -26,12 +28,14 @@ "EncoderDecoderInputs", "ProcessorInputs", "SingletonInputs", + "SingletonInputsAdapter", "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", "zip_enc_dec_prompts", "INPUT_REGISTRY", "DummyData", "InputContext", + "InputProcessingContext", "InputRegistry", ] diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 46b41f431bec7..07ff9faa50f13 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,10 +1,14 @@ +from dataclasses import dataclass +from functools import cached_property from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, Optional, Tuple, Union, cast) -from typing_extensions import NotRequired, TypedDict, TypeVar +import torch +from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never if TYPE_CHECKING: from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict + from vllm.multimodal.inputs import MultiModalInputsV2 class TextPrompt(TypedDict): @@ -36,13 +40,13 @@ class TokensPrompt(TypedDict): multi_modal_data: NotRequired["MultiModalDataDict"] """ - Optional multi-modal data to pass to the model, + DEPRECATED: Optional multi-modal data to pass to the model, if the model supports it. """ mm_processor_kwargs: NotRequired[Dict[str, Any]] """ - Optional multi-modal processor kwargs to be forwarded to the + DEPRECATED: Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them. 
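Note: the two DEPRECATED markers in the hunk above only concern pre-tokenized TokensPrompt inputs; supplying multi-modal data together with prompt text remains supported and is what the new processor path consumes. A minimal usage sketch, not part of the patch itself — the model name, chat template, image path and token IDs are illustrative placeholders:

    from PIL import Image

    from vllm import LLM

    llm = LLM(model="llava-hf/llava-1.5-7b-hf")   # hypothetical multimodal model
    image = Image.open("example.jpg")             # placeholder path

    # Text prompt plus multi_modal_data: with this patch series, a model that
    # defines a multi-modal processor has this routed through
    # MultiModalProcessor.apply() inside InputPreprocessor._process_multimodal().
    outputs = llm.generate({
        "prompt": "USER: <image>\nWhat is shown here? ASSISTANT:",
        "multi_modal_data": {"image": image},
    })

    # Deprecated by the hunk above: attaching multi_modal_data (and
    # mm_processor_kwargs) to a pre-tokenized TokensPrompt; the preprocessor now
    # has to decode the token IDs back into text before it can run the processor.
    outputs = llm.generate({
        "prompt_token_ids": [1, 3148, 1001, 29901],   # illustrative token IDs only
        "multi_modal_data": {"image": image},
    })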
@@ -176,7 +180,7 @@ def token_inputs( return inputs -DecoderOnlyInputs = TokenInputs +DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -191,19 +195,91 @@ class EncoderDecoderInputs(TypedDict): This specifies the required data for encoder-decoder models. """ - encoder: TokenInputs + encoder: Union[TokenInputs, "MultiModalInputsV2"] """The inputs for the encoder portion.""" - decoder: TokenInputs + decoder: Union[TokenInputs, "MultiModalInputsV2"] """The inputs for the decoder portion.""" -SingletonInputs = TokenInputs +SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"] """ A processed :class:`SingletonPrompt` which can be passed to :class:`vllm.sequence.Sequence`. """ + +@dataclass +class SingletonInputsAdapter: + """ + Unified interface to access the components of :class:`SingletonInputs`. + """ + inputs: SingletonInputs + + @cached_property + def prompt(self) -> Optional[str]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt") + + assert_never(inputs) + + @cached_property + def prompt_token_ids(self) -> List[int]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt_token_ids", []) + + assert_never(inputs) + + @cached_property + def prompt_embeds(self) -> Optional[torch.Tensor]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return None + + assert_never(inputs) + + @cached_property + def multi_modal_data(self) -> "MultiModalDataDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_data", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_kwargs", {}) + + assert_never(inputs) + + @cached_property + def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_placeholders", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_placeholders", {}) + + assert_never(inputs) + + @cached_property + def mm_processor_kwargs(self) -> Dict[str, Any]: + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("mm_processor_kwargs", {}) + + if inputs["type"] == "multimodal": + return {} + + assert_never(inputs) + + ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] """ The inputs to :data:`vllm.inputs.InputProcessor`. @@ -234,10 +310,11 @@ def zip_enc_dec_prompts( ) -> List[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of - :class:`ExplicitEncoderDecoderPrompt` instances. mm_processor_kwargs - may also be provided; if a dict is passed, the same dictionary will be - used for every encoder/decoder prompt. If an iterable is provided, it will - be zipped with the encoder/decoder prompts. + :class:`ExplicitEncoderDecoderPrompt` instances. + + ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same + dictionary will be used for every encoder/decoder prompt. If an iterable is + provided, it will be zipped with the encoder/decoder prompts. 
""" if mm_processor_kwargs is None: mm_processor_kwargs = cast(Dict[str, Any], {}) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 509b0448b9e51..fdf28615fda10 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,11 +1,13 @@ import asyncio -from typing import List, Optional +from typing import List, Mapping, Optional, Union from typing_extensions import assert_never from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.utils import print_warning_once @@ -23,11 +25,13 @@ def __init__( self, model_config: ModelConfig, tokenizer: Optional[BaseTokenizerGroup], + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: super().__init__() self.model_config = model_config self.tokenizer = tokenizer + self.mm_registry = mm_registry def get_tokenizer_group(self) -> BaseTokenizerGroup: if self.tokenizer is None: @@ -198,14 +202,79 @@ async def _tokenize_prompt_async( prompt=prompt, lora_request=lora_request) + def _can_process_multimodal(self) -> bool: + model_config = self.model_config + + if not model_config.is_multimodal_model: + raise ValueError("Your model does not support multi-modal inputs") + + # Interim measure so we can handle models that have yet to be + # updated to use the new multi-modal processor + can_process_multimodal = self.mm_registry.has_processor(model_config) + if not can_process_multimodal: + logger.info( + "Your model uses the legacy input pipeline instead of the new " + "multi-modal processor. Please note that the legacy pipeline " + "will be removed in a future release. For more details, see: " + "https://github.com/vllm-project/vllm/issues/10114") + + return can_process_multimodal + + def _process_multimodal( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """ + Apply the model's multi-modal processor to a multi-modal prompt, + returning the corresponding token IDs and metadata. 
+ """ + tokenizer_group = self.get_tokenizer_group() + tokenizer = tokenizer_group.get_lora_tokenizer(lora_request) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + + if isinstance(prompt, list): + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + + async def _process_multimodal_async( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """Async version of :meth:`_process_multimodal`.""" + tokenizer_group = self.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request + ) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + if isinstance(prompt, list): + logger.warning("Passing `multi_modal_data` in TokensPrompt is" + "deprecated and will be removed in a future update") + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + def _prompt_to_llm_inputs( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, ) -> SingletonInputs: - ''' - Extract the components of any single encoder or decoder input prompt. + """ + Extract the singleton inputs from a prompt. Arguments: @@ -215,12 +284,8 @@ def _prompt_to_llm_inputs( Returns: - * prompt - * prompt_token_ids - * multi_modal_data - * mm_processor_kwargs (request-level input processor/mapper overrides) - ''' - + * :class:`SingletonInputs` instance + """ parsed = parse_singleton_prompt(prompt) if parsed["type"] == "str": @@ -243,6 +308,14 @@ def _prompt_to_llm_inputs( multi_modal_data = tokens_content.get("multi_modal_data") mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + return token_inputs( prompt_token_ids=prompt_token_ids, multi_modal_data=multi_modal_data, @@ -253,13 +326,22 @@ def _prompt_to_llm_inputs( text_content = parsed["content"] prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = self._tokenize_prompt( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = text_content.get("multi_modal_data") - mm_processor_kwargs = text_content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, @@ -299,6 +381,14 @@ async def _prompt_to_llm_inputs_async( multi_modal_data = tokens_content.get("multi_modal_data") mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + return token_inputs( prompt_token_ids=prompt_token_ids, multi_modal_data=multi_modal_data, @@ -309,13 +399,22 @@ async def _prompt_to_llm_inputs_async( text_content = 
parsed["content"] prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = await self._tokenize_prompt_async( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = text_content.get("multi_modal_data") - mm_processor_kwargs = text_content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, @@ -331,7 +430,8 @@ def _build_enc_dec_llm_inputs( encoder_inputs: SingletonInputs, decoder_inputs: Optional[SingletonInputs], ) -> EncoderDecoderInputs: - if encoder_inputs["type"] == "token": + if (encoder_inputs["type"] == "token" + or encoder_inputs["type"] == "multimodal"): pass else: assert_never(encoder_inputs) @@ -340,7 +440,8 @@ def _build_enc_dec_llm_inputs( dec_token_ids = self._prepare_decoder_input_ids_for_generation( None) decoder_inputs = token_inputs(dec_token_ids) - elif decoder_inputs["type"] == "token": + elif (decoder_inputs["type"] == "token" + or decoder_inputs["type"] == "multimodal"): dec_token_ids = self._prepare_decoder_input_ids_for_generation( decoder_inputs["prompt_token_ids"]) decoder_inputs["prompt_token_ids"] = dec_token_ids @@ -361,7 +462,7 @@ def _process_encoder_decoder_prompt( prompt: PromptType, request_id: str, ) -> EncoderDecoderInputs: - ''' + """ For encoder/decoder models only: Process an input prompt into an :class:`EncoderDecoderInputs` instance. @@ -391,8 +492,7 @@ def _process_encoder_decoder_prompt( Returns: * :class:`EncoderDecoderInputs` instance - ''' - + """ encoder_inputs: SingletonInputs decoder_inputs: Optional[SingletonInputs] @@ -460,7 +560,8 @@ def _build_decoder_only_llm_inputs( prompt_inputs: DecoderOnlyInputs, prompt_adapter_request: Optional[PromptAdapterRequest], ) -> DecoderOnlyInputs: - if prompt_inputs["type"] == "token": + if (prompt_inputs["type"] == "token" + or prompt_inputs["type"] == "multimodal"): prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter( prompt_inputs["prompt_token_ids"], prompt_adapter_request=prompt_adapter_request, @@ -477,7 +578,7 @@ def _process_decoder_only_prompt( lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> DecoderOnlyInputs: - ''' + """ For decoder-only models: Process an input prompt into an :class:`DecoderOnlyInputs` instance. 
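For orientation, this is roughly the shape of the dictionary that _process_multimodal() returns when the new path is taken. The sketch below is not output of the patch: the token IDs, the 32000 placeholder token and the 576-token expansion are illustrative (loosely LLaVA-style), and mm_kwargs is shown as a plain dict where the real type is MultiModalKwargs. The keys mirror the MultiModalInputsV2 TypedDict added later in this patch in vllm/multimodal/inputs.py:

    import torch

    example_v2_inputs = {
        "type": "multimodal",
        # Original prompt text, deliberately left unprocessed.
        "prompt": "USER: <image>\nWhat is shown here? ASSISTANT:",
        # Token IDs after the HF processor has expanded the image placeholder.
        "prompt_token_ids": [1, 3148, 1001, 29901] + [32000] * 576 + [13],
        # Keyword arguments batched and passed straight to the model's forward().
        "mm_kwargs": {"pixel_values": torch.zeros(1, 3, 336, 336)},  # placeholder
        # Where the expanded placeholder tokens sit inside prompt_token_ids.
        "mm_placeholders": {"image": [{"offset": 4, "length": 576}]},
    }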
@@ -491,7 +592,7 @@ def _process_decoder_only_prompt( Returns: * :class:`DecoderOnlyInputs` instance - ''' + """ prompt_comps = self._prompt_to_llm_inputs( prompt, diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 7d7a797be4f60..68b4756331e6d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -5,14 +5,17 @@ Optional, Protocol, Type, cast) from torch import nn -from transformers import PretrainedConfig -from typing_extensions import TypeVar +from transformers import PretrainedConfig, ProcessorMixin +from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger +from vllm.transformers_utils.processor import cached_get_processor +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once, resolve_mm_processor_kwargs) -from .data import ProcessorInputs +from .data import ProcessorInputs, SingletonInputs +from .parse import is_encoder_decoder_inputs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -61,6 +64,19 @@ def get_hf_image_processor_config(self) -> Dict[str, Any]: return self.model_config.hf_image_processor_config +@dataclass(frozen=True) +class InputProcessingContext(InputContext): + tokenizer: AnyTokenizer + """The tokenizer used to tokenize the inputs.""" + + def get_hf_processor(self) -> ProcessorMixin: + return cached_get_processor( + self.model_config.tokenizer, + tokenizer=self.tokenizer, # Override the tokenizer with ours + trust_remote_code=self.model_config.trust_remote_code, + ) + + N = TypeVar("N", bound=Type[nn.Module]) @@ -94,7 +110,7 @@ def __call__( ... -class _MultiModalCounts(UserDict): +class _MultiModalCounts(UserDict[str, int]): """ Wraps `mm_counts` for a more informative error message when attempting to access a plugin that does not exist. 
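The SingletonInputsAdapter added in vllm/inputs/data.py is what lets call sites such as _validate_model_inputs() in the llm_engine.py hunk stay agnostic to which of the two input types they received. A minimal sketch of that unified access, assuming the patched vllm.inputs module is importable; the token IDs and placeholder range are illustrative:

    from vllm.inputs import SingletonInputsAdapter, token_inputs

    # Legacy "token" inputs produced by token_inputs()
    legacy = SingletonInputsAdapter(
        token_inputs(prompt_token_ids=[1, 2, 3], prompt="a b c"))
    assert legacy.prompt_token_ids == [1, 2, 3]
    assert legacy.multi_modal_placeholders == {}  # nothing recorded on this path

    # New "multimodal" inputs, shaped like the output of MultiModalProcessor.apply()
    v2 = SingletonInputsAdapter({
        "type": "multimodal",
        "prompt": "<image> describe this",
        "prompt_token_ids": [9, 9, 9],             # illustrative IDs only
        "mm_kwargs": {},                           # would hold processed tensors
        "mm_placeholders": {"image": [{"offset": 0, "length": 1}]},
    })
    # The same attribute names work on both paths:
    assert v2.prompt_token_ids == [9, 9, 9]
    assert v2.multi_modal_data == {}               # maps onto mm_kwargs here
    assert v2.multi_modal_placeholders == {"image": [{"offset": 0, "length": 1}]}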
@@ -287,6 +303,21 @@ def _get_model_input_processor(self, model_cls: Type[nn.Module]): return self._input_processors_by_model_type \ .get(model_cls, self._default_input_processor) + def _ensure_mm_kwargs( + self, + inputs: SingletonInputs, + mm_processor_kwargs: Dict[str, Any], + ): + if inputs["type"] == "token": + # In case the input processor for that model fails to set it + if "mm_processor_kwargs" not in inputs: + inputs["mm_processor_kwargs"] = mm_processor_kwargs + elif inputs["type"] == "multimodal": + # Be more strict in V2 + assert "mm_kwargs" in inputs + else: + assert_never(inputs["type"]) + def process_input(self, model_config: "ModelConfig", inputs: ProcessorInputs) -> ProcessorInputs: """ @@ -312,8 +343,21 @@ def process_input(self, model_config: "ModelConfig", processor, ) - return processor(InputContext(model_config), inputs, - **mm_processor_kwargs) + processed_inputs = processor( + InputContext(model_config), + inputs, + **mm_processor_kwargs, + ) + + if is_encoder_decoder_inputs(processed_inputs): + self._ensure_mm_kwargs(processed_inputs["encoder"], + mm_processor_kwargs) + self._ensure_mm_kwargs(processed_inputs["decoder"], + mm_processor_kwargs) + else: + self._ensure_mm_kwargs(processed_inputs, mm_processor_kwargs) + + return processed_inputs def create_input_processor(self, model_config: "ModelConfig"): """ diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 08ed84aa9c71a..6ec2d5a2a3909 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -30,8 +30,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.base import MultiModalData +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalData, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 37f38d4d76671..b39dfe706e0df 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -32,8 +32,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 767171dad7c7b..df7e768fe14d3 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -15,8 +15,7 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from 
vllm.utils import is_list_of diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 77efc9a26ef7a..07165ea688f94 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -25,8 +25,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index af712bf8f9506..005ae5e03cfed 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -17,7 +17,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import NestedTensors +from vllm.multimodal.inputs import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index aae534c0b5949..999739ccd98bf 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -51,8 +51,7 @@ from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.models.utils import LLMWrapper from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index de03d28638cda..4db65edc174f1 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -39,7 +39,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import NestedTensors, PlaceholderRange +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6bd5e119dd2dd..a3e30ea2dd299 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -29,8 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import 
IntermediateTensors, SequenceData diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 5acd87146c54e..3d26ede722dd1 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -42,8 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData from vllm.utils import is_list_of diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 9a19ccbca3f1e..2335baf459771 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -60,10 +60,10 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalKwargs) -from vllm.multimodal.base import MultiModalData +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, + MultiModalKwargs) from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData from vllm.transformers_utils.config import uses_mrope diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ca4fc8ec952bf..1fc6c1be4b7bb 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -15,7 +15,7 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import ModelRegistry -from vllm.multimodal.base import MultiModalPlaceholderMap, NestedTensors +from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import is_pin_memory_available diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 14911853abc73..03a5f3a91f7a1 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,7 +1,8 @@ -from .base import (BatchedTensorInputs, MultiModalDataBuiltins, - MultiModalDataDict, MultiModalKwargs, - MultiModalPlaceholderDict, MultiModalPlaceholderMap, - MultiModalPlugin, NestedTensors) +from .base import MultiModalPlaceholderMap, MultiModalPlugin +from .inputs import (BatchedTensorInputs, MultiModalData, + MultiModalDataBuiltins, MultiModalDataDict, + MultiModalKwargs, MultiModalPlaceholderDict, + NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -15,6 +16,7 @@ __all__ = [ "BatchedTensorInputs", + "MultiModalData", "MultiModalDataBuiltins", "MultiModalDataDict", "MultiModalKwargs", diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index e71ae5feec1c6..1a230602966d4 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,5 +1,7 @@ from vllm.inputs.registry import InputContext -from vllm.multimodal.base 
import MultiModalKwargs, MultiModalPlugin + +from .base import MultiModalPlugin +from .inputs import AudioItem, MultiModalData, MultiModalKwargs class AudioPlugin(MultiModalPlugin): @@ -8,8 +10,12 @@ class AudioPlugin(MultiModalPlugin): def get_data_key(self) -> str: return "audio" - def _default_input_mapper(self, ctx: InputContext, data: object, - **mm_processor_kwargs) -> MultiModalKwargs: + def _default_input_mapper( + self, + ctx: InputContext, + data: MultiModalData[AudioItem], + **mm_processor_kwargs, + ) -> MultiModalKwargs: raise NotImplementedError("There is no default audio input mapper") def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index fa514d3fcb3b7..6eec660e42ac4 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,180 +1,23 @@ from abc import ABC, abstractmethod -from collections import UserDict, defaultdict -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping, - NamedTuple, Optional, Tuple, Type, TypedDict, TypeVar, - Union, cast, final) - -import numpy as np -import torch -import torch.types -from PIL import Image +from collections import defaultdict +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, + Optional, Sequence, Tuple, Type, TypeVar, Union) + from torch import nn -from typing_extensions import TypeAlias from vllm.inputs import InputContext from vllm.logger import init_logger -from vllm.utils import (JSONTree, get_allowed_kwarg_only_overrides, is_list_of, - json_map_leaves, resolve_mm_processor_kwargs) +from vllm.utils import (get_allowed_kwarg_only_overrides, + resolve_mm_processor_kwargs) if TYPE_CHECKING: from vllm.config import ModelConfig from vllm.sequence import SequenceGroupMetadata -logger = init_logger(__name__) - -NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] -""" -Uses a list instead of a tensor if the dimensions of each element do not match. -""" - -BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] -""" -A dictionary containing nested tensors which have been batched via -:meth:`MultiModalKwargs.batch`. -""" - - -class _MultiModalKwargsBase(UserDict[str, NestedTensors]): - pass - - -class MultiModalKwargs(_MultiModalKwargsBase): - """ - A dictionary that represents the keyword arguments to - :meth:`~torch.nn.Module.forward`. - """ - - @staticmethod - def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: - """ - Recursively stacks lists of tensors when they all have the same shape. - """ - if isinstance(nested_tensors, torch.Tensor): - return nested_tensors - - if isinstance(nested_tensors, np.ndarray): - return torch.from_numpy(nested_tensors) - - if isinstance(nested_tensors, (int, float)): - return torch.tensor(nested_tensors) +from .inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs, + PlaceholderRange) - stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] - if not is_list_of(stacked, torch.Tensor, check="all"): - # Only tensors (not lists) can be stacked. - return stacked - - tensors_ = cast(List[torch.Tensor], stacked) - if any(t.shape != tensors_[0].shape for t in tensors_): - # The tensors have incompatible shapes and can't be stacked. - return tensors_ - - return torch.stack(tensors_) - - @staticmethod - def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: - """ - Batch multiple inputs together into a dictionary. - - The resulting dictionary has the same keys as the inputs. 
- If the corresponding value from each input is a tensor and they all - share the same shape, the output value is a single batched tensor; - otherwise, the output value is a list containing the original value - from each input. - """ - if len(inputs_list) == 0: - return {} - - item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) - - for inputs in inputs_list: - # For models that supports multiple modalities (e.g. Qwen2-VL), - # different modalities will return different data keys, - # so batch() should skip the same key check. - - for k, v in inputs.items(): - item_lists[k].append(v) - - return { - k: MultiModalKwargs._try_stack(item_list) - for k, item_list in item_lists.items() - } - - @staticmethod - def as_kwargs( - batched_inputs: BatchedTensorInputs, - *, - device: torch.types.Device, - ) -> BatchedTensorInputs: - json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) - - json_mapped = json_map_leaves( - lambda x: x.to(device, non_blocking=True), - json_inputs, - ) - - return cast(BatchedTensorInputs, json_mapped) - - -_T = TypeVar("_T") - -MultiModalData: TypeAlias = Union[_T, List[_T]] -""" -Either a single data instance, or a list of data instances. - -The number of data instances allowed per modality is restricted by -`--limit-mm-per-prompt`. -""" - - -@final -class MultiModalDataBuiltins(TypedDict, total=False): - """Modality types that are predefined by vLLM.""" - - image: MultiModalData[Image.Image] - """The input image(s).""" - - audio: MultiModalData[Tuple[np.ndarray, Union[int, float]]] - """The input audio item(s) and corresponding sampling rate(s).""" - - video: MultiModalData[Tuple[np.ndarray]] - """The input video(s).""" - - -MultiModalDataDict = Union[MultiModalDataBuiltins, - Mapping[str, MultiModalData[object]]] -""" -A dictionary containing an item for each modality type to input. - -Note: - This dictionary also accepts modality keys defined outside - :class:`MultiModalDataBuiltins` as long as a customized plugin is registered - through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. -""" - - -class PlaceholderRange(TypedDict): - """ - Placeholder location information for multi-modal data. - - For example: - Prompt: AAAA BBBB What is in these images? - Images A and B will have: - A: { "offset": 0, "length": 4 } - B: { "offset": 5, "length": 4 } - """ - - offset: int - """The start index of the placeholder in the prompt.""" - - length: int - """The length of the placeholder.""" - - -MultiModalPlaceholderDict = Mapping[str, List[PlaceholderRange]] -""" -A dictionary containing placeholder ranges. -""" +logger = init_logger(__name__) MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], MultiModalKwargs] @@ -192,6 +35,7 @@ class PlaceholderRange(TypedDict): model. This does not include tokens that correspond to the input text. 
""" +_T = TypeVar("_T") N = TypeVar("N", bound=Type[nn.Module]) @@ -224,7 +68,7 @@ def get_data_key(self) -> str: def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[Any], **mm_processor_kwargs, ) -> MultiModalKwargs: """ @@ -273,8 +117,8 @@ def wrapper(model_cls: N) -> N: def map_input( self, model_config: "ModelConfig", - data: MultiModalData[object], - mm_processor_kwargs: Dict[str, Any], + data: MultiModalData[Any], + mm_processor_kwargs: Optional[Dict[str, Any]], ) -> MultiModalKwargs: """ Transform the data into a dictionary of model inputs using the @@ -289,6 +133,7 @@ def map_input( - :ref:`input_processing_pipeline` - :ref:`enabling_multimodal_inputs` """ + # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture @@ -300,6 +145,9 @@ def map_input( raise KeyError(f"No input mapper in {self} is registered for " f"model class {model_cls.__name__}.") + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + # In the case of the default mapper, we have to get resource # processor through its HuggingFace autoclass; since this goes # through **kwargs, we can't inspect it the same way, so we allow @@ -508,7 +356,7 @@ def append_items_from_seq_group( self, positions: range, multi_modal_items: List[_T], - multi_modal_placeholders: List[PlaceholderRange], + multi_modal_placeholders: Sequence[PlaceholderRange], ) -> List[_T]: """ Adds the multi-modal items that intersect ```positions`` to this diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 589b46266b08d..97bbce1ce1570 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -3,14 +3,14 @@ import torch from PIL import Image -from transformers.image_processing_base import BatchFeature from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.processor import get_image_processor from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalKwargs, MultiModalPlugin +from .base import MultiModalPlugin +from .inputs import ImageItem, MultiModalData, MultiModalKwargs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -41,15 +41,11 @@ def _get_hf_image_processor( def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[ImageItem], **mm_processor_kwargs, ) -> MultiModalKwargs: model_config = ctx.model_config - # Processed by input processor - if isinstance(data, BatchFeature): - return MultiModalKwargs(data.data) - # PIL image if isinstance(data, Image.Image) or is_list_of(data, Image.Image): image_processor = self._get_hf_image_processor( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py new file mode 100644 index 0000000000000..64a4c58d5509c --- /dev/null +++ b/vllm/multimodal/inputs.py @@ -0,0 +1,225 @@ +from collections import UserDict, defaultdict +from typing import (Any, Dict, List, Literal, Mapping, Sequence, Tuple, + TypedDict, TypeVar, Union, cast, final) + +import numpy as np +import torch +import torch.types +from PIL.Image import Image +from typing_extensions import TypeAlias + +from vllm.utils import JSONTree, is_list_of, json_map_leaves + +_T = TypeVar("_T") + +# yapf: disable +ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] +""" +A :class:`transformers.image_utils.ImageInput` representing a single image, +which can be passed to a HuggingFace :code:`ImageProcessor`. 
+""" + +VideoItem: TypeAlias = Union[ + List[Image], + np.ndarray, + torch.Tensor, + List[np.ndarray], + List[torch.Tensor], +] +""" + +A :class:`transformers.image_utils.VideoInput` representing a single video, +which can be passed to a HuggingFace :code:`VideoProcessor`. +""" + +AudioItem: TypeAlias = Union[ + np.ndarray, + List[float], + Tuple[np.ndarray, float], # DEPRECATED: Use mm_processor_kwargs instead +] +""" +Represents a single audio that can be inputted to a HuggingFace +:code:`AudioProcessor`. +""" +# yapf: enable + +MultiModalData: TypeAlias = Union[_T, List[_T]] +""" +Either a single data item, or a list of data items. + +The number of data items allowed per modality is restricted by +:code:`--limit-mm-per-prompt`. +""" + + +@final +class MultiModalDataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: MultiModalData[ImageItem] + """The input image(s).""" + + video: MultiModalData[VideoItem] + """The input video(s).""" + + audio: MultiModalData[AudioItem] + """The input audio(s).""" + + +MultiModalDataDict: TypeAlias = Mapping[str, MultiModalData[Any]] +""" +A dictionary containing an entry for each modality type to input. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalDataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + + +class PlaceholderRange(TypedDict): + """ + Placeholder location information for multi-modal data. + + For example: + Prompt: AAAA BBBB What is in these images? + Images A and B will have: + A: { "offset": 0, "length": 4 } + B: { "offset": 5, "length": 4 } + """ + + offset: int + """The start index of the placeholder in the prompt.""" + + length: int + """The length of the placeholder.""" + + +NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] +""" +Uses a list instead of a tensor if the dimensions of each element do not match. +""" + +BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] +""" +A dictionary containing nested tensors which have been batched via +:meth:`MultiModalKwargs.batch`. +""" + + +class MultiModalKwargs(UserDict[str, NestedTensors]): + """ + A dictionary that represents the keyword arguments to + :meth:`~torch.nn.Module.forward`. + """ + + @staticmethod + def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: + """ + Stack the inner dimensions that have the same shape in + a nested list of tensors. + + Thus, a dimension represented by a list means that the inner + dimensions are different for each element along that dimension. + """ + if isinstance(nested_tensors, torch.Tensor): + return nested_tensors + + # TODO: Remove these once all models have been migrated + if isinstance(nested_tensors, np.ndarray): + return torch.from_numpy(nested_tensors) + if isinstance(nested_tensors, (int, float)): + return torch.tensor(nested_tensors) + + stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] + if not is_list_of(stacked, torch.Tensor, check="all"): + # Only tensors (not lists) can be stacked. + return stacked + + tensors_ = cast(List[torch.Tensor], stacked) + if any(t.shape != tensors_[0].shape for t in tensors_): + # The tensors have incompatible shapes and can't be stacked. + return tensors_ + + return torch.stack(tensors_) + + @staticmethod + def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: + """ + Batch multiple inputs together into a dictionary. 
+ + The resulting dictionary has the same keys as the inputs. + If the corresponding value from each input is a tensor and they all + share the same shape, the output value is a single batched tensor; + otherwise, the output value is a list containing the original value + from each input. + """ + if len(inputs_list) == 0: + return {} + + # We need to consider the case where each item in the batch + # contains different modalities (i.e. different keys). + item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) + + for inputs in inputs_list: + for k, v in inputs.items(): + item_lists[k].append(v) + + return { + k: MultiModalKwargs._try_stack(item_list) + for k, item_list in item_lists.items() + } + + @staticmethod + def as_kwargs( + batched_inputs: BatchedTensorInputs, + *, + device: torch.types.Device, + ) -> BatchedTensorInputs: + json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) + + json_mapped = json_map_leaves( + lambda x: x.to(device, non_blocking=True), + json_inputs, + ) + + return cast(BatchedTensorInputs, json_mapped) + + +MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] +""" +A dictionary containing placeholder ranges. +""" + + +class MultiModalInputsV2(TypedDict): + """ + Represents the outputs of :class:`vllm.multimodal.MultiModalProcessor`, + ready to be passed to vLLM internals. + """ + + type: Literal["multimodal"] + """The type of inputs.""" + + prompt: str + """ + The original, unprocessed prompt text. + + Note: + Since prompt text is not required by vLLM internals, we leave this + unprocessed to save CPU computation. You can still call + :code:`tokenizer.decode(prompt_token_ids)` to get the processed text. + """ + + prompt_token_ids: List[int] + """The processed token IDs which includes placeholder tokens.""" + + mm_kwargs: MultiModalKwargs + """Keyword arguments to be directly passed to the model after batching.""" + + mm_placeholders: MultiModalPlaceholderDict + """ + For each modality, information about the placeholder tokens in + :code:`prompt_token_ids`. + """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py new file mode 100644 index 0000000000000..88a924da174a6 --- /dev/null +++ b/vllm/multimodal/processing.py @@ -0,0 +1,273 @@ +from dataclasses import dataclass +from functools import lru_cache, partial +from typing import (Any, Callable, Collection, Generic, List, Mapping, + Optional, TypedDict, TypeVar, final) + +from transformers import BatchFeature +from typing_extensions import TypeAlias + +from vllm.inputs import InputProcessingContext +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import is_list_of + +from .inputs import (AudioItem, ImageItem, MultiModalDataDict, + MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, + VideoItem) + +_T = TypeVar("_T") + +ReplacementFunc: TypeAlias = Callable[[_T, BatchFeature, int], List[int]] +""" +Given the original data item, HF-processed data, and index of the processed +item, output the replacement token IDs to be allocated in vLLM. +""" + + +@dataclass +class ModalityProcessingMetadata(Generic[_T]): + placeholder_replacements: Mapping[str, ReplacementFunc] + """ + A dictionary where each item represents the original placeholder in the + prompt text and the corresponding replacement. 
+ """ + + +class MultiModalProcessingMetadataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: ModalityProcessingMetadata[ImageItem] + video: ModalityProcessingMetadata[VideoItem] + audio: ModalityProcessingMetadata[AudioItem] + + +MultiModalProcessingMetadata: TypeAlias = \ + Mapping[str, ModalityProcessingMetadata[Any]] +""" +A dictionary containing an entry for each modality type to process. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalProcessingMetadataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + +MultiModalMultiData: TypeAlias = List[_T] +""" +A list of data items, where the number of data items allowed +per modality is restricted by :code:`--limit-mm-per-prompt`. +""" + + +@final +class MultiModalMultiDataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: MultiModalMultiData[ImageItem] + """The input images.""" + + video: MultiModalMultiData[VideoItem] + """The input videos.""" + + audio: MultiModalMultiData[AudioItem] + """The input audios.""" + + +MultiModalMultiDataDict: TypeAlias = Mapping[str, MultiModalMultiData[Any]] +""" +A dictionary containing an entry for each modality type to input. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalMultiDataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + + +def to_multi_format(data: MultiModalDataDict) -> MultiModalMultiDataDict: + """ + Convert a :class:`MultiModalDataDict` containing single data items + to a :class:`MultiModalMultiDataDict` containing multiple data items + per entry. + """ + multi_data: Mapping[str, MultiModalMultiData[Any]] = {} + + for k, v in data.items(): + # yapf: disable + if k == "video": + # Special case since even a single item can be a list + multi_data[k] = v if is_list_of(v, list) else [v] # type: ignore[index] + elif k in ("image", "audio"): + multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] + else: + multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] + # yapf: enable + + return multi_data + + +def encode_no_special_tokens( + tokenizer: AnyTokenizer, + text: str, +) -> List[int]: + """ + Backend-agnostic equivalent of HF's + :code:`tokenizer.encode(text, add_special_tokens=False)`. + """ + if isinstance(tokenizer, MistralTokenizer): + return tokenizer.tokenizer.encode(text, bos=False, eos=False) + + return tokenizer.encode(text, add_special_tokens=False) + + +@lru_cache +def candidate_placeholders( + tokenizer: AnyTokenizer, + placeholder_text: str, +) -> Collection[List[int]]: + """Generate token ID sequences that may represent a placeholder text.""" + # When the placeholder text is not mapped to a special token ID, + # it may be tokenized differently based on whether it is at the start/end + # of the string. 
So, we go through each combination of whether the text + # is at the start and end boundaries of the string + + # Matches the placeholder when it is in the middle of the string + start_id, = encode_no_special_tokens(tokenizer, "a") + end_id, = encode_no_special_tokens(tokenizer, "b") + + candidate_basic = encode_no_special_tokens(tokenizer, placeholder_text) + + start_id_, *candidate_a = encode_no_special_tokens( + tokenizer, + f"a{placeholder_text}", + ) + assert start_id == start_id_ + + start_id_, *candidate_ab, end_id_ = encode_no_special_tokens( + tokenizer, + f"a{placeholder_text}b", + ) + assert start_id == start_id_ and end_id == end_id_ + + *candidate_b, end_id_ = encode_no_special_tokens( + tokenizer, + f"{placeholder_text}b", + ) + assert end_id == end_id_ + + # Remove duplicates (need to convert to tuple to be hashable) + unique_candidates = { + tuple(c) + for c in [candidate_basic, candidate_a, candidate_ab, candidate_b] + } + + # Convert back to list + return [list(c) for c in unique_candidates] + + +def apply_placeholders( + token_ids: List[int], + placeholder_ids: List[int], + get_replacement_ids: Callable[[], List[int]], +) -> Optional[PlaceholderRange]: + """ + Find the first occurrence of :code:`placeholder_ids`, + and replace it with the output of :code:`get_replacement_ids`. + + This function updates :code:`token_ids` in place. + """ + placeholder_length = len(placeholder_ids) + + for start_idx in range(len(token_ids) - placeholder_length + 1): + if token_ids[start_idx:placeholder_length] == placeholder_ids: + token_ids[start_idx:placeholder_length] = get_replacement_ids() + + return PlaceholderRange(offset=start_idx, + length=placeholder_length) + + return None + + +class MultiModalProcessor: + """ + Helper class to process multi-modal inputs to be used in vLLM. 
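+
+    A rough usage sketch; the prompt, image and metadata values below are
+    placeholders rather than anything produced by vLLM itself::
+
+        ctx = InputProcessingContext(model_config, tokenizer)
+        processor = MultiModalProcessor(ctx=ctx, metadata=metadata)
+
+        mm_inputs = processor(
+            prompt="USER: <image> What is shown here? ASSISTANT:",
+            mm_data={"image": image},
+            mm_processor_kwargs={},
+        )
+        # mm_inputs["prompt_token_ids"] holds the token IDs with placeholders
+        # expanded, mm_inputs["mm_kwargs"] the batched HF processor outputs,
+        # and mm_inputs["mm_placeholders"] the per-modality placeholder
+        # ranges.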
+ """ + + def __init__( + self, + ctx: InputProcessingContext, + metadata: MultiModalProcessingMetadata, + ) -> None: + super().__init__() + + self.ctx = ctx + self.metadata = metadata + + def __call__( + self, + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + return self.apply(prompt, mm_data, mm_processor_kwargs) + + def apply( + self, + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + tokenizer = self.ctx.tokenizer + hf_processor = self.ctx.get_hf_processor() + + processed_inputs = hf_processor( + text=prompt, # type: ignore + **mm_data, + **mm_processor_kwargs, + ) + new_token_ids, = processed_inputs.pop("input_ids").tolist() + mm_kwargs = MultiModalKwargs(processed_inputs) + + mm_placeholders: Mapping[str, List[PlaceholderRange]] = {} + + for modality, orig_inputs in to_multi_format(mm_data).items(): + assert isinstance(orig_inputs, list) + + metadata = self.metadata[modality] + placeholder_replacements = metadata.placeholder_replacements + + modality_placeholders: List[PlaceholderRange] = [] + + for item_idx, orig_item in enumerate(orig_inputs): + for match_text, replace_fn in placeholder_replacements.items(): + candidates = candidate_placeholders(tokenizer, match_text) + get_replacement_ids = partial( + replace_fn, + orig_item, + processed_inputs, + item_idx, + ) + + for match_ids in candidates: + # TODO(youkaichao): Don't update new_token_ids + placeholders = apply_placeholders( + new_token_ids, + match_ids, + get_replacement_ids, + ) + + if placeholders is not None: + modality_placeholders.append(placeholders) + + # yapf: disable + mm_placeholders[modality] = modality_placeholders # type: ignore[index] + # yapf: enable + + return MultiModalInputsV2( + type="multimodal", + prompt=prompt, + prompt_token_ids=new_token_ids, + mm_kwargs=mm_kwargs, + mm_placeholders=mm_placeholders, + ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index b844c9e1c2e89..b992442d3b314 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,13 +1,20 @@ import functools from collections import UserDict -from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Sequence +from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, + Sequence, Type, TypeVar) +import torch.nn as nn +from typing_extensions import TypeAlias + +from vllm.inputs import InputProcessingContext from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer from .audio import AudioPlugin -from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalKwargs, - MultiModalPlugin, MultiModalTokensCalc, NestedTensors) +from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc from .image import ImagePlugin +from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors +from .processing import MultiModalProcessor from .video import VideoPlugin if TYPE_CHECKING: @@ -15,8 +22,18 @@ logger = init_logger(__name__) +N = TypeVar("N", bound=Type[nn.Module]) + +MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext], + MultiModalProcessor] +""" +Constructs a :class:`MultiModalProcessor` instance from the context. + +The processing metadata should be derived from the context. 
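+
+For illustration, a model could be wired up roughly as follows; the factory,
+the metadata helper and the model class are hypothetical names::
+
+    def build_processor(ctx: InputProcessingContext) -> MultiModalProcessor:
+        metadata = build_metadata(ctx.model_config)
+        return MultiModalProcessor(ctx=ctx, metadata=metadata)
+
+    @MULTIMODAL_REGISTRY.register_processor(build_processor)
+    class MyMultiModalModel(nn.Module):
+        ...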
+""" + -class _MultiModalLimits(UserDict): +class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): """ Wraps `_limits_by_model` for a more informative error message when attempting to access a model that does not exist. @@ -45,6 +62,9 @@ def __init__( plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None: self._plugins = {p.get_data_key(): p for p in plugins} + self._processor_factories: Dict[Type[nn.Module], + MultiModalProcessorFactory] = {} + # This is used for non-multimodal models self._disabled_limits_per_plugin = {k: 0 for k in self._plugins} @@ -243,3 +263,59 @@ def get_mm_limits_per_prompt( This should be called after :meth:`init_mm_limits_per_prompt`. """ return self._limits_by_model[model_config] + + def register_processor( + self, + factory: MultiModalProcessorFactory, + ): + """ + Register a multi-modal processor to a model class. + + When the model receives multi-modal data, the provided function is + invoked to transform the data into a dictionary of model inputs. + + See also: + - :ref:`input_processing_pipeline` + - :ref:`enabling_multimodal_inputs` + """ + + def wrapper(model_cls: N) -> N: + if model_cls in self._processor_factories: + logger.warning( + "Model class %s already has an input mapper " + "registered to %s. It is overwritten by the new one.", + model_cls, self) + + self._processor_factories[model_cls] = factory + + return model_cls + + return wrapper + + def has_processor(self, model_config: "ModelConfig") -> bool: + """ + Test whether a multi-modal processor is defined for a specific model. + """ + # Avoid circular import + from vllm.model_executor.model_loader import get_model_architecture + + model_cls, _ = get_model_architecture(model_config) + return model_cls in self._processor_factories + + def create_processor( + self, + model_config: "ModelConfig", + tokenizer: AnyTokenizer, + ) -> MultiModalProcessor: + """ + Create a multi-modal processor for a specific model and tokenizer. 
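+
+        An illustrative call site, guarded by :meth:`has_processor` since
+        this method raises :code:`KeyError` for a model class without a
+        registered processor::
+
+            if MULTIMODAL_REGISTRY.has_processor(model_config):
+                processor = MULTIMODAL_REGISTRY.create_processor(
+                    model_config, tokenizer)
+                mm_inputs = processor(prompt, mm_data, mm_processor_kwargs)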
+ """ + + # Avoid circular import + from vllm.model_executor.model_loader import get_model_architecture + + model_cls, _ = get_model_architecture(model_config) + processor_factory = self._processor_factories[model_cls] + + ctx = InputProcessingContext(model_config, tokenizer) + return processor_factory(ctx) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index bee3c25dbd8dd..40194716bbf94 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -11,9 +11,10 @@ import vllm.envs as envs from vllm.connections import global_http_connection from vllm.logger import init_logger -from vllm.multimodal.base import MultiModalDataDict, PlaceholderRange from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from .inputs import MultiModalDataDict, PlaceholderRange + logger = init_logger(__name__) cached_get_tokenizer = lru_cache(get_tokenizer) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index a518270974f92..ba9bf58a4a20c 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,5 +1,5 @@ from functools import lru_cache -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Optional import numpy as np @@ -9,8 +9,9 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalKwargs +from .base import MultiModalData from .image import ImagePlugin +from .inputs import MultiModalKwargs, VideoItem if TYPE_CHECKING: from vllm.config import ModelConfig @@ -20,17 +21,6 @@ cached_get_video_processor = lru_cache(get_video_processor) cached_get_tokenizer = lru_cache(get_tokenizer) -VideoInput = Union[ - "np.ndarray", # single video input - List["np.ndarray"], - # TODO: support more types - # List[Image.Image], List[List[Image.Image]], - # "torch.Tensor", - # List["torch.Tensor"], - # List[List["np.ndarrray"]], - # List[List["torch.Tensor"]], -] - class VideoPlugin(ImagePlugin): """Plugin for video data.""" @@ -53,13 +43,13 @@ def _get_hf_video_processor( def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[VideoItem], **mm_processor_kwargs, ) -> MultiModalKwargs: model_config = ctx.model_config if isinstance(data, list) and len(data) == 1: - data = data[0] + data = data[0] # type: ignore if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray): video_processor = self._get_hf_video_processor( diff --git a/vllm/sequence.py b/vllm/sequence.py index 1370cb5c4f9d2..3b41d25a2fe42 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -5,25 +5,21 @@ from array import array from collections import defaultdict from dataclasses import dataclass, field -from functools import cached_property, reduce -from typing import (TYPE_CHECKING, Any, Callable, DefaultDict, Dict, List, - Mapping, Optional) +from functools import reduce +from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional from typing import Sequence as GenericSequence from typing import Set, Tuple, Union import msgspec import torch -from typing_extensions import assert_never +from vllm.inputs import SingletonInputs, SingletonInputsAdapter from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import RequestOutputKind, SamplingParams -if TYPE_CHECKING: - from vllm.inputs 
import SingletonInputs - VLLM_TOKEN_ID_ARRAY_TYPE = "l" VLLM_INVALID_TOKEN_ID = -1 @@ -407,14 +403,14 @@ class Sequence: def __init__( self, seq_id: int, - inputs: "SingletonInputs", + inputs: SingletonInputs, block_size: int, eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> None: self.seq_id = seq_id - self.inputs = inputs + self.inputs = SingletonInputsAdapter(inputs) self.block_size = block_size self.eos_token_id = eos_token_id self.lora_request = lora_request @@ -441,59 +437,29 @@ def __init__( def n_blocks(self) -> int: return (self.get_len() + self.block_size - 1) // self.block_size - @cached_property + @property def prompt(self) -> Optional[str]: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("prompt") + return self.inputs.prompt - assert_never(inputs) - - @cached_property + @property def prompt_token_ids(self) -> List[int]: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("prompt_token_ids", []) + return self.inputs.prompt_token_ids - assert_never(inputs) - - @cached_property + @property def prompt_embeds(self) -> Optional[torch.Tensor]: - inputs = self.inputs - - if inputs["type"] == "token": - return None - - assert_never(inputs) + return self.inputs.prompt_embeds - @cached_property + @property def multi_modal_data(self) -> "MultiModalDataDict": - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("multi_modal_data", {}) - - assert_never(inputs) - - @cached_property - def mm_processor_kwargs(self) -> Dict[str, Any]: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("mm_processor_kwargs", {}) - - assert_never(inputs) + return self.inputs.multi_modal_data @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("multi_modal_placeholders", {}) + return self.inputs.multi_modal_placeholders - assert_never(inputs) + @property + def mm_processor_kwargs(self) -> Dict[str, Any]: + return self.inputs.mm_processor_kwargs @property def lora_int_id(self) -> int: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2d7c58cfea13b..09bff9655a882 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -6,6 +6,7 @@ from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import EmbeddingRequestOutput, RequestOutput @@ -321,6 +322,9 @@ async def get_model_config(self) -> ModelConfig: async def get_decoding_config(self): raise ValueError("Not Supported on V1 yet.") + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.processor.input_preprocessor + async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 5b45615a1b85b..4ebfff9584267 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -7,6 +7,7 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import RequestOutput from 
vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest @@ -32,6 +33,7 @@ def __init__( usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: @@ -50,7 +52,7 @@ def __init__( # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, - input_registry) + input_registry, mm_registry) # Detokenizer (converts EngineCoreOutputs --> RequestOutput) self.detokenizer = Detokenizer( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5f13cbf2e4036..5c1577190c75a 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -2,15 +2,17 @@ from typing import Any, Dict, Mapping, Optional, Tuple, Union from vllm.config import LoRAConfig, ModelConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderLLMInputs, InputRegistry, PromptType) +from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, + PromptType, SingletonInputsAdapter) +from vllm.inputs.parse import is_encoder_decoder_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.config import try_get_generation_config -from vllm.transformers_utils.tokenizer_group import AnyTokenizer +from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest @@ -20,8 +22,9 @@ def __init__( self, model_config: ModelConfig, lora_config: Optional[LoRAConfig], - tokenizer: AnyTokenizer, + tokenizer: BaseTokenizerGroup, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): self.model_config = model_config @@ -31,7 +34,8 @@ def __init__( self.generation_config_fields = _load_generation_config_dict( model_config) self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) + self.tokenizer, + mm_registry) self.input_processor = input_registry.create_input_processor( model_config) @@ -73,6 +77,19 @@ def process_inputs( self._validate_model_inputs(processed_inputs) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) + if is_encoder_decoder_inputs(processed_inputs): + decoder_inputs = SingletonInputsAdapter( + processed_inputs["decoder"]) + encoder_inputs = SingletonInputsAdapter( + processed_inputs["encoder"]) + else: + decoder_inputs = SingletonInputsAdapter(processed_inputs) + encoder_inputs = None + + # TODO: Impl encoder-decoder + if encoder_inputs is not None: + raise NotImplementedError + assert isinstance(params, SamplingParams) # TODO: can we avoid cloning here in multiproc case sampling_params = params.clone() @@ -81,27 +98,43 @@ def process_inputs( # Make Request for Detokenizer. 
detokenizer_request = DetokenizerRequest( - request_id, processed_inputs.get("prompt"), - processed_inputs.get("prompt_token_ids"), + request_id, + decoder_inputs.prompt, + decoder_inputs.prompt_token_ids, sampling_params.skip_special_tokens, sampling_params.spaces_between_special_tokens, - sampling_params.output_kind, sampling_params.stop, - sampling_params.include_stop_str_in_output) + sampling_params.output_kind, + sampling_params.stop, + sampling_params.include_stop_str_in_output, + ) # Make Request for EngineCore. engine_core_request = EngineCoreRequest( - request_id, processed_inputs.get("prompt"), - processed_inputs.get("prompt_token_ids"), - processed_inputs.get("multi_modal_data"), - processed_inputs.get("multi_modal_placeholders"), - processed_inputs.get("mm_processor_kwargs"), sampling_params, - eos_token_id, arrival_time, lora_request) + request_id, + decoder_inputs.prompt, + decoder_inputs.prompt_token_ids, + decoder_inputs.multi_modal_data, + decoder_inputs.multi_modal_placeholders, + decoder_inputs.mm_processor_kwargs, + sampling_params, + eos_token_id, + arrival_time, + lora_request, + ) return detokenizer_request, engine_core_request - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, - EncoderDecoderLLMInputs]): - prompt_ids = inputs.get("prompt_token_ids") + def _validate_model_inputs(self, inputs: ProcessorInputs): + if is_encoder_decoder_inputs(inputs): + # For encoder-decoder multimodal models, the max_prompt_len + # restricts the decoder prompt length + prompt_inputs = inputs["decoder" if self.model_config. + is_multimodal_model else "encoder"] + else: + prompt_inputs = inputs + + prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids + if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") @@ -117,6 +150,10 @@ def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, "inputs, the number of image tokens depends on the number " "of images, and possibly their aspect ratios as well.") + # TODO: Find out how many placeholder tokens are there so we can + # check that chunked prefill does not truncate them + # max_batch_len = self.scheduler_config.max_num_batched_tokens + def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: config = try_get_generation_config( diff --git a/vllm/v1/request.py b/vllm/v1/request.py index f35cf738c89bf..51fb4003e5fe0 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,7 +1,7 @@ import enum -from typing import TYPE_CHECKING, List, Optional, Union +from typing import List, Optional, Union -from vllm.inputs.data import DecoderOnlyInputs +from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams @@ -9,23 +9,20 @@ from vllm.v1.engine import EngineCoreRequest from vllm.v1.utils import ConstantList -if TYPE_CHECKING: - from vllm.inputs import DecoderOnlyInputs - class Request: def __init__( self, request_id: str, - inputs: "DecoderOnlyInputs", + inputs: DecoderOnlyInputs, sampling_params: SamplingParams, eos_token_id: Optional[int], arrival_time: float, lora_request: Optional[LoRARequest] = None, ) -> None: self.request_id = request_id - self.inputs = inputs + self.inputs = SingletonInputsAdapter(inputs) self.sampling_params = sampling_params # Because of LoRA, the eos token id can be different for each request. 
self.eos_token_id = eos_token_id @@ -41,17 +38,17 @@ def __init__( assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - self.prompt = inputs.get("prompt") - self.prompt_token_ids = inputs["prompt_token_ids"] + self.prompt = self.inputs.prompt + self.prompt_token_ids = self.inputs.prompt_token_ids self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.num_computed_tokens = 0 # Raw multimodal data before the mm input mapper (e.g., PIL images). - self.mm_data = inputs.get("multi_modal_data") - self.mm_processor_kwargs = inputs.get("mm_processor_kwargs") - mm_positions = inputs.get("multi_modal_placeholders") + self.mm_data = self.inputs.multi_modal_data + self.mm_processor_kwargs = self.inputs.mm_processor_kwargs + mm_positions = self.inputs.multi_modal_placeholders if mm_positions: # FIXME(woosuk): Support other modalities. self.mm_positions = mm_positions.get("image", []) @@ -64,8 +61,7 @@ def __init__( def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": return cls( request_id=request.request_id, - inputs=DecoderOnlyInputs( - type="token", + inputs=token_inputs( prompt_token_ids=request.prompt_token_ids, prompt=request.prompt, multi_modal_data=request.mm_data, @@ -114,7 +110,7 @@ def get_finished_reason(self) -> Union[str, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: - return self.mm_data is not None + return len(self.mm_data) > 0 @property def num_encoder_inputs(self) -> int: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 81480786a09e1..eebd1de96537f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -28,7 +28,7 @@ from vllm.v1.sample.metadata import SamplingMetadata if TYPE_CHECKING: - from vllm.multimodal.base import PlaceholderRange + from vllm.multimodal.inputs import PlaceholderRange from vllm.v1.core.scheduler import SchedulerOutput logger = init_logger(__name__) diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 09c62fbb9875f..d3e1202c15e61 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -148,19 +148,29 @@ def build(self) -> ModelInputForCPU: query_lens=seq_lens, ) - def _compute_multi_modal_input(self, seq_group: SequenceGroupMetadata, - seq_data: SequenceData, computed_len: int, - mm_processor_kwargs: Dict[str, Any]): - + def _compute_multi_modal_input( + self, + seq_data: SequenceData, + computed_len: int, + seq_group_metadata: SequenceGroupMetadata, + ): # NOTE: mm_data only includes the subset of multi-modal items that # intersect with the current prefill positions. mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group, range(computed_len, len(seq_data.get_token_ids()))) + seq_group_metadata, + range(computed_len, len(seq_data.get_token_ids())), + ) if not mm_data: - return + return None, None, None - mm_kwargs = self.multi_modal_input_mapper(mm_data, mm_processor_kwargs) + if self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) # special processing for mrope position deltas. 
mrope_positions = None @@ -202,7 +212,7 @@ def _prepare_prompt( slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -223,11 +233,14 @@ def _prepare_prompt( mrope_positions = None if seq_group_metadata.multi_modal_data: - mm_kwargs, placeholder_maps, mrope_positions = self \ - ._compute_multi_modal_input( - seq_group_metadata, seq_data, computed_len, - seq_group_metadata.mm_processor_kwargs) - multi_model_kwargs_list.append(mm_kwargs) + ( + mm_kwargs, + placeholder_maps, + mrope_positions, + ) = self._compute_multi_modal_input(seq_data, computed_len, + seq_group_metadata) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( placeholder_map) @@ -302,7 +315,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 92d6552b2f428..1ff30d685c6b1 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -716,7 +716,7 @@ def _prepare_prompt( context_lens: List[int] = [] query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -777,7 +777,7 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: mm_kwargs = self.multi_modal_input_mapper(mm_data) - multi_model_kwargs_list.append(mm_kwargs) + multi_modal_kwargs_list.append(mm_kwargs) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -876,7 +876,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps= None # FIXME(kzawora): mutli-modality will not work here ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return PreparePromptMetadata(input_tokens=input_tokens, input_positions=input_positions, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2da02f21f8342..042f9f07eace6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -252,7 +252,7 @@ def __init__( prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. 
- multi_model_kwargs: Optional[MultiModalKwargs] = None, + multi_modal_kwargs: Optional[MultiModalKwargs] = None, multi_modal_placeholder_maps: Optional[Dict[ str, MultiModalPlaceholderMap]] = None, @@ -373,7 +373,7 @@ def __init__( prompt_adapter_prompt_mapping or []) self.prompt_adapter_request = prompt_adapter_request - self.multi_model_kwargs = multi_model_kwargs + self.multi_modal_kwargs = multi_modal_kwargs self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit @@ -661,10 +661,15 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if not mm_data: return - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs) - inter_data.multi_model_kwargs = mm_kwargs + if self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. @@ -938,11 +943,11 @@ def build(self) -> ModelInputForGPU: ) # Multi-modal data. - multi_model_kwargs_list = [ - data.multi_model_kwargs for data in self.inter_data_list - if data.multi_model_kwargs is not None + multi_modal_kwargs_list = [ + data.multi_modal_kwargs for data in self.inter_data_list + if data.multi_modal_kwargs is not None ] - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return self.model_input_cls( input_tokens=input_tokens_tensor, diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 0ed33e435aa2f..ae4eb6ba6eaec 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -67,7 +67,8 @@ def __init__( self.pin_memory = is_pin_memory_available() # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + self.mm_registry = MULTIMODAL_REGISTRY + self.multi_modal_input_mapper = self.mm_registry \ .create_input_mapper(self.model_config) # Lazy initialization. 
@@ -122,7 +123,7 @@ def _prepare_prompt( input_block_ids: List[int] = [] seq_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -144,12 +145,15 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: - # Process multi-modal data - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs, - ) - multi_model_kwargs_list.append(mm_kwargs) + if self.mm_registry.has_processor(self.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) max_seq_len = max(seq_lens) assert max_seq_len > 0 @@ -167,7 +171,7 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, input_block_ids, seq_lens, multi_modal_kwargs) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 378e1e06039b2..6000e5dfe4e30 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -70,7 +70,8 @@ def __init__( ) # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + self.mm_registry = MULTIMODAL_REGISTRY + self.multi_modal_input_mapper = self.mm_registry \ .create_input_mapper(self.model_config) # Lazy initialization. @@ -102,7 +103,7 @@ def _prepare_model_input( seq_lens: List[int] = [] past_lens: List[int] = [] query_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -222,11 +223,15 @@ def _prepare_model_input( mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, positions_range) - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata. 
- mm_processor_kwargs) - multi_model_kwargs_list.append(mm_kwargs) + if self.mm_registry.has_processor(self.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -275,7 +280,7 @@ def _prepare_model_input( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return ModelInput( input_tokens, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index c9e637c057979..e6322e095bbb9 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -160,7 +160,7 @@ def _prepare_prompt( input_positions: List[int] = [] slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -191,8 +191,16 @@ def _prepare_prompt( mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, positions_range) - mm_kwargs = self.runner.multi_modal_input_mapper(mm_data) - multi_model_kwargs_list.append(mm_kwargs) + if self.runner.mm_registry.has_processor( + self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.runner.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -264,7 +272,7 @@ def _prepare_prompt( block_tables=torch.tensor([], device=self.device, dtype=torch.int), ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs)