From 9b663c0c84392d64e3efcdcbfb3b90379a308231 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 21 Nov 2024 22:26:52 -0800 Subject: [PATCH 01/30] rename Signed-off-by: Roger Wang --- vllm/model_executor/models/llava.py | 4 ++-- vllm/model_executor/models/phi3v.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 05c6cc62efcd7..619e9cfefeb93 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -478,7 +478,7 @@ def _process_image_input(self, image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) - def process_mm_inputs(self, **kwargs): + def get_multimodal_embeddings(self, **kwargs): image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -545,7 +545,7 @@ def forward( if intermediate_tensors is not None: inputs_embeds = None elif inputs_embeds is None: - vision_embeddings = self.process_mm_inputs(**kwargs) + vision_embeddings = self.get_multimodal_embeddings(**kwargs) # always pass the input via `inputs_embeds` # to make sure the computation graph is consistent inputs_embeds = self.get_input_embeddings(input_ids, diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 2e583bb08e87a..cc9f960e34c4c 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -676,7 +676,7 @@ def _process_image_input( return image_embeds - def process_mm_inputs(self, **kwargs): + def get_multimodal_embeddings(self, **kwargs): image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None @@ -706,7 +706,7 @@ def forward(self, if intermediate_tensors is not None: inputs_embeds = None elif inputs_embeds is None: - vision_embeddings = self.process_mm_inputs(**kwargs) + vision_embeddings = self.get_multimodal_embeddings(**kwargs) # always pass the input via `inputs_embeds` # to make sure the computation graph is consistent inputs_embeds = self.get_input_embeddings(input_ids, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2cf55cd497659..0b3990e7615b2 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -362,7 +362,8 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # 2. A list (length: num_images) of tensors, each of shape # [feature_size, hidden_size] in case when the feature size is # dynamic depending on input images. - encoder_outputs = self.model.process_mm_inputs(**batched_mm_inputs) + encoder_outputs = self.model.get_multimodal_embeddings( + **batched_mm_inputs) # Cache the encoder outputs. 
for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): From 757dad2b20c2fd4cbf659274c6a7a8e7a52e403f Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 21 Nov 2024 22:33:27 -0800 Subject: [PATCH 02/30] blip2 Signed-off-by: Roger Wang --- vllm/model_executor/models/blip2.py | 61 +++++++++++++++++------------ vllm/model_executor/models/llava.py | 5 ++- vllm/model_executor/models/phi3v.py | 6 ++- 3 files changed, 44 insertions(+), 28 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 7d7639b4a92ce..f3e92d61ceed4 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData @@ -609,6 +610,25 @@ def _process_image_input(self, return self.language_projection(query_output) + def get_multimodal_embeddings(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + BLIP2_IMAGE_TOKEN_ID) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -616,6 +636,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[SamplerOutput, IntermediateTensors]: """Run forward pass for BLIP-2. @@ -648,32 +669,24 @@ def forward( See also: :class:`Blip2ImageInputs` """ + if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - BLIP2_IMAGE_TOKEN_ID) - - input_ids = None - else: - inputs_embeds = None - - hidden_states = self.language_model.model( - input_ids, - positions, - kv_caches, - attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds) + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 619e9cfefeb93..af9b701059f10 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -544,10 +544,11 @@ def forward( """ if intermediate_tensors is not None: inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) input_ids = None diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index cc9f960e34c4c..5d222a9eb9d24 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -703,12 +703,14 @@ def forward(self, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object): + if intermediate_tensors is not None: inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility elif inputs_embeds is None: vision_embeddings = self.get_multimodal_embeddings(**kwargs) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) input_ids = None From 49eb639313516ff6c5efaf0e2f3b861469cf7055 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 21 Nov 2024 23:02:31 -0800 Subject: [PATCH 03/30] chameleon Signed-off-by: Roger Wang --- vllm/model_executor/models/chameleon.py | 59 ++++++++++++++++++------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 8f91abffaea90..caaed04c797df 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -29,6 +29,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, repeat_and_pad_placeholder_tokens) @@ -38,7 +39,7 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers, - maybe_prefix) + maybe_prefix, merge_multimodal_embeddings) # These configs are not part of the model config but the preprocessor # and processor files, so we hardcode them in the model file for now. 
@@ -980,6 +981,30 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values(pixel_values), ) + def get_multimodal_embeddings(self, **kwargs) -> torch.Tensor: + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + assert self.model.vqmodel is not None + image_tokens = self.model.get_image_tokens(image_input["data"].to( + self.config.torch_dtype)) + vision_embeddings = self.language_model.get_input_embeddings( + image_tokens) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.model.vocabulary_mapping.image_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -987,27 +1012,27 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs, ) -> Union[torch.Tensor, IntermediateTensors]: if intermediate_tensors is not None: + inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) input_ids = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - assert self.model.vqmodel is not None - image_tokens = self.model.get_image_tokens( - image_input["data"].to(self.config.torch_dtype)) - image_token_id = self.model.vocabulary_mapping.image_token_id - special_image_mask = input_ids == image_token_id - image_tokens = image_tokens.to(input_ids.device, - input_ids.dtype) - input_ids = input_ids.masked_scatter(special_image_mask, - image_tokens) - - hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + + hidden_states = self.language_model.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits( From 036b1a6be37c0c090418cef86e9056e5af619a19 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 21 Nov 2024 23:10:50 -0800 Subject: [PATCH 04/30] fix Signed-off-by: Roger Wang --- vllm/model_executor/models/chameleon.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index caaed04c797df..b784f4200eab9 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -998,7 +998,7 @@ def get_input_embeddings( vision_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: - inputs_embeds = self.language_model.get_input_embeddings(input_ids) + inputs_embeds = self.model.get_input_embeddings(input_ids) if vision_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( input_ids, inputs_embeds, vision_embeddings, @@ -1027,12 +1027,12 @@ def forward( vision_embeddings) input_ids = None - hidden_states = self.language_model.model(input_ids, - positions, - kv_caches, - attn_metadata, - intermediate_tensors, - inputs_embeds=inputs_embeds) + hidden_states = self.model(input_ids, + 
positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states def compute_logits( From 124e7bc63bc9db9c1408374e471d006f7f2e3119 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 21 Nov 2024 23:12:29 -0800 Subject: [PATCH 05/30] fix Signed-off-by: Roger Wang --- vllm/model_executor/models/chameleon.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index b784f4200eab9..42d1ae3a7dcdc 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -988,8 +988,7 @@ def get_multimodal_embeddings(self, **kwargs) -> torch.Tensor: assert self.model.vqmodel is not None image_tokens = self.model.get_image_tokens(image_input["data"].to( self.config.torch_dtype)) - vision_embeddings = self.language_model.get_input_embeddings( - image_tokens) + vision_embeddings = self.model.get_input_embeddings(image_tokens) return vision_embeddings def get_input_embeddings( From da9551bb3418a94595f197d03c3b2633778d896b Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 21 Nov 2024 23:18:51 -0800 Subject: [PATCH 06/30] fix Signed-off-by: Roger Wang --- vllm/model_executor/models/chameleon.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 42d1ae3a7dcdc..8bc60e7b68783 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -981,13 +981,14 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values(pixel_values), ) - def get_multimodal_embeddings(self, **kwargs) -> torch.Tensor: + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None - if image_input is not None: - assert self.model.vqmodel is not None - image_tokens = self.model.get_image_tokens(image_input["data"].to( - self.config.torch_dtype)) + assert self.model.vqmodel is not None + image_tokens = self.model.get_image_tokens(image_input["data"].to( + self.config.torch_dtype)) vision_embeddings = self.model.get_input_embeddings(image_tokens) return vision_embeddings From beaf5359d552f1a732815eeaaf49370a2d61ff1c Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 21 Nov 2024 23:19:58 -0800 Subject: [PATCH 07/30] typing Signed-off-by: Roger Wang --- vllm/model_executor/models/blip2.py | 2 +- vllm/model_executor/models/chameleon.py | 1 - vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/phi3v.py | 2 +- 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index f3e92d61ceed4..bf7790617e346 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -610,7 +610,7 @@ def _process_image_input(self, return self.language_projection(query_output) - def get_multimodal_embeddings(self, **kwargs): + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 8bc60e7b68783..48c15e0c0bb69 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -985,7 +985,6 @@ def get_multimodal_embeddings(self, **kwargs) 
-> Optional[torch.Tensor]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None - assert self.model.vqmodel is not None image_tokens = self.model.get_image_tokens(image_input["data"].to( self.config.torch_dtype)) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index af9b701059f10..0e71fe09c9cea 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -478,7 +478,7 @@ def _process_image_input(self, image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) - def get_multimodal_embeddings(self, **kwargs): + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 5d222a9eb9d24..7236fb3e0dce8 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -676,7 +676,7 @@ def _process_image_input( return image_embeds - def get_multimodal_embeddings(self, **kwargs): + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: image_input = self._parse_and_validate_image_input(**kwargs) if image_input is None: return None From ab3c7b7e4713856bcedb6e38964187da25e9a38f Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Thu, 21 Nov 2024 23:43:40 -0800 Subject: [PATCH 08/30] glmv Signed-off-by: Roger Wang --- vllm/model_executor/models/chatglm.py | 54 +++++++++++++++++---------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 2ea592aaba9f9..d3fc7e554bb2c 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -33,7 +33,8 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.inputs import MultiModalData, MultiModalKwargs +from vllm.multimodal.inputs import (MultiModalData, MultiModalKwargs, + NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) @@ -532,6 +533,30 @@ def _parse_and_validate_image_input( """) return GLMImagePixelInputs(pixel_values=pixel_values) + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input["pixel_values"] is None: + return None + pixel_values = image_input["pixel_values"].to( + dtype=self.config.torch_dtype) + vision_embeddings = self.vision(pixel_values) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.embedding(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_glm_vision_embeddings( + input_ids=input_ids, + inputs_embeds=inputs_embeds, + vision_embeddings=vision_embeddings, + boi_token_id=self.config.boi_token_id, + eoi_token_id=self.config.eoi_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -539,26 +564,17 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] 
= None, **kwargs: object, ) -> torch.Tensor: - if intermediate_tensors is None: - inputs_embeds = self.embedding(input_ids) - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input["pixel_values"] is not None: - pixel_values = image_input["pixel_values"].to( - dtype=inputs_embeds.dtype) - image_embeds = self.vision(pixel_values) - - boi_token_id = self.config.boi_token_id - eoi_token_id = self.config.eoi_token_id - - inputs_embeds = merge_glm_vision_embeddings( - input_ids=input_ids, - inputs_embeds=inputs_embeds, - vision_embeddings=image_embeds, - boi_token_id=boi_token_id, - eoi_token_id=eoi_token_id) + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + if intermediate_tensors is None and inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None else: inputs_embeds = intermediate_tensors["hidden_states"] From ea3cd5b770175b9e14fd2b46fd0a9375cac597fa Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 22 Nov 2024 00:03:38 -0800 Subject: [PATCH 09/30] interface Signed-off-by: Roger Wang --- vllm/model_executor/models/interfaces.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index dcead65115132..d0adf5cf53b1c 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -30,6 +30,13 @@ class SupportsMultiModal(Protocol): def __init__(self, *, multimodal_config: "MultiModalConfig") -> None: ... + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: + """ + Returns multimodal embeddings generated from multimodal kwargs + to be merged with text embeddings. + """ + ... 
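[Editor's note] The protocol method above pins down only the signature; each converted model supplies its own parsing and encoding behind it. A minimal, self-contained sketch of the pattern the later commits in this series follow — the class and helper names here are illustrative stand-ins, not code from the patch:

```python
from typing import Optional

import torch


class MyVisionLM:
    """Hypothetical model showing the contract of get_multimodal_embeddings."""

    def _parse_and_validate_image_input(self, **kwargs) -> Optional[dict]:
        # Each model validates the shapes/dtypes of its own mm kwargs here.
        pixel_values = kwargs.get("pixel_values")
        if pixel_values is None:
            return None
        return {"data": pixel_values}

    def _process_image_input(self, image_input: dict) -> torch.Tensor:
        # Stand-in for the vision encoder + projector; real models return
        # embeddings in the language model's hidden size.
        return image_input["data"]

    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
        # The contract: return None when the batch carries no multimodal
        # data, otherwise return the encoded multimodal embeddings.
        image_input = self._parse_and_validate_image_input(**kwargs)
        if image_input is None:
            return None
        return self._process_image_input(image_input)
```

This is the shape of every implementation added in the surrounding commits: parse and validate, early-return None, then encode.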
+ # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead From 089762d8f8e4fdb24210b06a56c7a38bbe06dd17 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 22 Nov 2024 19:20:24 +0000 Subject: [PATCH 10/30] fuyu Signed-off-by: Roger Wang --- vllm/model_executor/models/fuyu.py | 42 ++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 7b46907ac83ab..3c128d5a452ce 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -35,6 +35,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -302,6 +303,24 @@ def _process_image_input( vision_embeddings, _ = self.vision_embed_tokens(image_input["data"]) return vision_embeddings + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, _IMAGE_TOKEN_ID) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -309,24 +328,19 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ): if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.embed_tokens( - input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model( input_ids=input_ids, From 4aa78484ca255add7ef9630c8f703a4969f5bc66 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 22 Nov 2024 19:57:25 +0000 Subject: [PATCH 11/30] internvl Signed-off-by: Roger Wang --- vllm/model_executor/models/internvl.py | 45 ++++++++++++++++++-------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 47ac00b6afe9b..78496980f6684 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -26,6 +26,7 @@ InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -641,6 +642,25 @@ def _get_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor: visual_token_mask = None return visual_token_mask + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.img_context_token_id) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -648,26 +668,23 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[SamplerOutput, IntermediateTensors]: + + visual_token_mask = self._get_visual_token_mask(input_ids) if intermediate_tensors is not None: input_ids = None inputs_embeds = None visual_token_mask = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is not None: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.img_context_token_id) - visual_token_mask = self._get_visual_token_mask(input_ids) - input_ids = None - else: - inputs_embeds = None - visual_token_mask = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None forward_kwargs = { "input_ids": input_ids, From 1825fbb8e34c935ac0d3072138798bb509a91d04 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 22 Nov 2024 23:20:54 +0000 Subject: [PATCH 12/30] llava-next Signed-off-by: Roger Wang --- vllm/model_executor/models/llava_next.py | 51 +++++++++++++++--------- vllm/model_executor/models/utils.py | 5 +-- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index abeebb45fc4a7..194d6368c1f0e 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -19,6 +19,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of @@ -565,6 +566,30 @@ def _process_image_input( for i, patch_features_batch in enumerate(patch_embeddings) ] + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + + if vision_embeddings is None: + return self.language_model.get_input_embeddings(input_ids) + + inputs_embeds = embed_multimodal( + input_ids, + self.config.image_token_index, + self.language_model.model.get_input_embeddings, + vision_embeddings, + ) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -572,6 +597,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LlaVA-NeXT. @@ -620,24 +646,14 @@ def forward( """ if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - inputs_embeds = embed_multimodal( - input_ids, - self.config.image_token_index, - self.language_model.model.get_input_embeddings, - lambda _: self._process_image_input(image_input), - ) - else: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. 
+ elif inputs_embeds is None:
+ vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+ inputs_embeds = self.get_input_embeddings(input_ids,
+ vision_embeddings)
+ input_ids = None
hidden_states = self.language_model.model(input_ids,
positions,
kv_caches,
attn_metadata,
intermediate_tensors,
inputs_embeds=inputs_embeds)
-
return hidden_states
def compute_logits(
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 2ab9b19e22068..fb2923db13f12 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -356,8 +356,7 @@ def embed_multimodal(
input_ids: torch.Tensor,
multimodal_token_id: int,
get_text_embeds: Callable[[torch.Tensor], torch.Tensor],
- get_multimodal_embeds: Callable[[torch.Tensor], Union[torch.Tensor,
- List[torch.Tensor]]],
+ multimodal_embeds: Union[torch.Tensor, List[torch.Tensor]],
) -> torch.Tensor:
"""
Embed token IDs and multimodal inputs and combine their embeddings.
@@ -374,8 +373,6 @@ def embed_multimodal(
is_text = ~is_multimodal
text_embeds = get_text_embeds(input_ids[is_text])
- multimodal_embeds = get_multimodal_embeds(input_ids[is_multimodal])
-
merged_embeds = torch.empty(
(input_ids.shape[0], text_embeds.shape[1]),
dtype=text_embeds.dtype,
From e5e3368a4b90ccda43002ae867619ff47e488a83 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sat, 23 Nov 2024 00:37:58 +0000
Subject: [PATCH 13/30] llava-next-video
Signed-off-by: Roger Wang
---
.../model_executor/models/llava_next_video.py | 44 ++++++++++++-------
1 file changed, 29 insertions(+), 15 deletions(-)
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index e2880c76cf43d..ab52828d0f5a7 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -18,6 +18,7 @@
from vllm.model_executor.models.clip import CLIPVisionModel
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import (cached_get_tokenizer,
repeat_and_pad_placeholder_tokens)
from vllm.sequence import IntermediateTensors
@@ -388,6 +389,25 @@ def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs):
raise ValueError(
f"Unsupported type of video input {type(video_pixels)}")
+ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+ video_input = self._parse_and_validate_video_input(**kwargs)
+ if video_input is None:
+ return None
+ vision_embeddings = self._process_video_pixels(video_input)
+ return vision_embeddings
+
+ def get_input_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ vision_embeddings: Optional[NestedTensors] = None,
+ ) -> torch.Tensor:
+ inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+ if vision_embeddings is not None:
+ inputs_embeds = merge_multimodal_embeddings(
+ input_ids, inputs_embeds, vision_embeddings,
+ self.config.video_token_index)
+ return inputs_embeds
+
def forward(
self,
input_ids: torch.Tensor,
@@ -395,6 +415,7 @@ def forward(
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]:
"""Run forward pass for LLaVA-NeXT-Video.
Args:
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
pixel_values_videos: Pixels in each frame for each input video.
""" if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - video_input = self._parse_and_validate_video_input(**kwargs) - if video_input is not None: - video_embeddings = self._process_video_pixels(video_input) - inputs_embeds = self.language_model \ - .model.get_input_embeddings(input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, video_embeddings, - self.config.video_token_index) - - input_ids = None - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, From 42912b8874704611db666de2bc7f1581c0792130 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 23 Nov 2024 03:57:18 +0000 Subject: [PATCH 14/30] llava-ov Signed-off-by: Roger Wang --- vllm/model_executor/models/llava_onevision.py | 71 +++++++++++++------ 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 705ca1e4ab6e6..aa3385f8e3bb1 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) from vllm.sequence import IntermediateTensors @@ -824,6 +825,46 @@ def apply_pooling(self, image_features, stride=2): image_feature = image_feature.view(batch_frames, -1, dim) return image_feature + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: + modalities = self._parse_and_validate_multimodal_inputs(**kwargs) + if not modalities: + return None + + # We append modality representation to each embedding. This is a + # temporary workaround for models that can handle multiple modalities + # at the same time. 
+ multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
+
+ if "images" in modalities:
+ image_input = modalities["images"]
+ vision_embeddings = self._process_image_input(image_input)
+ multimodal_embeddings.append((vision_embeddings, "image"))
+ if "videos" in modalities:
+ video_input = modalities["videos"]
+ video_embeddings = self._process_video_pixels(video_input)
+ multimodal_embeddings.append((video_embeddings, "video"))
+
+ return multimodal_embeddings
+
+ def get_input_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ multimodal_embeddings: Optional[List[Tuple[NestedTensors,
+ str]]] = None,
+ ) -> torch.Tensor:
+ inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+ if multimodal_embeddings is not None:
+ for embeddings, modality in multimodal_embeddings:
+ if modality == "image":
+ inputs_embeds = merge_multimodal_embeddings(
+ input_ids, inputs_embeds, embeddings,
+ self.config.image_token_index)
+ if modality == "video":
+ inputs_embeds = merge_multimodal_embeddings(
+ input_ids, inputs_embeds, embeddings,
+ self.config.video_token_index)
+ return inputs_embeds
+
def forward(
self,
input_ids: torch.Tensor,
@@ -831,6 +872,7 @@ def forward(
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: Optional[IntermediateTensors] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
**kwargs: object,
) -> Union[torch.Tensor, IntermediateTensors]:
"""Run forward pass for LLaVA-Onevision.
Args:
input_ids: Flattened (concatenated) input_ids corresponding to a
batch.
pixel_values_videos: Pixels in each frame for each input video.
"""
if intermediate_tensors is not None:
- input_ids = None
inputs_embeds = None
- else:
- modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
- if modalities:
- inputs_embeds = self.language_model.model.get_input_embeddings(
- input_ids)
- if "images" in modalities:
- image_input = modalities["images"]
- vision_embeddings = self._process_image_input(image_input)
- inputs_embeds = merge_multimodal_embeddings(
- input_ids, inputs_embeds, vision_embeddings,
- self.config.image_token_index)
- if "videos" in modalities:
- video_input = modalities["videos"]
- video_embeddings = self._process_video_pixels(video_input)
- inputs_embeds = merge_multimodal_embeddings(
- input_ids, inputs_embeds, video_embeddings,
- self.config.video_token_index)
- input_ids = None
- else:
- inputs_embeds = None
+
+ # NOTE: In v1, inputs_embeds is always generated at model runner, this
+ # condition is for v0 compatibility.
+ elif inputs_embeds is None:
+ vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+ inputs_embeds = self.get_input_embeddings(input_ids,
+ vision_embeddings)
+ input_ids = None
hidden_states = self.language_model.model(input_ids,
positions,
kv_caches,
attn_metadata,
intermediate_tensors,
inputs_embeds=inputs_embeds)
From d83ba1ea38c2617589207b70a5792f0758b8a6ca Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sat, 23 Nov 2024 07:38:29 +0000
Subject: [PATCH 15/30] molmo
Signed-off-by: Roger Wang
---
vllm/model_executor/models/molmo.py | 88 +++++++++++++++--------------
1 file changed, 46 insertions(+), 42 deletions(-)
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 2528f741864b3..2d7e8013b46b3 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -3,7 +3,7 @@
from array import array
from dataclasses import dataclass
from functools import lru_cache, partial
-from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict, Union
+from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict
import torch
from einops import rearrange
@@ -36,6 +36,7 @@
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.multimodal.inputs import NestedTensors
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.platforms import _Backend
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
@@ -749,6 +750,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
make_empty_intermediate_tensors_factory(
["hidden_states", "residual"], config.hidden_size))
+ def get_input_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ ) -> torch.Tensor:
+ return self.embed_tokens(input_ids)
+
def forward(
self,
input_ids: torch.Tensor,
@@ -1091,19 +1098,16 @@ def _process_image_input(
return image_features
- def _merge_multimodal_embeddings(
- self,
- inputs_embeds: torch.Tensor,
- image_features: torch.Tensor,
- image_input_idx: torch.Tensor,
- seq_len: Union[torch.Tensor, List[torch.Tensor]],
- ) -> torch.Tensor:
+ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+ image_input = self._parse_and_validate_image_input(**kwargs)
+ if image_input is None:
+ return None
+ image_features = self._process_image_input(image_input)
+ image_input_idx = image_input["image_input_idx"]
+ seq_len = image_input["seq_len"]
batch_size, num_image, num_patch = image_features.shape[:3]
assert image_input_idx.shape == (batch_size, num_image, num_patch)
- image_features = image_features.to(inputs_embeds.device)
- seq_len = seq_len.to(inputs_embeds.device)
-
# insert the image feature into the embedding.
image_features = image_features.view(batch_size, num_image * num_patch, -1)
@@ -1123,12 +1127,24 @@ def _merge_multimodal_embeddings(
image_input_idx = image_input_idx + offset.to(image_input_idx.dtype)
image_input_idx = image_input_idx.flatten()[:, None]
mat = image_input_idx == torch.arange(
- seq_len.sum().item(), device=inputs_embeds.device)[None, :]
+ seq_len.sum().item(), device=image_features.device)[None, :]
mat = mat.to(image_features.dtype)
- inputs_embeds = inputs_embeds + torch.einsum('nd,nm->md',
- image_features, mat)
+ # Note: In this original implementation from AI2, the final
+ # vision_embeddings will always be the same length as
+ # the input embeddings, which is not very efficient.
+ # TODO(ywang96): see if this can be optimized.
+ vision_embeddings = torch.einsum('nd,nm->md', image_features, mat) + return vision_embeddings + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings(input_ids) + if vision_embeddings is not None: + inputs_embeds = inputs_embeds + vision_embeddings return inputs_embeds def forward( @@ -1138,39 +1154,27 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> SamplerOutput: + if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - inputs_embeds = self.model.embed_tokens(input_ids) - image_features = self._process_image_input(image_input) - - inputs_embeds = self._merge_multimodal_embeddings( - inputs_embeds, - image_features, - image_input["image_input_idx"], - image_input["seq_len"], - ) - else: - inputs_embeds = self.model.embed_tokens(input_ids) - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None - - hidden_states = self.model( - input_ids=input_ids, - positions=positions, - kv_caches=kv_caches, - attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - ) + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None + + hidden_states = self.model(input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds) return hidden_states From f4b774789e6590a635c1e4cc3088e79545dff6cc Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 23 Nov 2024 07:49:41 +0000 Subject: [PATCH 16/30] rename interface Signed-off-by: Roger Wang --- vllm/model_executor/models/blip2.py | 6 +++--- vllm/model_executor/models/chameleon.py | 6 +++--- vllm/model_executor/models/chatglm.py | 6 +++--- vllm/model_executor/models/fuyu.py | 7 ++++--- vllm/model_executor/models/interfaces.py | 13 +++++++++++++ vllm/model_executor/models/internvl.py | 6 +++--- vllm/model_executor/models/llava.py | 6 +++--- vllm/model_executor/models/llava_next.py | 6 +++--- vllm/model_executor/models/llava_next_video.py | 6 +++--- vllm/model_executor/models/molmo.py | 6 +++--- vllm/model_executor/models/phi3v.py | 6 +++--- 11 files changed, 44 insertions(+), 30 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index bf7790617e346..815b8e8b509d7 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -620,12 +620,12 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if vision_embeddings is not None: + if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, + input_ids, inputs_embeds, 
multimodal_embeddings,
BLIP2_IMAGE_TOKEN_ID)
return inputs_embeds
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 48c15e0c0bb69..24024cdcf1e4a 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -994,13 +994,13 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
def get_input_embeddings(
self,
input_ids: torch.Tensor,
- vision_embeddings: Optional[NestedTensors] = None,
+ multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
inputs_embeds = self.model.get_input_embeddings(input_ids)
- if vision_embeddings is not None:
+ if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
- input_ids, inputs_embeds, vision_embeddings,
+ input_ids, inputs_embeds, multimodal_embeddings,
self.model.vocabulary_mapping.image_token_id)
return inputs_embeds
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index d3fc7e554bb2c..ee739e73bdb30 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -545,14 +545,14 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
def get_input_embeddings(
self,
input_ids: torch.Tensor,
- vision_embeddings: Optional[NestedTensors] = None,
+ multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
inputs_embeds = self.embedding(input_ids)
- if vision_embeddings is not None:
+ if multimodal_embeddings is not None:
inputs_embeds = merge_glm_vision_embeddings(
input_ids=input_ids,
inputs_embeds=inputs_embeds,
- vision_embeddings=vision_embeddings,
+ vision_embeddings=multimodal_embeddings,
boi_token_id=self.config.boi_token_id,
eoi_token_id=self.config.eoi_token_id)
return inputs_embeds
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index 3c128d5a452ce..a3f638e00897c 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -313,12 +313,13 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
def get_input_embeddings(
self,
input_ids: torch.Tensor,
- vision_embeddings: Optional[NestedTensors] = None,
+ multimodal_embeddings: Optional[NestedTensors] = None,
) -> torch.Tensor:
inputs_embeds = self.language_model.get_input_embeddings(input_ids)
- if vision_embeddings is not None:
+ if multimodal_embeddings is not None:
inputs_embeds = merge_multimodal_embeddings(
- input_ids, inputs_embeds, vision_embeddings, _IMAGE_TOKEN_ID)
+ input_ids, inputs_embeds, multimodal_embeddings,
+ _IMAGE_TOKEN_ID)
return inputs_embeds
def forward(
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index d0adf5cf53b1c..b15d3c32ce656 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -9,6 +9,7 @@
if TYPE_CHECKING:
from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig
+ from vllm.multimodal.inputs import NestedTensors
from vllm.sequence import IntermediateTensors
logger = init_logger(__name__)
@@ -37,6 +38,18 @@
def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
"""
...
+ def get_input_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ multimodal_embeddings: Optional["NestedTensors"] = None,
+ ) -> torch.Tensor:
+ """
+ Returns the input embeddings obtained by merging the text
+ embeddings from input_ids with the multimodal embeddings
+ generated from multimodal kwargs.
+ """
+ ...
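[Editor's note] With `get_input_embeddings` added to the protocol, the two interface methods compose: text rows come from the embedding table and placeholder rows are overwritten by the multimodal features. A runnable sketch of that merge, assuming a `merge_multimodal_embeddings`-style scatter keyed on a placeholder token id — the helper and token id below are simplified stand-ins, not the vLLM implementation:

```python
import torch

IMAGE_TOKEN_ID = 32000  # hypothetical placeholder token id


def merge_embeddings(input_ids: torch.Tensor, inputs_embeds: torch.Tensor,
                     mm_embeds: torch.Tensor,
                     placeholder_id: int) -> torch.Tensor:
    # Rows whose token id equals the placeholder receive the multimodal
    # features; the row counts must match because the placeholders were
    # expanded to the feature size during preprocessing.
    mask = input_ids == placeholder_id
    inputs_embeds = inputs_embeds.clone()
    inputs_embeds[mask] = mm_embeds.to(inputs_embeds.dtype)
    return inputs_embeds


# Usage: text rows keep their embeddings, placeholder rows are replaced.
input_ids = torch.tensor([1, IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, 2])
inputs_embeds = torch.zeros(4, 8)  # stand-in text embeddings
mm_embeds = torch.ones(2, 8)       # stand-in vision features
merged = merge_embeddings(input_ids, inputs_embeds, mm_embeds,
                          IMAGE_TOKEN_ID)
assert merged[1].eq(1).all() and merged[0].eq(0).all()
```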
+ # We can't use runtime_checkable with ClassVar for issubclass checks # so we need to treat the class as an instance and use isinstance instead diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 78496980f6684..14776704466fd 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -652,12 +652,12 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if vision_embeddings is not None: + if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, + input_ids, inputs_embeds, multimodal_embeddings, self.img_context_token_id) return inputs_embeds diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 0e71fe09c9cea..c962c86f91fef 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -488,12 +488,12 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if vision_embeddings is not None: + if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, + input_ids, inputs_embeds, multimodal_embeddings, self.config.image_token_index) return inputs_embeds diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 194d6368c1f0e..8c474a0b40d2d 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -576,17 +576,17 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: - if vision_embeddings is None: + if multimodal_embeddings is None: return self.language_model.get_input_embeddings(input_ids) inputs_embeds = embed_multimodal( input_ids, self.config.image_token_index, self.language_model.model.get_input_embeddings, - vision_embeddings, + multimodal_embeddings, ) return inputs_embeds diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index ab52828d0f5a7..a8a5691fb21b8 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -399,12 +399,12 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.language_model.get_input_embeddings(input_ids) - if vision_embeddings is not None: + if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, + input_ids, inputs_embeds, multimodal_embeddings, self.config.video_token_index) return inputs_embeds diff --git 
a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 2d7e8013b46b3..d75e425d8b68f 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -1140,11 +1140,11 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.model.get_input_embeddings(input_ids) - if vision_embeddings is not None: - inputs_embeds = inputs_embeds + vision_embeddings + if multimodal_embeddings is not None: + inputs_embeds = inputs_embeds + multimodal_embeddings return inputs_embeds def forward( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 7236fb3e0dce8..85f56e8224dab 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -686,12 +686,12 @@ def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: def get_input_embeddings( self, input_ids: torch.Tensor, - vision_embeddings: Optional[NestedTensors] = None, + multimodal_embeddings: Optional[NestedTensors] = None, ) -> torch.Tensor: inputs_embeds = self.embed_tokens(input_ids) - if vision_embeddings is not None: + if multimodal_embeddings is not None: inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, + input_ids, inputs_embeds, multimodal_embeddings, self.image_token_id) return inputs_embeds From 4be9ab22e06ae1ee7255e77bf1d663e3e22e3bca Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 23 Nov 2024 09:27:13 +0000 Subject: [PATCH 17/30] paligemma Signed-off-by: Roger Wang --- vllm/model_executor/models/paligemma.py | 52 +++++++++++++++---------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index dd5256eb87ab3..b2b5f6fc707f6 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -13,6 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import NestedTensors from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors @@ -240,36 +241,45 @@ def _process_image_input( return self.multi_modal_projector(image_features) + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa + vision_embeddings = vision_embeddings * (self.config.hidden_size**-0.5) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if multimodal_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, multimodal_embeddings, + self.config.image_token_index) + return inputs_embeds + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, 
intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object) -> Union[SamplerOutput, IntermediateTensors]: if intermediate_tensors is not None: - input_ids = None inputs_embeds = None - else: - parsed_image_input = self._parse_and_validate_image_input(**kwargs) - - if parsed_image_input is not None: - vision_embeddings = self._process_image_input( - parsed_image_input) - # https://github.com/huggingface/transformers/blob/main/src/transformers/models/paligemma/modeling_paligemma.py#L294 # noqa - vision_embeddings = vision_embeddings * ( - self.config.hidden_size**-0.5) - - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - - input_ids = None - else: - inputs_embeds = None + + # NOTE: In v1, inputs_embeds is always generated at model runner, this + # condition is for v0 compatibility. + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, From 20709c6907f1e52fdd2bf20a022eda8a8eaf7492 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Sat, 23 Nov 2024 23:33:16 +0000 Subject: [PATCH 18/30] qwen2vl Signed-off-by: Roger Wang --- vllm/model_executor/models/llava_onevision.py | 4 +- vllm/model_executor/models/qwen2_vl.py | 99 ++++++++++++------- 2 files changed, 67 insertions(+), 36 deletions(-) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index aa3385f8e3bb1..dcf852843b696 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -887,9 +887,9 @@ def forward( # NOTE: In v1, inputs_embeds is always generated at model runner, this # condition is for v0 compatibility. elif inputs_embeds is None: - vision_embeddings = self.get_multimodal_embeddings(**kwargs) + multimodal_embeddings = self.get_multimodal_embeddings(**kwargs) inputs_embeds = self.get_input_embeddings(input_ids, - vision_embeddings) + multimodal_embeddings) input_ids = None hidden_states = self.language_model.model(input_ids, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 531608a877f2f..02501785a4030 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -63,7 +63,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, - MultiModalKwargs) + MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import cached_get_tokenizer from vllm.platforms import _Backend from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData @@ -1238,6 +1238,52 @@ def _merge_multimodal_embeddings( inputs_embeds[mask, :] = multimodal_embeddings return inputs_embeds + def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]: + + image_input = self._parse_and_validate_image_input(**kwargs) + video_input = self._parse_and_validate_video_input(**kwargs) + if image_input is None and video_input is None: + return None + + # We append modality representation to each embedding. This is a + # temporary workaround for models that can handle multiple modalities + # at the same time. 
+        multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
+
+        if image_input is not None:
+            image_embeds = self._process_image_input(image_input)
+            multimodal_embeddings.append((image_embeds, "image"))
+        if video_input is not None:
+            video_embeds = self._process_video_input(video_input)
+            multimodal_embeddings.append((video_embeds, "video"))
+
+        return multimodal_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[List[Tuple[NestedTensors,
+                                                   str]]] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+            for embeddings, modality in multimodal_embeddings:
+                if modality == "image":
+                    inputs_embeds = self._merge_multimodal_embeddings(
+                        input_ids,
+                        inputs_embeds,
+                        embeddings,
+                        placeholder_token_id=self.config.image_token_id,
+                    )
+                if modality == "video":
+                    inputs_embeds = self._merge_multimodal_embeddings(
+                        input_ids,
+                        inputs_embeds,
+                        embeddings,
+                        placeholder_token_id=self.config.video_token_id,
+                    )
+        return inputs_embeds
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -1245,6 +1291,7 @@ def forward(
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs: object,
     ) -> Union[torch.Tensor, IntermediateTensors]:
         """Run forward pass for Qwen2-VL.
@@ -1266,42 +1313,26 @@ def forward(
             video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in LLM.
                 `None` if no videos are passed.
         """
+
         if intermediate_tensors is not None:
-            input_ids = None
             inputs_embeds = None
-        else:
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            video_input = self._parse_and_validate_video_input(**kwargs)
-
-            if image_input is None and video_input is None:
-                inputs_embeds = None
-            else:
-                if uses_mrope(self.config):
-                    assert positions.ndim == 2 and positions.size(0) == 3, (
-                        "multimodal section rotary embedding requires "
-                        f"(3, seq_len) positions, but got {positions.size()}")
-
-                inputs_embeds = self.model.embed_tokens(input_ids)
-
-                if image_input is not None:
-                    image_embeds = self._process_image_input(image_input)
-                    inputs_embeds = self._merge_multimodal_embeddings(
-                        input_ids,
-                        inputs_embeds,
-                        image_embeds,
-                        placeholder_token_id=self.config.image_token_id,
-                    )
-
-                if video_input is not None:
-                    video_embeds = self._process_video_input(video_input)
-                    inputs_embeds = self._merge_multimodal_embeddings(
-                        input_ids,
-                        inputs_embeds,
-                        video_embeds,
-                        placeholder_token_id=self.config.video_token_id,
-                    )
-                input_ids = None
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility.
+        elif inputs_embeds is None:
+            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
+
+            # We need to check for usage of mrope here in case there is
+            # multimodal data.
+            # TODO (ywang96): move this to model runner in V1.
+            if multimodal_embeddings is not None and uses_mrope(self.config):
+                assert positions.ndim == 2 and positions.size(0) == 3, (
+                    "multimodal section rotary embedding requires "
+                    f"(3, seq_len) positions, but got {positions.size()}")
+
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      multimodal_embeddings)
+            input_ids = None
 
         hidden_states = self.model(
             input_ids=input_ids,

From df1494ff3e1afb1e4598194c0c27bfc1a186b885 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sun, 24 Nov 2024 04:55:03 +0000
Subject: [PATCH 19/30] qwen2_audio

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/qwen2_audio.py | 59 ++++++++++++++---------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 0c2374c3c3fc9..44e675f490d21 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -42,10 +42,12 @@
 from vllm.model_executor.models.qwen2 import Qwen2Model
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
+from vllm.multimodal.inputs import NestedTensors
 from vllm.multimodal.utils import consecutive_placeholder_ranges
 from vllm.sequence import IntermediateTensors, SequenceData
 
 from .interfaces import SupportsMultiModal, SupportsPP
+from .utils import merge_multimodal_embeddings
 
 logger = init_logger(__name__)
 
@@ -371,6 +373,25 @@ def _process_audio_input(self,
 
         return masked_audio_features
 
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+        audio_input = self._parse_and_validate_audio_input(**kwargs)
+        if audio_input is None:
+            return None
+        masked_audio_features = self._process_audio_input(audio_input)
+        return masked_audio_features
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[NestedTensors] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                self.config.audio_token_index)
+        return inputs_embeds
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -378,33 +399,27 @@ def forward(
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
         **kwargs: object,
     ) -> Union[torch.Tensor, IntermediateTensors]:
+
         if intermediate_tensors is not None:
-            input_ids = None
             inputs_embeds = None
-        else:
-            audio_input = self._parse_and_validate_audio_input(**kwargs)
-            if audio_input is None:
-                inputs_embeds = None
-            else:
-                inputs_embeds = self.language_model.embed_tokens(input_ids)
-                masked_audio_features = self._process_audio_input(audio_input)
-                # merge llm embeddings and audio features
-                mask = (input_ids == self.config.audio_token_index)
-                inputs_embeds[mask, :] = masked_audio_features
-
-                input_ids = None
-
-        hidden_states = self.language_model(
-            input_ids=input_ids,
-            positions=positions,
-            kv_caches=kv_caches,
-            attn_metadata=attn_metadata,
-            intermediate_tensors=intermediate_tensors,
-            inputs_embeds=inputs_embeds,
-        )
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility.
+        elif inputs_embeds is None:
+            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      multimodal_embeddings)
+            input_ids = None
+
+        hidden_states = self.language_model(input_ids,
+                                            positions,
+                                            kv_caches,
+                                            attn_metadata,
+                                            intermediate_tensors,
+                                            inputs_embeds=inputs_embeds)
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor,

From 45d8e0a5562dfdd2831a7f58232799ce4ac3bcbc Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sun, 24 Nov 2024 05:40:36 +0000
Subject: [PATCH 20/30] ultravox

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/ultravox.py | 72 +++++++++++++++++---------
 1 file changed, 48 insertions(+), 24 deletions(-)

diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 512adbc7db35e..b4062fedc3986 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -449,10 +449,36 @@ def _process_audio_input(
 
         return result
 
-    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+        audio_input = self._parse_and_validate_audio_input(**kwargs)
+        if audio_input is None:
+            return None
+        audio_embeddings = self._process_audio_input(audio_input)
+        return audio_embeddings
+
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional[NestedTensors] = None,
+        attn_metadata: Optional[AttentionMetadata] = None,
+    ) -> torch.Tensor:
+        inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+        if multimodal_embeddings is not None:
+
+            # TODO(ywang96): use merge_multimodal_embeddings after
+            # v0 is deprecated
+            merge_multimodal_embeddings_from_map(
+                inputs_embeds, multimodal_embeddings,
+                attn_metadata.multi_modal_placeholder_index_maps["audio"])
+        return inputs_embeds
+
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
                 kv_caches: List[torch.Tensor],
                 attn_metadata: AttentionMetadata,
-                intermediate_tensors: Optional[torch.Tensor],
+                intermediate_tensors: Optional[torch.Tensor] = None,
+                inputs_embeds: Optional[torch.Tensor] = None,
                 **kwargs) -> Union[torch.Tensor, IntermediateTensors]:
         """Run forward pass for Ultravox
 
@@ -466,30 +492,28 @@ def forward(self,
         Args:
             audio_features: A batch of audio inputs [B, N, 80, M].
         """
+
         if intermediate_tensors is not None:
-            input_ids = None
             inputs_embeds = None
-        else:
-            audio_input = self._parse_and_validate_audio_input(**kwargs)
-            if audio_input is not None:
-                audio_embeddings = self._process_audio_input(audio_input)
-                inputs_embeds = self.language_model.model.get_input_embeddings(
-                    input_ids)
-
-                merge_multimodal_embeddings_from_map(
-                    inputs_embeds, audio_embeddings,
-                    attn_metadata.multi_modal_placeholder_index_maps["audio"])
-                input_ids = None
-            else:
-                inputs_embeds = None
-
-        hidden_states = self.language_model.model(
-            input_ids=input_ids,
-            positions=positions,
-            kv_caches=kv_caches,
-            attn_metadata=attn_metadata,
-            intermediate_tensors=intermediate_tensors,
-            inputs_embeds=inputs_embeds)
+
+        # NOTE: In v1, inputs_embeds is always generated at model runner, this
+        # condition is for v0 compatibility.
+        elif inputs_embeds is None:
+            multimodal_embeddings = self.get_multimodal_embeddings(**kwargs)
+
+            # TODO(ywang96): remove attn_metadata from get_input_embeddings
+            # after v0 is deprecated
+            inputs_embeds = self.get_input_embeddings(input_ids,
+                                                      multimodal_embeddings,
+                                                      attn_metadata)
+            input_ids = None
+
+        hidden_states = self.language_model.model(input_ids,
+                                                  positions,
+                                                  kv_caches,
+                                                  attn_metadata,
+                                                  intermediate_tensors,
+                                                  inputs_embeds=inputs_embeds)
         return hidden_states
 
     def compute_logits(self, hidden_states: torch.Tensor,

From 56b31e7ed985510abdb5afcffb62c6217a38a9f6 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sun, 24 Nov 2024 06:09:23 +0000
Subject: [PATCH 21/30] typing

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/blip2.py            |  2 +-
 vllm/model_executor/models/chameleon.py        |  2 +-
 vllm/model_executor/models/chatglm.py          |  2 +-
 vllm/model_executor/models/fuyu.py             |  2 +-
 vllm/model_executor/models/interfaces.py       | 17 +++++++++++++++--
 vllm/model_executor/models/internvl.py         |  2 +-
 vllm/model_executor/models/llava.py            |  2 +-
 vllm/model_executor/models/llava_next.py       |  2 +-
 vllm/model_executor/models/llava_next_video.py |  2 +-
 vllm/model_executor/models/llava_onevision.py  |  3 ++-
 vllm/model_executor/models/molmo.py            |  2 +-
 vllm/model_executor/models/paligemma.py        |  2 +-
 vllm/model_executor/models/phi3v.py            |  2 +-
 vllm/model_executor/models/qwen2_audio.py      |  2 +-
 vllm/model_executor/models/qwen2_vl.py         |  3 ++-
 vllm/model_executor/models/ultravox.py         |  2 +-
 16 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 815b8e8b509d7..d2592016aff34 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -610,7 +610,7 @@ def _process_image_input(self,
 
         return self.language_projection(query_output)
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py
index 24024cdcf1e4a..e1a8e915afeb7 100644
--- a/vllm/model_executor/models/chameleon.py
+++ b/vllm/model_executor/models/chameleon.py
@@ -981,7 +981,7 @@ def _parse_and_validate_image_input(
             data=self._validate_pixel_values(pixel_values),
         )
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index ee739e73bdb30..3c301fb3a355d 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -533,7 +533,7 @@ def _parse_and_validate_image_input(
             """)
         return GLMImagePixelInputs(pixel_values=pixel_values)
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input["pixel_values"] is None:
             return None
diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py
index a3f638e00897c..6e86900326c4b 100644
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -303,7 +303,7 @@ def _process_image_input(
         vision_embeddings, _ = self.vision_embed_tokens(image_input["data"])
         return vision_embeddings
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index b15d3c32ce656..7aa9f23593178 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,5 +1,5 @@
 from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
-                    Protocol, Type, Union, overload, runtime_checkable)
+                    Protocol, Tuple, Type, Union, overload, runtime_checkable)
 
 import torch
 from typing_extensions import TypeIs
@@ -8,6 +8,7 @@
 from vllm.utils import supports_kw
 
 if TYPE_CHECKING:
+    from vllm.attention import AttentionMetadata
     from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig
     from vllm.multimodal.inputs import NestedTensors
     from vllm.sequence import IntermediateTensors
@@ -31,13 +32,25 @@ class SupportsMultiModal(Protocol):
     def __init__(self, *, multimodal_config: "MultiModalConfig") -> None:
         ...
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Optional[Union["NestedTensors", List[Tuple["NestedTensors", str]]]]:
         """
         Returns multimodal embeddings generated from multimodal kwargs
         to be merged with text embeddings.
         """
         ...
 
+    # Only for models that support v0 chunked prefill
+    @overload
+    def get_input_embeddings(
+        self,
+        input_ids: torch.Tensor,
+        multimodal_embeddings: Optional["NestedTensors"] = None,
+        attn_metadata: Optional["AttentionMetadata"] = None,
+    ) -> torch.Tensor:
+        ...
+
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 14776704466fd..867df56965b66 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -642,7 +642,7 @@ def _get_visual_token_mask(self, input_ids: torch.Tensor) -> torch.Tensor:
             visual_token_mask = None
         return visual_token_mask
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index c962c86f91fef..e7757b3c7d405 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -478,7 +478,7 @@ def _process_image_input(self,
         image_features = self._process_image_pixels(image_input)
         return self.multi_modal_projector(image_features)
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 8c474a0b40d2d..e113f5862830d 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -566,7 +566,7 @@ def _process_image_input(
             for i, patch_features_batch in enumerate(patch_embeddings)
         ]
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py
index a8a5691fb21b8..b130791808924 100644
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -389,7 +389,7 @@ def _process_video_pixels(self, inputs: LlavaNextVideoPixelInputs):
         raise ValueError(
             f"Unsupported type of video input {type(video_pixels)}")
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         video_input = self._parse_and_validate_video_input(**kwargs)
         if video_input is None:
             return None
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index dcf852843b696..138f5135f88c8 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -825,7 +825,8 @@ def apply_pooling(self, image_features, stride=2):
         image_feature = image_feature.view(batch_frames, -1, dim)
         return image_feature
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(
+            self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
         modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
         if not modalities:
             return None
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index d75e425d8b68f..3abc23cf17fcf 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -1098,7 +1098,7 @@ def _process_image_input(
 
         return image_features
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index b2b5f6fc707f6..2e5b6bee784e7 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -241,7 +241,7 @@ def _process_image_input(
 
         return self.multi_modal_projector(image_features)
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 85f56e8224dab..4cb874a13e0c1 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -676,7 +676,7 @@ def _process_image_input(
 
         return image_embeds
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         image_input = self._parse_and_validate_image_input(**kwargs)
         if image_input is None:
             return None
diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py
index 44e675f490d21..a0605fee82aca 100644
--- a/vllm/model_executor/models/qwen2_audio.py
+++ b/vllm/model_executor/models/qwen2_audio.py
@@ -373,7 +373,7 @@ def _process_audio_input(self,
 
         return masked_audio_features
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         if audio_input is None:
             return None
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 02501785a4030..282764cdb8bf0 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1238,7 +1238,8 @@ def _merge_multimodal_embeddings(
         inputs_embeds[mask, :] = multimodal_embeddings
         return inputs_embeds
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(
+            self, **kwargs) -> Optional[List[Tuple[NestedTensors, str]]]:
 
         image_input = self._parse_and_validate_image_input(**kwargs)
         video_input = self._parse_and_validate_video_input(**kwargs)
diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index b4062fedc3986..b61deccde45b7 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -449,7 +449,7 @@ def _process_audio_input(
 
         return result
 
-    def get_multimodal_embeddings(self, **kwargs) -> Optional[torch.Tensor]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
         audio_input = self._parse_and_validate_audio_input(**kwargs)
         if audio_input is None:
             return None

From 1de5b2b174717b6eb116e2d4bab9e93f51898ba6 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Sat, 23 Nov 2024 22:11:54 -0800
Subject: [PATCH 22/30] comment

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/interfaces.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 7aa9f23593178..b05f55b4effd7 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -42,6 +42,7 @@ def get_multimodal_embeddings(
         ...
 
     # Only for models that support v0 chunked prefill
+    # TODO(ywang96): Remove this overload once v0 is deprecated
    @overload
     def get_input_embeddings(
         self,

From 4fd91d06e5b38255d86683ee8adcfe33c93a664b Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Tue, 26 Nov 2024 07:05:15 +0000
Subject: [PATCH 23/30] update interface changes

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/interfaces.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index ab6c5f5e63d6d..3189e7f2719f5 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -11,7 +11,6 @@
 
 if TYPE_CHECKING:
     from vllm.attention import AttentionMetadata
-    from vllm.config import LoRAConfig, MultiModalConfig, SchedulerConfig
     from vllm.multimodal.inputs import NestedTensors
     from vllm.sequence import IntermediateTensors
 
@@ -31,9 +30,6 @@ class SupportsMultiModal(Protocol):
     MRO of your model class.
     """
 
-    def __init__(self, *, multimodal_config: "MultiModalConfig") -> None:
-        ...
-
     def get_multimodal_embeddings(
         self, **kwargs
     ) -> Optional[Union["NestedTensors", List[Tuple["NestedTensors", str]]]]:

From f8e76b8283ddd3a82314da8cf730d50b375bc8a1 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Tue, 26 Nov 2024 08:38:01 +0000
Subject: [PATCH 24/30] fix internvl

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/internvl.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 867df56965b66..b1c0065afbf30 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -656,6 +656,7 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
+            assert self.img_context_token_id is not None
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, multimodal_embeddings,
                 self.img_context_token_id)
@@ -672,11 +673,10 @@ def forward(
         **kwargs: object,
     ) -> Union[SamplerOutput, IntermediateTensors]:
 
-        visual_token_mask = self._get_visual_token_mask(input_ids)
+        visual_token_mask = None
         if intermediate_tensors is not None:
             input_ids = None
             inputs_embeds = None
-            visual_token_mask = None
 
         # NOTE: In v1, inputs_embeds is always generated at model runner, this
         # condition is for v0 compatibility.
@@ -694,6 +694,13 @@ def forward(
             "intermediate_tensors": intermediate_tensors,
             "inputs_embeds": inputs_embeds,
         }
+        if self.img_context_token_id is not None:
+            visual_token_mask = self._get_visual_token_mask(input_ids)
+
+            # We always overwrite it back to None after computing visual token
+            # mask so that this doesn't need to depend on encoder output
+            self.img_context_token_id = None
+
         if self.is_mono:
             forward_kwargs.update({"visual_token_mask": visual_token_mask})
 

From e6a0f53fc598503661585289363611e3fdeb15de Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Tue, 26 Nov 2024 08:46:12 +0000
Subject: [PATCH 25/30] generic typing

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/interfaces.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 3189e7f2719f5..a339df83b64fd 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,5 +1,6 @@
 from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
-                    Protocol, Tuple, Type, Union, overload, runtime_checkable)
+                    Protocol, Type, TypeVar, Union, overload,
+                    runtime_checkable)
 
 import torch
 from typing_extensions import TypeIs
@@ -11,11 +12,12 @@
 
 if TYPE_CHECKING:
     from vllm.attention import AttentionMetadata
-    from vllm.multimodal.inputs import NestedTensors
     from vllm.sequence import IntermediateTensors
 
 logger = init_logger(__name__)
 
+T = TypeVar("T")
+
 
 @runtime_checkable
 class SupportsMultiModal(Protocol):
@@ -30,9 +32,7 @@ class SupportsMultiModal(Protocol):
     MRO of your model class.
     """
 
-    def get_multimodal_embeddings(
-        self, **kwargs
-    ) -> Optional[Union["NestedTensors", List[Tuple["NestedTensors", str]]]]:
+    def get_multimodal_embeddings(self, **kwargs) -> Optional[T]:
         """
         Returns multimodal embeddings generated from multimodal kwargs
         to be merged with text embeddings.
@@ -45,7 +45,7 @@ def get_multimodal_embeddings(
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional["NestedTensors"] = None,
+        multimodal_embeddings: Optional[T] = None,
         attn_metadata: Optional["AttentionMetadata"] = None,
     ) -> torch.Tensor:
         ...
@@ -53,7 +53,7 @@
     def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
-        multimodal_embeddings: Optional["NestedTensors"] = None,
+        multimodal_embeddings: Optional[T] = None,
     ) -> torch.Tensor:
         """
         Returns the input embeddings merged from the text embeddings from

From 59407eb382532290f23a04a8dae43b151a0359a1 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Tue, 26 Nov 2024 16:26:54 +0000
Subject: [PATCH 26/30] using typing extensions

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/interfaces.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index a339df83b64fd..d50f22a17bd92 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -1,9 +1,8 @@
 from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional,
-                    Protocol, Type, TypeVar, Union, overload,
-                    runtime_checkable)
+                    Protocol, Type, Union, overload, runtime_checkable)
 
 import torch
-from typing_extensions import TypeIs
+from typing_extensions import TypeIs, TypeVar
 
 from vllm.logger import init_logger
 from vllm.utils import supports_kw
@@ -12,11 +11,12 @@
 
 if TYPE_CHECKING:
     from vllm.attention import AttentionMetadata
+    from vllm.multimodal.inputs import NestedTensors
     from vllm.sequence import IntermediateTensors
 
 logger = init_logger(__name__)
 
-T = TypeVar("T")
+T = TypeVar("T", default=NestedTensors)
 
 
 @runtime_checkable

From e4f23e4646c3300bda2f67bb30d9c9512de058fe Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 27 Nov 2024 00:51:45 +0800
Subject: [PATCH 27/30] Fix fake import

---
 vllm/model_executor/models/interfaces.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index d50f22a17bd92..f81b8249d5908 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -16,7 +16,7 @@
 
 logger = init_logger(__name__)
 
-T = TypeVar("T", default=NestedTensors)
+T = TypeVar("T", default="NestedTensors")
 
 
 @runtime_checkable

From bdf5a4f88dd3a47fe58036ed181483a346034ba5 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Tue, 26 Nov 2024 16:54:40 +0000
Subject: [PATCH 28/30] add TODO for mixed-modality

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/llava_onevision.py | 8 +++++---
 vllm/model_executor/models/qwen2_vl.py        | 8 +++++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 138f5135f88c8..3166737d61582 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -831,9 +831,11 @@ def get_multimodal_embeddings(
         if not modalities:
             return None
 
-        # We append modality representation to each embedding. This is a
-        # temporary workaround for models that can handle multiple modalities
-        # at the same time.
+        # We make a tuple of each embedding with its modality string. This is a
+        # temporary workaround for models to handle mixed modalities when
+        # get_multimodal_embeddings and get_input_embeddings are called
+        # separately.
+        # TODO(ywang96): Add support for mixed-modality inference for v1.
         multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
 
         if "images" in modalities:
diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index 282764cdb8bf0..7956a98b21569 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -1246,9 +1246,11 @@ def get_multimodal_embeddings(
         if image_input is None and video_input is None:
             return None
 
-        # We append modality representation to each embedding. This is a
-        # temporary workaround for models that can handle multiple modalities
-        # at the same time.
+        # We make a tuple of each embedding with its modality string. This is a
+        # temporary workaround for models to handle mixed modalities when
+        # get_multimodal_embeddings and get_input_embeddings are called
+        # separately.
+        # TODO(ywang96): Add support for mixed-modality inference for v1.
         multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
 
         if image_input is not None:

From c2f29f72907a88f37fa4f9a3dbea04f42c125bf7 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Tue, 26 Nov 2024 09:06:40 -0800
Subject: [PATCH 29/30] ignore f401

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/interfaces.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index f81b8249d5908..b9754b7aa825f 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -11,7 +11,7 @@
 
 if TYPE_CHECKING:
     from vllm.attention import AttentionMetadata
-    from vllm.multimodal.inputs import NestedTensors
+    from vllm.multimodal.inputs import NestedTensors # noqa: F401
     from vllm.sequence import IntermediateTensors
 
 logger = init_logger(__name__)

From aa9f804c31e086429886f5cbb4a6302d010a06e9 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Tue, 26 Nov 2024 17:09:28 +0000
Subject: [PATCH 30/30] format

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/interfaces.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index b9754b7aa825f..1545ce332309f 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -11,7 +11,7 @@
 
 if TYPE_CHECKING:
     from vllm.attention import AttentionMetadata
-    from vllm.multimodal.inputs import NestedTensors # noqa: F401
+    from vllm.multimodal.inputs import NestedTensors  # noqa: F401
     from vllm.sequence import IntermediateTensors
 
 logger = init_logger(__name__)
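
The later patches in this series settle on a single contract for multimodal models: get_multimodal_embeddings(**kwargs) parses the modality-specific kwargs and returns encoder outputs (or None for a text-only batch), while get_input_embeddings(input_ids, multimodal_embeddings) embeds the token ids and scatters the encoder outputs into the positions holding the placeholder token. The sketch below is illustrative only and not code from the patches: ToyMultiModalLM, IMAGE_TOKEN_ID, the random stand-in encoder, and the local merge_multimodal_embeddings are all hypothetical stand-ins; only the two method names, their signatures, and the mask-based merge pattern are taken from the diffs above.

from typing import Optional

import torch
import torch.nn as nn

IMAGE_TOKEN_ID = 32000  # hypothetical placeholder token id


def merge_multimodal_embeddings(input_ids, inputs_embeds,
                                multimodal_embeddings,
                                placeholder_token_id):
    # Overwrite the rows of inputs_embeds whose token is the placeholder
    # with the encoder outputs, in order; this mirrors the mask-based
    # merge used by the models in the diffs above.
    mask = input_ids == placeholder_token_id
    inputs_embeds[mask, :] = multimodal_embeddings.to(inputs_embeds.dtype)
    return inputs_embeds


class ToyMultiModalLM(nn.Module):

    def __init__(self, vocab_size: int = 32001, hidden_size: int = 16):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.hidden_size = hidden_size

    def get_multimodal_embeddings(
            self,
            pixel_values: Optional[torch.Tensor] = None
    ) -> Optional[torch.Tensor]:
        # Return None for text-only batches so the caller can skip the merge.
        if pixel_values is None:
            return None
        # Stand-in for a real vision encoder: one embedding per image.
        return torch.randn(pixel_values.shape[0], self.hidden_size)

    def get_input_embeddings(
        self,
        input_ids: torch.Tensor,
        multimodal_embeddings: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        inputs_embeds = self.embed_tokens(input_ids)
        if multimodal_embeddings is not None:
            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, multimodal_embeddings,
                IMAGE_TOKEN_ID)
        return inputs_embeds


with torch.no_grad():
    model = ToyMultiModalLM()
    ids = torch.tensor([1, IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, 2])
    mm = model.get_multimodal_embeddings(pixel_values=torch.randn(2, 3, 8, 8))
    embeds = model.get_input_embeddings(ids, mm)
    print(embeds.shape)  # torch.Size([4, 16])

As the NOTE comments added throughout the series state, v1 always builds inputs_embeds at the model runner, so a model only invokes this pair itself through the `elif inputs_embeds is None:` branch kept for v0 compatibility.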