[0/N] Rename MultiModalInputs to MultiModalKwargs (vllm-project#10040)

Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: Maxime Fournioux <[email protected]>
DarkLight1337 authored and mfournioux committed Nov 20, 2024
1 parent 74d53e2 commit c8d902e
Showing 32 changed files with 151 additions and 121 deletions.
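
For code that imported the old name, the diff shown here is a pure rename of the mapper-output container; construction and dict-style access are unchanged. A minimal before/after sketch (illustrative only, assuming vLLM at this commit is installed):

import torch

# was: from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs

# Construction and lookup stay exactly as before the rename.
pixel_values = torch.rand(1, 3, 224, 224)  # dummy image tensor for illustration
mm_data = MultiModalKwargs({"pixel_values": pixel_values})
assert "pixel_values" in mm_data
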
2 changes: 1 addition & 1 deletion docs/source/design/multimodal/multimodal_index.rst
@@ -53,7 +53,7 @@ Base Classes

.. autodata:: vllm.multimodal.MultiModalDataDict

.. autoclass:: vllm.multimodal.MultiModalInputs
.. autoclass:: vllm.multimodal.MultiModalKwargs
:members:
:show-inheritance:

@@ -6,7 +6,7 @@
from PIL.Image import Image

from vllm.inputs import InputContext, token_inputs
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer

from .....conftest import IMAGE_ASSETS
@@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
# Ensure that we get the appropriately shaped pixel_values
# for images and image embeddings, respectively.
assert isinstance(mapped_img_data, MultiModalInputs)
assert isinstance(mapped_img_data, MultiModalKwargs)
assert "pixel_values" in mapped_img_data
assert mapped_img_data["pixel_values"].shape == expected_shape

22 changes: 11 additions & 11 deletions tests/multimodal/test_base.py
@@ -1,6 +1,6 @@
import torch

from vllm.multimodal.base import MultiModalInputs, NestedTensors
from vllm.multimodal.base import MultiModalKwargs, NestedTensors


def assert_nested_tensors_equal(expected: NestedTensors,
@@ -13,40 +13,40 @@ def assert_nested_tensors_equal(expected: NestedTensors,
assert_nested_tensors_equal(expected_item, actual_item)


def assert_multimodal_inputs_equal(expected: MultiModalInputs,
actual: MultiModalInputs):
def assert_multimodal_inputs_equal(expected: MultiModalKwargs,
actual: MultiModalKwargs):
assert set(expected.keys()) == set(actual.keys())
for key in expected:
assert_nested_tensors_equal(expected[key], actual[key])


def test_multimodal_input_batch_single_tensor():
t = torch.rand([1, 2])
result = MultiModalInputs.batch([{"image": t}])
result = MultiModalKwargs.batch([{"image": t}])
assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)})


def test_multimodal_input_batch_multiple_tensors():
a = torch.rand([1, 1, 2])
b = torch.rand([1, 1, 2])
c = torch.rand([1, 1, 2])
result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}])
result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])})


def test_multimodal_input_batch_multiple_heterogeneous_tensors():
a = torch.rand([1, 2, 2])
b = torch.rand([1, 3, 2])
c = torch.rand([1, 4, 2])
result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}])
result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}])
assert_multimodal_inputs_equal(result, {"image": [a, b, c]})


def test_multimodal_input_batch_nested_tensors():
a = torch.rand([2, 3])
b = torch.rand([2, 3])
c = torch.rand([2, 3])
result = MultiModalInputs.batch([{
result = MultiModalKwargs.batch([{
"image": [a]
}, {
"image": [b]
@@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists():
a = torch.rand([1, 2, 3])
b = torch.rand([1, 2, 3])
c = torch.rand([1, 2, 3])
result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}])
result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
assert_multimodal_inputs_equal(
result,
{"image": [torch.stack([a, b]), c.unsqueeze(0)]})
@@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists():
b = torch.rand([1, 2, 3])
c = torch.rand([1, 2, 3])
d = torch.rand([1, 2, 3])
result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c, d]}])
result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}])
assert_multimodal_inputs_equal(
result,
{"image": torch.stack([torch.stack([a, b]),
@@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths():
b = torch.rand([1, 3, 3])
c = torch.rand([1, 4, 3])

result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}])
result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}])
assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]})

result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}])
result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}])
assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]})
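
The batching behaviour exercised above is unchanged by the rename: tensors of the same shape under a key are stacked into a single batched tensor, while mismatched shapes fall back to a list. A small usage sketch of MultiModalKwargs.batch (illustrative, assuming vLLM at this commit is installed):

import torch

from vllm.multimodal.base import MultiModalKwargs

# Same shapes: batch() stacks into one (batch, ...) tensor.
a, b = torch.rand(1, 2), torch.rand(1, 2)
stacked = MultiModalKwargs.batch([{"image": a}, {"image": b}])
assert stacked["image"].shape == (2, 1, 2)

# Mismatched shapes: batch() keeps the per-item tensors in a list.
c, d = torch.rand(1, 2, 2), torch.rand(1, 3, 2)
ragged = MultiModalKwargs.batch([{"image": c}, {"image": d}])
assert isinstance(ragged["image"], list)
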
4 changes: 2 additions & 2 deletions vllm/model_executor/models/chatglm.py
@@ -30,7 +30,7 @@
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.base import MultiModalData
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
@@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv(
raise
pixel_values = raw_batch_data['images']

return MultiModalInputs({'pixel_values': pixel_values})
return MultiModalKwargs({'pixel_values': pixel_values})


def merge_glm_vision_embeddings(
4 changes: 2 additions & 2 deletions vllm/model_executor/models/fuyu.py
@@ -34,7 +34,7 @@
from vllm.model_executor.models.persimmon import PersimmonForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges)
@@ -218,7 +218,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object):
])

# image has been processed with prompt in input processor
return MultiModalInputs({"pixel_values": data})
return MultiModalKwargs({"pixel_values": data})


@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)
10 changes: 5 additions & 5 deletions vllm/model_executor/models/h2ovl.py
@@ -16,7 +16,7 @@
token_inputs)
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.utils import is_list_of

@@ -324,12 +324,12 @@ def input_mapper(
data: object,
*,
max_dynamic_patch: Optional[int] = None,
) -> MultiModalInputs:
) -> MultiModalKwargs:

# NOTE: Preprocessing for the image data is done in the
# 'input_processor' function during actual inference.
if isinstance(data, dict):
return MultiModalInputs(data)
return MultiModalKwargs(data)

# The section below is only used with dummy data during
# memory profiling.
@@ -347,7 +347,7 @@ def input_mapper(
pixel_values = [image_pixel_values_mapper(img) for img in data]

else:
return MultiModalInputs({"image_embeds": data})
return MultiModalKwargs({"image_embeds": data})
model_config = ctx.model_config
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
@@ -359,7 +359,7 @@ def input_mapper(
return_tensors="pt",
)[0]

return MultiModalInputs({
return MultiModalKwargs({
"pixel_values": pixel_values,
"image_token_id": image_token_id
})
4 changes: 2 additions & 2 deletions vllm/model_executor/models/idefics3.py
@@ -36,7 +36,7 @@
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.transformers_utils.processor import cached_get_processor
@@ -127,7 +127,7 @@ def input_mapper_for_idefics3(
logger.error("Failed to process image (%s)", data)
raise

return MultiModalInputs(batch_data)
return MultiModalKwargs(batch_data)


def _resize_output_size(height: int,
6 changes: 3 additions & 3 deletions vllm/model_executor/models/internvl.py
@@ -26,7 +26,7 @@
InternVisionPatchModel)
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of
@@ -346,7 +346,7 @@ def input_mapper(
# we can't stack here because images may have different num_patches
data = [image_pixel_values_mapper(img) for img in data]
else:
return MultiModalInputs({"image_embeds": data})
return MultiModalKwargs({"image_embeds": data})
model_config = ctx.model_config
tokenizer = cached_get_tokenizer(
model_config.tokenizer,
@@ -355,7 +355,7 @@ def input_mapper(
add_special_tokens=False,
return_tensors="pt")[0]

return MultiModalInputs({
return MultiModalKwargs({
"pixel_values": data,
"image_token_id": image_token_id
})
4 changes: 2 additions & 2 deletions vllm/model_executor/models/minicpmv.py
@@ -52,7 +52,7 @@
from vllm.model_executor.models.utils import LLMWrapper
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.image import cached_get_image_processor
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors, SequenceData
@@ -374,7 +374,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object):
batch_data["slice_start_id"] = data[0]["slice_start_id"]
batch_data["slice_end_id"] = data[0]["slice_end_id"]

return MultiModalInputs(batch_data)
return MultiModalKwargs(batch_data)


class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP):
2 changes: 1 addition & 1 deletion vllm/model_executor/models/mllama.py
@@ -1162,7 +1162,7 @@ def sample(

def _parse_and_validate_image_input(self, **kwargs: object):
# tensor with the same shape will be batched together by
# MultiModalInputs.batch, so pixel_values here can be:
# MultiModalKwargs.batch, so pixel_values here can be:
# - List[List[torch.Tensor]]:
# with shape (num_tiles, 3, image_res, image_res)
# - List[torch.Tensor]:
4 changes: 2 additions & 2 deletions vllm/model_executor/models/molmo.py
@@ -37,7 +37,7 @@
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors,
SequenceData)
@@ -866,7 +866,7 @@ def image_input_mapper_for_molmo(
ctx: InputContext,
data: object,
):
return MultiModalInputs(data)
return MultiModalKwargs(data)


def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
10 changes: 5 additions & 5 deletions vllm/model_executor/models/pixtral.py
@@ -30,7 +30,7 @@
from vllm.model_executor.models.utils import merge_multimodal_embeddings
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import (cached_get_tokenizer,
consecutive_placeholder_ranges)
from vllm.sequence import IntermediateTensors, SequenceData
@@ -94,16 +94,16 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int,


def input_mapper_for_pixtral(ctx: InputContext,
data: object) -> MultiModalInputs:
"""Maps the input data to its MultiModalInputs (if any).
data: object) -> MultiModalKwargs:
"""Maps the input data to its MultiModalKwargs (if any).
Args:
ctx: Context of the loaded model.
data: data potentially containing image/image embeddings to be mapped
to pixel_values in .forward() for a visual QWenLMHeadModel model.
Returns:
MultiModalInputs containing the stacked normalized images tensor or
MultiModalKwargs containing the stacked normalized images tensor or
image embeddings.
"""
# Early exit if we have provided an image to a language only Qwen model
@@ -121,7 +121,7 @@ def input_mapper_for_pixtral(ctx: InputContext,
dtype=torch.float16)
images.append(image)

return MultiModalInputs({"images": images})
return MultiModalKwargs({"images": images})


def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs):
12 changes: 6 additions & 6 deletions vllm/model_executor/models/qwen.py
@@ -43,7 +43,7 @@
from vllm.model_executor.models.module_mapping import MultiModelKeys
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.base import MultiModalInputs
from vllm.multimodal.base import MultiModalKwargs
from vllm.multimodal.utils import cached_get_tokenizer
from vllm.sequence import IntermediateTensors, SequenceData
from vllm.utils import is_list_of
@@ -722,16 +722,16 @@ def input_processor_for_qwen(ctx: InputContext,
multi_modal_data=multi_modal_data)


def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
"""Maps the input data to its MultiModalInputs (if any).
def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs:
"""Maps the input data to its MultiModalKwargs (if any).
Args:
ctx: Context of the loaded model.
data: data potentially containing image/image embeddings to be mapped
to pixel_values in .forward() for a visual QWenLMHeadModel model.
Returns:
MultiModalInputs containing the stacked normalized images tensor or
MultiModalKwargs containing the stacked normalized images tensor or
image embeddings.
"""
# Early exit if we have provided an image to a language only Qwen model
@@ -740,7 +740,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
logger.warning(
"Images were provided but this model has no visual config; "
"multimodal inputs will not be forwarded to the model.")
return MultiModalInputs()
return MultiModalKwargs()

model_config = ctx.model_config
tokenizer = cached_get_tokenizer(
@@ -784,7 +784,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs:
data = [data]
transformed_images = [transform(datum) for datum in data]
pixel_values = torch.stack(transformed_images, dim=0)
return MultiModalInputs({"pixel_values": pixel_values})
return MultiModalKwargs({"pixel_values": pixel_values})


def build_normalization_transform(image_size: int) -> transforms.Compose:
8 changes: 4 additions & 4 deletions vllm/model_executor/models/qwen2_audio.py
@@ -42,7 +42,7 @@
default_weight_loader, maybe_remap_kv_scale_name)
from vllm.model_executor.models.qwen2 import Qwen2Model
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs
from vllm.multimodal.utils import consecutive_placeholder_ranges
from vllm.sequence import IntermediateTensors, SequenceData

@@ -221,13 +221,13 @@ def input_processor_for_qwen2_audio(
def input_mapper_for_qwen2_audio(
ctx: InputContext,
multi_modal_data: Union[np.ndarray, List[np.ndarray]],
) -> MultiModalInputs:
) -> MultiModalKwargs:
"""Input mapper for Qwen2-Audio."""
if not isinstance(multi_modal_data, list):
multi_modal_data = [multi_modal_data]

if len(multi_modal_data) == 0:
return MultiModalInputs()
return MultiModalKwargs()

processor = cached_get_processor(ctx.model_config.model)
audio_feature_extractor = processor.feature_extractor
@@ -254,7 +254,7 @@ def input_mapper_for_qwen2_audio(
logger.error("Failed to process audio (%s)", multi_modal_data)
raise

return MultiModalInputs(batch_data)
return MultiModalKwargs(batch_data)


@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio)
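
Across the model files above the input-mapper pattern is identical: take an InputContext plus the raw multimodal data and return a MultiModalKwargs, empty when there is nothing to forward. A hedged sketch of that shape with a hypothetical mapper name (not part of this diff; registration shown only as a comment, mirroring fuyu.py):

import torch

from vllm.inputs import InputContext
from vllm.multimodal.base import MultiModalKwargs


def input_mapper_for_mymodel(ctx: InputContext, data: object) -> MultiModalKwargs:
    # Hypothetical mapper illustrating the pattern used by the models in this diff.
    if data is None:
        # Nothing multimodal to forward to the model.
        return MultiModalKwargs()
    pixel_values = torch.as_tensor(data)
    return MultiModalKwargs({"pixel_values": pixel_values})


# Registration is unchanged by the rename, e.g. as in fuyu.py above:
# @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu)
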

