[Bugfix] Fix broadcasting logic for multi_modal_kwargs (vllm-projec…
DarkLight1337 authored Jul 31, 2024
1 parent da1f7cc commit f230cc2
Showing 16 changed files with 254 additions and 211 deletions.
5 changes: 2 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -56,7 +56,6 @@ steps:
fast_check: true
commands:
- pytest -v -s core
- pytest -v -s distributed/test_parallel_state.py

- label: Distributed Comm Ops Test
#mirror_hardwares: [amd]
@@ -90,13 +89,13 @@ steps:
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
2 changes: 2 additions & 0 deletions docs/source/dev/multimodal/multimodal_index.rst
@@ -44,6 +44,8 @@ Base Classes

.. autodata:: vllm.multimodal.BatchedTensors

.. autodata:: vllm.multimodal.BatchedTensorInputs

.. autoclass:: vllm.multimodal.MultiModalDataBuiltins
:members:
:show-inheritance:
9 changes: 5 additions & 4 deletions tests/distributed/test_multimodal_broadcast.py
@@ -19,10 +19,10 @@

model = os.environ["TEST_DIST_MODEL"]

if model.startswith("llava-hf/llava"):
if model.startswith("llava-hf/llava-1.5"):
from ..models.test_llava import models, run_test
elif model.startswith("microsoft/Phi-3-vision"):
from ..models.test_phi3v import models, run_test
elif model.startswith("llava-hf/llava-v1.6"):
from ..models.test_llava_next import models, run_test
else:
raise NotImplementedError(f"Unsupported model: {model}")

@@ -45,7 +45,8 @@ def test_models(hf_runner, vllm_runner, image_assets,
vllm_runner,
image_assets,
model=models[0],
size_factors=[1.0],
# So that LLaVA-NeXT processor may return nested list
size_factors=[0.25, 0.5, 1.0],
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
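The multi-scale size_factors passed above are what exercise the fixed broadcast path: when the same image is fed at several scales, LLaVA-NeXT's processor can produce per-image tensors of different shapes, so they arrive as a nested list rather than one stacked tensor. A minimal sketch of that situation (illustrative shapes only, not vLLM's actual processor output):

import torch

# Two copies of an image at different scales: the shapes no longer match.
pixel_values = [
    torch.randn(3, 336, 336),  # full-size image
    torch.randn(3, 168, 168),  # same image rescaled by 0.5
]

try:
    batched = torch.stack(pixel_values)  # raises: tensors must be equal size
except RuntimeError:
    batched = pixel_values  # fall back to a nested list of tensors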
57 changes: 0 additions & 57 deletions tests/distributed/test_parallel_state.py

This file was deleted.

96 changes: 64 additions & 32 deletions tests/models/test_llava_next.py
@@ -1,14 +1,12 @@
from typing import List, Optional, Tuple
from typing import List, Optional, Tuple, Type

import pytest
from transformers import AutoConfig, AutoTokenizer

from vllm.model_executor.models.llava_next import (
get_llava_next_image_feature_size)
from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs

from ..conftest import IMAGE_ASSETS
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close

pytestmark = pytest.mark.vlm
@@ -27,6 +25,8 @@

IMAGE_TOKEN_ID = 32000

models = ["llava-hf/llava-v1.6-vicuna-7b-hf"]


def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
Optional[SampleLogprobs]],
@@ -50,34 +50,19 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
return hf_output_ids, hf_output_str, out_logprobs


@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-vicuna-7b-hf"])
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype, max_tokens, num_logprobs) -> None:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
size_factors: List[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
images = [asset.pil_image for asset in image_assets]

inputs_per_image = [(
@@ -89,6 +74,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
with vllm_runner(model,
dtype=dtype,
max_model_len=4096,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
@@ -122,9 +109,54 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
)


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype, max_tokens, num_logprobs) -> None:
"""Inference result should be the same between hf and vllm.
All the image fixtures for the test is under tests/images.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
and corresponding vision language config as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
"""
run_test(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)


@pytest.mark.parametrize("height_and_width_and_result", [(1669, 2560, 2144),
(183, 488, 776)])
def test_image_feature_size(height_and_width_and_result):
# Avoid initializing CUDA too early in distributed tests
from vllm.model_executor.models.llava_next import (
get_llava_next_image_feature_size)

height, width, result = height_and_width_and_result
config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
assert get_llava_next_image_feature_size(config,
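With the test body factored into run_test, a multi-GPU test can reuse it directly. A hedged sketch of such a caller (the fixture wiring and env-var handling here are assumptions for illustration, not the exact contents of test_multimodal_broadcast.py):

import os

import pytest


@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_distributed(hf_runner, vllm_runner, image_assets, dtype,
                            max_tokens, num_logprobs) -> None:
    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        model=models[0],
        # Multi-scale inputs so the processor may return a nested list
        size_factors=[0.25, 0.5, 1.0],
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=2,
        distributed_executor_backend=os.environ.get(
            "DISTRIBUTED_EXECUTOR_BACKEND"),
    )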
46 changes: 11 additions & 35 deletions vllm/distributed/parallel_state.py
@@ -45,54 +45,30 @@ class GraphCaptureContext:


def _split_tensor_dict(
tensor_dict: Dict[str, Union[torch.Tensor, Any]],
prefix: str = "") -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
tensor_dict: Dict[str, Union[torch.Tensor, Any]]
) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
"""Split the tensor dictionary into two parts:
1. A list of (key, value) pairs. If the value is a tensor, it is replaced
by its metadata.
2. A list of tensors.
If the Tensor is nested under `tensor_dict["key1"]["key2"]`, the key of its
metadata will be "key1%key2".
"""
metadata_list: List[Tuple[str, Any]] = []
tensor_list = []
tensor_list: List[torch.Tensor] = []
for key, value in tensor_dict.items():
assert "%" not in key, (
"Avoid having '%' in key "
"as it is used as a separator for nested entries.")
if isinstance(value, torch.Tensor):
# Note: we cannot use `value.device` here,
# because it contains not only the device type but also the device
# index (e.g. "cuda:0"). We only need the device type.
# receiving side will set the device index.
device = value.device.type
metadata_list.append(
(prefix + key, TensorMetadata(device, value.dtype,
value.size())))
(key, TensorMetadata(device, value.dtype, value.size())))
tensor_list.append(value)
elif isinstance(value, dict):
if len(value) == 0:
metadata_list.append((prefix + key, value))
inner_metadata_list, inner_tensor_list = _split_tensor_dict(
value, prefix + key + "%")
metadata_list.extend(inner_metadata_list)
tensor_list.extend(inner_tensor_list)
else:
metadata_list.append((prefix + key, value))
metadata_list.append((key, value))
return metadata_list, tensor_list


def _update_nested_dict(nested_dict, flattened_key, value):
key_splits = flattened_key.split("%")
cur_dict = nested_dict
for k in key_splits[:-1]:
if k not in cur_dict:
cur_dict[k] = {}
cur_dict = cur_dict[k]
cur_dict[key_splits[-1]] = value


class GroupCoordinator:
"""
PyTorch ProcessGroup wrapper for a group of processes.
@@ -566,7 +542,7 @@ def broadcast_tensor_dict(
device=value.device)
if tensor.numel() == 0:
# Skip broadcasting empty tensors.
_update_nested_dict(tensor_dict, key, tensor)
tensor_dict[key] = tensor
continue
if tensor.is_cpu:
# use metadata_group for CPU tensors
@@ -583,9 +559,9 @@
group=group,
async_op=True)
async_handles.append(handle)
_update_nested_dict(tensor_dict, key, tensor)
tensor_dict[key] = tensor
else:
_update_nested_dict(tensor_dict, key, value)
tensor_dict[key] = value
for async_handle in async_handles:
async_handle.wait()
return tensor_dict
@@ -661,7 +637,7 @@ def recv_tensor_dict(
device=value.device)
if tensor.numel() == 0:
# Skip broadcasting empty tensors.
_update_nested_dict(tensor_dict, key, tensor)
tensor_dict[key] = tensor
continue
if tensor.is_cpu:
# use metadata_group for CPU tensors
@@ -673,9 +649,9 @@
torch.distributed.recv(tensor,
src=self.ranks[src],
group=group)
_update_nested_dict(tensor_dict, key, tensor)
tensor_dict[key] = tensor
else:
_update_nested_dict(tensor_dict, key, value)
tensor_dict[key] = value
return tensor_dict

def barrier(self):
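After this change, _split_tensor_dict no longer flattens nested dictionaries with a "%" separator, and the receive paths simply assign tensor_dict[key] = tensor instead of going through the removed _update_nested_dict; tensors are only extracted from top-level values, so callers are expected to pass flat tensor dicts. A minimal standalone sketch of that split-and-rebuild flow (TensorMeta here is an illustrative stand-in, not vLLM's TensorMetadata class):

from typing import Any, Dict, List, NamedTuple, Tuple

import torch


class TensorMeta(NamedTuple):
    device: str
    dtype: torch.dtype
    size: torch.Size


def split_tensor_dict(
    tensor_dict: Dict[str, Any]
) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]:
    # Sender side: replace each tensor with picklable metadata and collect
    # the tensors themselves in a separate list for point-to-point transfer.
    metadata: List[Tuple[str, Any]] = []
    tensors: List[torch.Tensor] = []
    for key, value in tensor_dict.items():
        if isinstance(value, torch.Tensor):
            metadata.append(
                (key, TensorMeta(value.device.type, value.dtype, value.size())))
            tensors.append(value)
        else:
            metadata.append((key, value))
    return metadata, tensors


def rebuild_tensor_dict(metadata, tensors):
    # Receiver side: plain key assignment, no nested-key reconstruction.
    out: Dict[str, Any] = {}
    it = iter(tensors)
    for key, value in metadata:
        out[key] = next(it) if isinstance(value, TensorMeta) else value
    return out


sent = {"input_ids": torch.arange(4), "seq_len": 4}
meta, tens = split_tensor_dict(sent)
assert rebuild_tensor_dict(meta, tens)["seq_len"] == 4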
6 changes: 4 additions & 2 deletions vllm/multimodal/__init__.py
@@ -1,5 +1,6 @@
from .base import (BatchedTensors, MultiModalDataBuiltins, MultiModalDataDict,
MultiModalInputs, MultiModalPlugin, NestedTensors)
from .base import (BatchedTensorInputs, BatchedTensors, MultiModalDataBuiltins,
MultiModalDataDict, MultiModalInputs, MultiModalPlugin,
NestedTensors)
from .registry import MultiModalRegistry

MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -12,6 +13,7 @@
"""

__all__ = [
"BatchedTensorInputs",
"BatchedTensors",
"MultiModalDataBuiltins",
"MultiModalDataDict",
(Remaining changed files not shown.)
