
Commit

Initial commit:
zifeitong committed Aug 8, 2024
1 parent 21b9c49 commit 124c946
Showing 3 changed files with 38 additions and 33 deletions.
examples/offline_inference_vision_language.py  (27 additions, 22 deletions)
@@ -1,5 +1,5 @@
 """
-This example shows how to use vLLM for running offline inference 
+This example shows how to use vLLM for running offline inference
 with the correct prompt format on vision language models.
 For most models, the prompt format should follow corresponding examples
@@ -12,7 +12,8 @@
 from vllm.utils import FlexibleArgumentParser
 
 # Input image and question
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+cherry_blossom_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+stop_sign_image = ImageAsset("stop_sign").pil_image.convert("RGB")
 question = "What is the content of this image?"
@@ -170,34 +171,38 @@ def main(args):
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=64,
+    sampling_params = SamplingParams(max_tokens=2048,
                                      stop_token_ids=stop_token_ids)
 
-    assert args.num_prompts > 0
-    if args.num_prompts == 1:
-        # Single inference
-        inputs = {
-            "prompt": prompt,
-            "multi_modal_data": {
-                "image": image
-            },
-        }
-
-    else:
-        # Batch inference
-        inputs = [{
-            "prompt": prompt,
-            "multi_modal_data": {
-                "image": image
-            },
-        } for _ in range(args.num_prompts)]
+    inputs = [
+        {
+            "prompt": f"[INST] What is the content of this image? [/INST]",
+        },
+        {
+            "prompt": f"[INST] <image>\nWhat is the content of this image? [/INST]",
+            "multi_modal_data": {
+                "image": cherry_blossom_image
+            }
+        },
+        {
+            "prompt": f"[INST] <image>\nWhat is the content of this image? [/INST]",
+            "multi_modal_data": {
+                "image": stop_sign_image
+            }
+        },
+        {
+            "prompt": f"[INST] <image> \n <image> \nWhat are the contents of the two images? [/INST]",
+            "multi_modal_data": {
+                "image": [cherry_blossom_image, stop_sign_image]
+            }
+        },
+    ]
 
     outputs = llm.generate(inputs, sampling_params=sampling_params)
 
-    for o in outputs:
+    for i, o in zip(inputs, outputs):
         generated_text = o.outputs[0].text
-        print(generated_text)
+        print(i["prompt"], generated_text)
 
 
 if __name__ == "__main__":
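For reference, the call pattern the updated example relies on looks like the following. This is a minimal sketch, not part of the commit: the model name and sampling settings are illustrative, and a real run may need additional engine arguments depending on the vLLM version.

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Illustrative checkpoint choice; any LLaVA-NeXT model supported by vLLM
# should behave similarly.
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")

cherry_blossom_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
stop_sign_image = ImageAsset("stop_sign").pil_image.convert("RGB")

# "image" may be a single PIL image or a list of images; each <image>
# placeholder in the prompt is expanded by the input processor.
inputs = [{
    "prompt": "[INST] <image> \n <image> \n"
              "What are the contents of the two images? [/INST]",
    "multi_modal_data": {
        "image": [cherry_blossom_image, stop_sign_image],
    },
}]

outputs = llm.generate(inputs, SamplingParams(max_tokens=128))
print(outputs[0].outputs[0].text)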
vllm/model_executor/models/llava_next.py  (10 additions, 2 deletions)
@@ -207,6 +207,14 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
             input_height=height,
             input_width=width,
         )
+    elif isinstance(image_data, list):
+        width, height = image_data[0].size
+
+        image_feature_size = get_llava_next_image_feature_size(
+            hf_config,
+            input_height=height,
+            input_width=width,
+        )
     elif isinstance(image_data, torch.Tensor):
         raise NotImplementedError("Embeddings input is not supported yet")
     else:
@@ -511,7 +519,7 @@ def forward(
         9047, 13566, 29901]`.
 
         To reserve space in KV cache, we have to insert placeholder tokens
-        before they are inputted to the model, so the input processor prepends 
+        before they are inputted to the model, so the input processor prepends
         additional image tokens (denoted as `32000`), resulting in:
         `[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082, 20255,
         29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568,
@@ -532,7 +540,7 @@ def forward(
                 batch.
             pixel_values: The pixels in each grid patch for each input image.
             image_sizes: The original `(height, width)` for each input image.
-        
+
         See also:
             :class:`LlavaNextImageInputs`
         """
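The new `isinstance(image_data, list)` branch derives the image feature size from the first image in the list, which implicitly assumes every image in a prompt has the same dimensions. A simplified, self-contained sketch of that dispatch (the function name and return value here are illustrative, not vLLM's API):

from typing import List, Tuple, Union

from PIL import Image


def size_for_feature_estimate(
        image_data: Union[Image.Image, List[Image.Image]]) -> Tuple[int, int]:
    # Returns the (height, width) that the input processor would feed into
    # get_llava_next_image_feature_size().
    if isinstance(image_data, Image.Image):
        width, height = image_data.size
    elif isinstance(image_data, list):
        # Added in this commit: the first image drives the estimate.
        width, height = image_data[0].size
    else:
        raise TypeError(f"Invalid image type: {type(image_data)}")
    return height, width


# Two same-sized images, as in the updated example script.
images = [Image.new("RGB", (640, 480)), Image.new("RGB", (640, 480))]
assert size_for_feature_estimate(images) == (480, 640)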
vllm/multimodal/image.py  (1 addition, 9 deletions)
@@ -71,13 +71,9 @@ def repeat_and_pad_image_tokens(
                 "Please follow the prompt format that is "
                 "documented on HuggingFace which does not involve "
                 "repeating %s tokens.", image_token_str)
-        elif image_token_count > 1:
-            logger.warning("Multiple image input is not supported yet, "
-                           "so any extra image tokens will be treated "
-                           "as plain text.")
 
         # The image tokens are removed to be consistent with HuggingFace
-        new_prompt = prompt.replace(image_token_str, replacement_str, 1)
+        new_prompt = prompt.replace(image_token_str, replacement_str)
 
     new_token_ids: List[int] = []
     for i, token in enumerate(prompt_token_ids):
@@ -89,10 +85,6 @@ def repeat_and_pad_image_tokens(
                 pad_token_right=pad_token_right,
             )
             new_token_ids.extend(replacement_ids)
-
-            # No need to further scan the list since we only replace once
-            new_token_ids.extend(prompt_token_ids[i + 1:])
-            break
         else:
             new_token_ids.append(token)
 
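With the single-replacement limit and the early break removed, every image token in the prompt and in prompt_token_ids is now expanded, not just the first one. A standalone sketch of the resulting behavior (simplified and hypothetical naming: the real helper also rebuilds the string prompt and supports optional left/right pad tokens):

from typing import List


def expand_image_tokens(prompt_token_ids: List[int], image_token_id: int,
                        repeat_count: int) -> List[int]:
    # Every occurrence of image_token_id is expanded to repeat_count
    # placeholder tokens; previously only the first occurrence was.
    new_token_ids: List[int] = []
    for token in prompt_token_ids:
        if token == image_token_id:
            new_token_ids.extend([image_token_id] * repeat_count)
        else:
            new_token_ids.append(token)
    return new_token_ids


# Two <image> tokens (id 32000), each expanded to 3 placeholders.
print(expand_image_tokens([1, 32000, 5, 32000, 2],
                          image_token_id=32000, repeat_count=3))
# [1, 32000, 32000, 32000, 5, 32000, 32000, 32000, 2]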
