diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index ea607fc2a1e51..122a7743ae14c 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -1,5 +1,5 @@
 """
-This example shows how to use vLLM for running offline inference 
+This example shows how to use vLLM for running offline inference
 with the correct prompt format on vision language models.

 For most models, the prompt format should follow corresponding examples
@@ -12,7 +12,8 @@
 from vllm.utils import FlexibleArgumentParser

 # Input image and question
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+cherry_blossom_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+stop_sign_image = ImageAsset("stop_sign").pil_image.convert("RGB")
 question = "What is the content of this image?"


@@ -170,34 +171,38 @@ def main(args):

     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=64,
+    sampling_params = SamplingParams(max_tokens=2048,
                                      stop_token_ids=stop_token_ids)

-    assert args.num_prompts > 0
-    if args.num_prompts == 1:
-        # Single inference
-        inputs = {
-            "prompt": prompt,
+    inputs = [
+        {
+            "prompt": f"[INST] What is the content of this image? [/INST]",
+        },
+        {
+            "prompt": f"[INST] <image>\nWhat is the content of this image? [/INST]",
             "multi_modal_data": {
-                "image": image
-            },
-        }
-
-    else:
-        # Batch inference
-        inputs = [{
-            "prompt": prompt,
+                "image": cherry_blossom_image
+            }
+        },
+        {
+            "prompt": f"[INST] <image>\nWhat is the content of this image? [/INST]",
             "multi_modal_data": {
-                "image": image
-            },
-        } for _ in range(args.num_prompts)]
+                "image": stop_sign_image
+            }
+        },
+        {
+            "prompt": f"[INST] <image>\n<image>\nWhat are the contents of the two images? [/INST]",
+            "multi_modal_data": {
+                "image": [cherry_blossom_image, stop_sign_image]
+            }
+        },
+    ]

     outputs = llm.generate(inputs, sampling_params=sampling_params)

-    for o in outputs:
+    for i, o in zip(inputs, outputs):
         generated_text = o.outputs[0].text
-        print(generated_text)
+        print(i["prompt"], generated_text)


 if __name__ == "__main__":
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 9abc480f60dec..ef7ced36582b9 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -207,6 +207,14 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
             input_height=height,
             input_width=width,
         )
+    elif isinstance(image_data, list):
+        width, height = image_data[0].size
+
+        image_feature_size = get_llava_next_image_feature_size(
+            hf_config,
+            input_height=height,
+            input_width=width,
+        )
     elif isinstance(image_data, torch.Tensor):
         raise NotImplementedError("Embeddings input is not supported yet")
     else:
@@ -511,7 +519,7 @@ def forward(
         9047, 13566, 29901]`.

         To reserve space in KV cache, we have to insert placeholder tokens
-        before they are inputted to the model, so the input processor prepends 
+        before they are inputted to the model, so the input processor prepends
         additional image tokens (denoted as `32000`), resulting in:
         `[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082,
         20255, 29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568,
@@ -532,7 +540,7 @@ def forward(
                 batch.
             pixel_values: The pixels in each grid patch for each input image.
             image_sizes: The original `(height, width)` for each input image.
-        
+
         See also:
             :class:`LlavaNextImageInputs`
         """
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index b6a3909e95632..2db06a213b9b9 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -71,13 +71,9 @@ def repeat_and_pad_image_tokens(
                        "Please follow the prompt format that is "
                        "documented on HuggingFace which does not involve "
                        "repeating %s tokens.", image_token_str)
-    elif image_token_count > 1:
-        logger.warning("Multiple image input is not supported yet, "
-                       "so any extra image tokens will be treated "
-                       "as plain text.")

     # The image tokens are removed to be consistent with HuggingFace
-    new_prompt = prompt.replace(image_token_str, replacement_str, 1)
+    new_prompt = prompt.replace(image_token_str, replacement_str)

     new_token_ids: List[int] = []
     for i, token in enumerate(prompt_token_ids):
@@ -89,10 +85,6 @@ def repeat_and_pad_image_tokens(
                 pad_token_right=pad_token_right,
             )
             new_token_ids.extend(replacement_ids)
-
-            # No need to further scan the list since we only replace once
-            new_token_ids.extend(prompt_token_ids[i + 1:])
-            break
         else:
             new_token_ids.append(token)
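A minimal, self-contained sketch (not part of the patch) of how the new input format can be exercised once this change lands. The checkpoint name llava-hf/llava-v1.6-mistral-7b-hf is an assumption inferred from the [INST] prompt style used in the example above; the list-of-dicts passed to llm.generate and the <image> placeholders mirror the updated example script.

    from vllm import LLM, SamplingParams
    from vllm.assets.image import ImageAsset

    # Bundled test assets, as used in the example script.
    cherry_blossom = ImageAsset("cherry_blossom").pil_image.convert("RGB")
    stop_sign = ImageAsset("stop_sign").pil_image.convert("RGB")

    # Assumed checkpoint; any LLaVA-NeXT model using the [INST] prompt format
    # should behave the same way.
    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")

    # One multi-image request: two <image> placeholders in the prompt paired
    # with a list of two PIL images under "multi_modal_data".
    inputs = [{
        "prompt": "[INST] <image>\n<image>\n"
                  "What are the contents of the two images? [/INST]",
        "multi_modal_data": {
            "image": [cherry_blossom, stop_sign]
        },
    }]

    outputs = llm.generate(inputs, sampling_params=SamplingParams(max_tokens=128))
    for o in outputs:
        print(o.outputs[0].text)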