
Commit

Initial commit:
zifeitong committed Aug 15, 2024
1 parent f4da5f7 commit c700350
Showing 3 changed files with 38 additions and 33 deletions.
49 changes: 27 additions & 22 deletions examples/offline_inference_vision_language.py
@@ -1,5 +1,5 @@
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on vision language models.
For most models, the prompt format should follow corresponding examples
@@ -12,7 +12,8 @@
from vllm.utils import FlexibleArgumentParser

# Input image and question
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+cherry_blossom_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+stop_sign_image = ImageAsset("stop_sign").pil_image.convert("RGB")
question = "What is the content of this image?"


@@ -181,34 +182,38 @@ def main(args):

     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=64,
+    sampling_params = SamplingParams(max_tokens=2048,
                                      stop_token_ids=stop_token_ids)

-    assert args.num_prompts > 0
-    if args.num_prompts == 1:
-        # Single inference
-        inputs = {
-            "prompt": prompt,
-            "multi_modal_data": {
-                "image": image
-            },
-        }
-
-    else:
-        # Batch inference
-        inputs = [{
-            "prompt": prompt,
-            "multi_modal_data": {
-                "image": image
-            },
-        } for _ in range(args.num_prompts)]
+    inputs = [
+        {
+            "prompt": f"[INST] What is the content of this image? [/INST]",
+        },
+        {
+            "prompt": f"[INST] <image>\nWhat is the content of this image? [/INST]",
+            "multi_modal_data": {
+                "image": cherry_blossom_image
+            }
+        },
+        {
+            "prompt": f"[INST] <image>\nWhat is the content of this image? [/INST]",
+            "multi_modal_data": {
+                "image": stop_sign_image
+            }
+        },
+        {
+            "prompt": f"[INST] <image> \n <image> \nWhat are the contents of the two images? [/INST]",
+            "multi_modal_data": {
+                "image": [cherry_blossom_image, stop_sign_image]
+            }
+        },
+    ]

     outputs = llm.generate(inputs, sampling_params=sampling_params)

-    for o in outputs:
+    for i, o in zip(inputs, outputs):
         generated_text = o.outputs[0].text
-        print(generated_text)
+        print(i["prompt"], generated_text)


if __name__ == "__main__":
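Note: a minimal standalone sketch (not part of the diff) of the multi-image input format the updated example relies on. The model name is an assumption, not something specified by this commit; any LLaVA-NeXT checkpoint that uses the [INST] ... [/INST] chat format should behave similarly.

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

# Assumed checkpoint; swap in any LLaVA-NeXT model with a matching prompt format.
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")

cherry_blossom_image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
stop_sign_image = ImageAsset("stop_sign").pil_image.convert("RGB")

# "image" may be a single PIL image or a list of PIL images, with one
# <image> placeholder in the prompt per input image.
outputs = llm.generate(
    {
        "prompt": "[INST] <image> \n <image> \nWhat are the contents of the two images? [/INST]",
        "multi_modal_data": {
            "image": [cherry_blossom_image, stop_sign_image]
        },
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)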
12 changes: 10 additions & 2 deletions vllm/model_executor/models/llava_next.py
@@ -223,6 +223,14 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
             input_height=height,
             input_width=width,
         )
+    elif isinstance(image_data, list):
+        width, height = image_data[0].size
+
+        image_feature_size = get_llava_next_image_feature_size(
+            hf_config,
+            input_height=height,
+            input_width=width,
+        )
     elif isinstance(image_data, torch.Tensor):
         image_feature_size = image_data.shape[0]
     else:
@@ -545,7 +553,7 @@ def forward(
9047, 13566, 29901]`.
To reserve space in KV cache, we have to insert placeholder tokens
before they are inputted to the model, so the input processor prepends
additional image tokens (denoted as `32000`), resulting in:
`[1, 319, 13563, 1546, 263, 12758, 5199, 322, 385, 23116, 21082, 20255,
29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568,
@@ -566,7 +574,7 @@ def forward(
batch.
pixel_values: The pixels in each grid patch for each input image.
image_sizes: The original `(height, width)` for each input image.
See also:
:class:`LlavaNextImageInputs`
"""
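To make the KV-cache reservation described in the forward docstring above concrete, here is a toy stand-in for the placeholder expansion done by the input processor. The token id 32000 comes from the docstring; the function name and feature size are illustrative placeholders, not the real vLLM helpers.

from typing import List

IMAGE_TOKEN_ID = 32000   # LLaVA-NeXT image token id, as in the docstring
IMAGE_FEATURE_SIZE = 4   # tiny placeholder; the real value depends on the input image size


def expand_image_placeholders(prompt_token_ids: List[int]) -> List[int]:
    """Toy version of the input processor's placeholder expansion."""
    new_ids: List[int] = []
    for tok in prompt_token_ids:
        if tok == IMAGE_TOKEN_ID:
            # One run of placeholders per <image> token, so a prompt with two
            # images reserves KV-cache space for both sets of image features.
            new_ids.extend([IMAGE_TOKEN_ID] * IMAGE_FEATURE_SIZE)
        else:
            new_ids.append(tok)
    return new_ids


print(expand_image_placeholders([1, 32000, 29871, 32000, 2]))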
10 changes: 1 addition & 9 deletions vllm/multimodal/image.py
@@ -71,13 +71,9 @@ def repeat_and_pad_image_tokens(
             "Please follow the prompt format that is "
             "documented on HuggingFace which does not involve "
             "repeating %s tokens.", image_token_str)
-    elif image_token_count > 1:
-        logger.warning("Multiple image input is not supported yet, "
-                       "so any extra image tokens will be treated "
-                       "as plain text.")

     # The image tokens are removed to be consistent with HuggingFace
-    new_prompt = prompt.replace(image_token_str, replacement_str, 1)
+    new_prompt = prompt.replace(image_token_str, replacement_str)

new_token_ids: List[int] = []
for i, token in enumerate(prompt_token_ids):
@@ -89,10 +85,6 @@
                 pad_token_right=pad_token_right,
             )
             new_token_ids.extend(replacement_ids)
-
-            # No need to further scan the list since we only replace once
-            new_token_ids.extend(prompt_token_ids[i + 1:])
-            break
         else:
             new_token_ids.append(token)

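A small standalone illustration (not the vLLM code itself) of the behavioural change in repeat_and_pad_image_tokens: the image placeholder is now expanded at every occurrence instead of only the first, which is what lets prompts with several <image> tokens work. The repeat count below is a made-up value for demonstration.

prompt = "[INST] <image> \n <image> \nWhat are the contents of the two images? [/INST]"
# Toy repeat count; vLLM derives the real one from the per-image feature size.
replacement_str = "<image>" * 3

# Old behaviour: only the first <image> token was expanded; extra image tokens
# were left as plain text and a warning was logged.
old_prompt = prompt.replace("<image>", replacement_str, 1)

# New behaviour: every <image> token is expanded, one run per input image.
new_prompt = prompt.replace("<image>", replacement_str)

print(old_prompt)
print(new_prompt)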
