From 60f22a275bf03633f5b948137e301ccc602c9985 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 26 Jun 2024 16:54:43 -0700
Subject: [PATCH 1/2] add llava next example

---
 examples/llava_next_example.py | 38 ++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 examples/llava_next_example.py

diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
new file mode 100644
index 0000000000000..7acf374968d2c
--- /dev/null
+++ b/examples/llava_next_example.py
@@ -0,0 +1,38 @@
+import requests
+from io import BytesIO
+
+from PIL import Image
+
+from vllm import LLM
+from vllm.multimodal.image import ImagePixelData
+from vllm import SamplingParams
+
+
+# Dynamic image input is currently not supported and therefore 
+# a fixed image input shape and its corresponding feature size is required.
+# See https://github.com/vllm-project/vllm/pull/4199 for the complete 
+# configuration matrix.
+
+llm = LLM(
+    model="llava-hf/llava-v1.6-mistral-7b-hf",
+    image_input_type="pixel_values",
+    image_token_id=32000,
+    image_input_shape="1,3,336,336",
+    image_feature_size=1176,
+)
+
+prompt = "[INST] " + "<image>" * 1176 + "\nWhat is shown in this image? [/INST]"
+url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
+image = Image.open(BytesIO(requests.get(url).content))
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
+
+outputs = llm.generate({
+    "prompt": prompt,
+    "multi_modal_data": ImagePixelData(image),
+}, sampling_params=sampling_params)
+
+generated_text = ""
+for o in outputs:
+    generated_text += o.outputs[0].text
+    
+print(f"LLM output:{generated_text}")

From 886cb941ffb7c3412f02011c319e7b3cb68cccea Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 26 Jun 2024 17:11:25 -0700
Subject: [PATCH 2/2] format

---
 examples/llava_next_example.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
index 7acf374968d2c..e90a86abe41cb 100644
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -1,16 +1,14 @@
-import requests
 from io import BytesIO
 
+import requests
 from PIL import Image
 
-from vllm import LLM
+from vllm import LLM, SamplingParams
 from vllm.multimodal.image import ImagePixelData
-from vllm import SamplingParams
-
 
-# Dynamic image input is currently not supported and therefore 
+# Dynamic image input is currently not supported and therefore
 # a fixed image input shape and its corresponding feature size is required.
-# See https://github.com/vllm-project/vllm/pull/4199 for the complete 
+# See https://github.com/vllm-project/vllm/pull/4199 for the complete
 # configuration matrix.
 
 llm = LLM(
@@ -26,13 +24,15 @@
 image = Image.open(BytesIO(requests.get(url).content))
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
 
-outputs = llm.generate({
-    "prompt": prompt,
-    "multi_modal_data": ImagePixelData(image),
-}, sampling_params=sampling_params)
+outputs = llm.generate(
+    {
+        "prompt": prompt,
+        "multi_modal_data": ImagePixelData(image),
+    },
+    sampling_params=sampling_params)
 
 generated_text = ""
 for o in outputs:
     generated_text += o.outputs[0].text
-    
+
 print(f"LLM output:{generated_text}")
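
Note: the prompt in this example repeats the "<image>" placeholder exactly 1176 times to match the image_feature_size=1176 passed to LLM(...), as required while dynamic image input is unsupported. A minimal sketch of building such a prompt programmatically, assuming only what the patch shows; the IMAGE_FEATURE_SIZE constant and build_llava_prompt helper are illustrative names, not part of this patch or of the vLLM API:

    # Illustrative sketch (not part of the patch): keep the number of "<image>"
    # placeholder tokens in sync with the image_feature_size passed to LLM(...).
    IMAGE_FEATURE_SIZE = 1176  # must match image_feature_size above

    def build_llava_prompt(question: str) -> str:
        # LLaVA-NeXT (Mistral) chat format with one fixed-size image slot.
        image_placeholders = "<image>" * IMAGE_FEATURE_SIZE
        return f"[INST] {image_placeholders}\n{question} [/INST]"

    prompt = build_llava_prompt("What is shown in this image?")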