From 60f22a275bf03633f5b948137e301ccc602c9985 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 26 Jun 2024 16:54:43 -0700
Subject: [PATCH 1/2] add llava next example

---
 examples/llava_next_example.py | 38 ++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 examples/llava_next_example.py

diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
new file mode 100644
index 0000000000000..7acf374968d2c
--- /dev/null
+++ b/examples/llava_next_example.py
@@ -0,0 +1,38 @@
+import requests
+from io import BytesIO
+
+from PIL import Image
+
+from vllm import LLM
+from vllm.multimodal.image import ImagePixelData
+from vllm import SamplingParams
+
+
+# Dynamic image input is currently not supported and therefore 
+# a fixed image input shape and its corresponding feature size is required.
+# See https://github.com/vllm-project/vllm/pull/4199 for the complete 
+# configuration matrix.
+
+llm = LLM(
+    model="llava-hf/llava-v1.6-mistral-7b-hf",
+    image_input_type="pixel_values",
+    image_token_id=32000,
+    image_input_shape="1,3,336,336",
+    image_feature_size=1176,
+)
+
+prompt = "[INST] " + "<image>" * 1176 + "\nWhat is shown in this image? [/INST]"
+url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
+image = Image.open(BytesIO(requests.get(url).content))
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
+
+outputs = llm.generate({
+    "prompt": prompt,
+    "multi_modal_data": ImagePixelData(image),
+}, sampling_params=sampling_params)
+
+generated_text = ""
+for o in outputs:
+    generated_text += o.outputs[0].text
+    
+print(f"LLM output:{generated_text}")

From 886cb941ffb7c3412f02011c319e7b3cb68cccea Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 26 Jun 2024 17:11:25 -0700
Subject: [PATCH 2/2] format

---
 examples/llava_next_example.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
index 7acf374968d2c..e90a86abe41cb 100644
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -1,16 +1,14 @@
-import requests
 from io import BytesIO
 
+import requests
 from PIL import Image
 
-from vllm import LLM
+from vllm import LLM, SamplingParams
 from vllm.multimodal.image import ImagePixelData
-from vllm import SamplingParams
-
 
-# Dynamic image input is currently not supported and therefore 
+# Dynamic image input is currently not supported and therefore
 # a fixed image input shape and its corresponding feature size is required.
-# See https://github.com/vllm-project/vllm/pull/4199 for the complete 
+# See https://github.com/vllm-project/vllm/pull/4199 for the complete
 # configuration matrix.
 
 llm = LLM(
@@ -26,13 +24,15 @@
 image = Image.open(BytesIO(requests.get(url).content))
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
 
-outputs = llm.generate({
-    "prompt": prompt,
-    "multi_modal_data": ImagePixelData(image),
-}, sampling_params=sampling_params)
+outputs = llm.generate(
+    {
+        "prompt": prompt,
+        "multi_modal_data": ImagePixelData(image),
+    },
+    sampling_params=sampling_params)
 
 generated_text = ""
 for o in outputs:
     generated_text += o.outputs[0].text
-    
+
 print(f"LLM output:{generated_text}")
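
Note: the prompt in this example repeats the "<image>" placeholder exactly 1176 times to match the image_feature_size=1176 passed to LLM(...), as required while dynamic image input is unsupported. A minimal sketch of building such a prompt programmatically, assuming only what the patch shows; the IMAGE_FEATURE_SIZE constant and build_llava_prompt helper are illustrative names, not part of this patch or of the vLLM API:

    # Illustrative sketch (not part of the patch): keep the number of "<image>"
    # placeholder tokens in sync with the image_feature_size passed to LLM(...).
    IMAGE_FEATURE_SIZE = 1176  # must match image_feature_size above

    def build_llava_prompt(question: str) -> str:
        # LLaVA-NeXT (Mistral) chat format with one fixed-size image slot.
        image_placeholders = "<image>" * IMAGE_FEATURE_SIZE
        return f"[INST] {image_placeholders}\n{question} [/INST]"

    prompt = build_llava_prompt("What is shown in this image?")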