From b31b7712676960ebebf7c93207aa9b12e0ddf9d9 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 8 Oct 2024 18:00:40 +0530
Subject: [PATCH 1/7] Update vlm.rst to include an example on videos

---
 docs/source/models/vlm.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 8f5aa58f9f2b9..85a0d8b0a3a05 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -133,7 +133,8 @@ Instead of passing in a single image, you can pass in a list of images.
         generated_text = o.outputs[0].text
         print(generated_text)
 
-A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_.
+A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_. Multi-image input can be extended to
+perform video captioning. Refer to [this resource](https://github.com/vllm-project/vllm/issues/9128#issuecomment-2399642038) for a full example.
 
 Online Inference
 ----------------

From b5aca1b6e9fe1d5823197ec2aa650382a3d441ee Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 8 Oct 2024 20:14:44 +0530
Subject: [PATCH 2/7] Update vlm.rst

---
 docs/source/models/vlm.rst | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 85a0d8b0a3a05..58c0e9da6f826 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -134,7 +134,25 @@ Instead of passing in a single image, you can pass in a list of images.
         print(generated_text)
 
 A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_. Multi-image input can be extended to
-perform video captioning. Refer to [this resource](https://github.com/vllm-project/vllm/issues/9128#issuecomment-2399642038) for a full example.
+perform video captioning. We show this with [Qwen-VL2]https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
+
+.. code-block:: python
+    # Specify the maximum number of frames per video to be 4. This can be changed.
+    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
+
+    # Create the request payload.
+    video_frames = ... # load your video making sure it only has the number of frames specified earlier.
+    messages = [{"role": "user", "content": []}]
+    messages[0]["content"].append({"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."})
+    for i in range(len(video_frames)):
+        base64_image = encode_image(video_frames[i]) # base64 encoding.
+
+    # Perform inference and log output.
+    outputs = llm.chat(messages)
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
 
 Online Inference
 ----------------

From f3d79ebecdec6113d45abddbe9b092892434721f Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 8 Oct 2024 20:20:15 +0530
Subject: [PATCH 3/7] Update docs/source/models/vlm.rst

Co-authored-by: Cyrus Leung
---
 docs/source/models/vlm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 58c0e9da6f826..df09b6e700738 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -134,7 +134,7 @@ Instead of passing in a single image, you can pass in a list of images.
         print(generated_text)
 
 A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_. Multi-image input can be extended to
-perform video captioning. We show this with [Qwen-VL2]https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
+perform video captioning. We show this with `Qwen2-VL <https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct>`_ as it supports videos:
 
 .. code-block:: python
     # Specify the maximum number of frames per video to be 4. This can be changed.

From 3c7f8e731f024a51fe81c0621b7c666de3ef7bf2 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 8 Oct 2024 20:23:41 +0530
Subject: [PATCH 4/7] Update vlm.rst

---
 docs/source/models/vlm.rst | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index df09b6e700738..ce089902ffc26 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -142,10 +142,13 @@ perform video captioning. We show this with `Qwen2-VL
 
     # Create the request payload.
     video_frames = ... # load your video making sure it only has the number of frames specified earlier.
-    messages = [{"role": "user", "content": []}]
-    messages[0]["content"].append({"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."})
+    messages = [
+        {"role": "user", "content": [{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}]}
+    ]
     for i in range(len(video_frames)):
         base64_image = encode_image(video_frames[i]) # base64 encoding.
+        new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
+        messages[0]["content"].append(new_image)
 
     # Perform inference and log output.
     outputs = llm.chat(messages)

From: Sayak Paul
Date: Tue, 8 Oct 2024 20:40:32 +0530
Subject: [PATCH 5/7] Update docs/source/models/vlm.rst

Co-authored-by: Cyrus Leung
---
 docs/source/models/vlm.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index ce089902ffc26..9727e0cdddbf2 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -133,8 +133,9 @@ Instead of passing in a single image, you can pass in a list of images.
         generated_text = o.outputs[0].text
         print(generated_text)
 
-A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_. Multi-image input can be extended to
-perform video captioning. We show this with `Qwen2-VL <https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct>`_ as it supports videos:
+A code example can be found in `examples/offline_inference_vision_language_multi_image.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_vision_language_multi_image.py>`_.
+
+Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL <https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct>`_ as it supports videos:
 
 .. code-block:: python
     # Specify the maximum number of frames per video to be 4. This can be changed.

From 7bda55e628a4dd33ac4c0f0e2acbbedff1eaaa96 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 8 Oct 2024 20:50:12 +0530
Subject: [PATCH 6/7] Update docs/source/models/vlm.rst

Co-authored-by: Cyrus Leung
---
 docs/source/models/vlm.rst | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 9727e0cdddbf2..c4795c478f6d0 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -143,16 +143,19 @@ Multi-image input can be extended to perform video captioning. We show this with
     # Create the request payload.
     video_frames = ... # load your video making sure it only has the number of frames specified earlier.
-    messages = [
-        {"role": "user", "content": [{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}]}
-    ]
+    message = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
+        ],
+    }
     for i in range(len(video_frames)):
         base64_image = encode_image(video_frames[i]) # base64 encoding.
         new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
-        messages[0]["content"].append(new_image)
+        message["content"].append(new_image)
 
     # Perform inference and log output.
-    outputs = llm.chat(messages)
+    outputs = llm.chat([message])
 
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)

From 263f59aa5eba1db72ce85344f243c7c290cfcd34 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 8 Oct 2024 21:06:20 +0530
Subject: [PATCH 7/7] Update docs/source/models/vlm.rst

Co-authored-by: Cyrus Leung
---
 docs/source/models/vlm.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index c4795c478f6d0..45316fd34a5d2 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -138,6 +138,7 @@ A code example can be found in `examples/offline_inference_vision_language_multi
 
 Multi-image input can be extended to perform video captioning. We show this with `Qwen2-VL <https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct>`_ as it supports videos:
 
 .. code-block:: python
+
     # Specify the maximum number of frames per video to be 4. This can be changed.
     llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
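
For reference, here is the example as it reads once all seven patches are applied, assembled into one self-contained script. This is only a sketch: the docs leave the frame loading and the ``encode_image`` helper undefined, so the OpenCV-based ``sample_frames`` and ``encode_image`` below, and the ``video.mp4`` path, are illustrative stand-ins rather than part of vLLM's API:

.. code-block:: python

    import base64

    import cv2  # assumption: OpenCV is just one way to sample frames from a file
    from vllm import LLM


    def encode_image(frame) -> str:
        # Stand-in for the helper the docs leave undefined: JPEG-encode a
        # frame (a NumPy array) and return it as a base64 string.
        ok, buf = cv2.imencode(".jpg", frame)
        if not ok:
            raise ValueError("JPEG encoding failed")
        return base64.b64encode(buf.tobytes()).decode("utf-8")


    def sample_frames(path: str, num_frames: int = 4):
        # Illustrative loader: uniformly sample `num_frames` frames from a video.
        cap = cv2.VideoCapture(path)
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        frames = []
        for i in range(num_frames):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i * total // num_frames)
            ok, frame = cap.read()
            if ok:
                frames.append(frame)
        cap.release()
        return frames


    # Specify the maximum number of frames per video to be 4. This can be changed.
    llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})

    # Create the request payload: one text part, then one image part per frame.
    video_frames = sample_frames("video.mp4", num_frames=4)
    message = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
        ],
    }
    for i in range(len(video_frames)):
        base64_image = encode_image(video_frames[i])  # base64 encoding.
        new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
        message["content"].append(new_image)

    # Perform inference and log output.
    outputs = llm.chat([message])

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)

Keeping the number of sampled frames in sync with ``limit_mm_per_prompt`` matters: vLLM rejects prompts that carry more multi-modal items than the configured per-prompt budget.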