From 723d7275026d2c6877e0555af7deca266dd5e6fb Mon Sep 17 00:00:00 2001
From: zifeitong <zifeitong@gmail.com>
Date: Wed, 21 Aug 2024 09:40:20 -0700
Subject: [PATCH] Iterate repeat_count with enumerate in vllm/multimodal/image.py; use [INST] prompts in LLaVA-NeXT tests

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 tests/models/test_llava_next.py | 17 ++++++-----------
 vllm/multimodal/image.py        |  4 ++--
 2 files changed, 8 insertions(+), 13 deletions(-)
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index e03ad115b93bb..9640e827b4151 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -14,16 +14,11 @@
 
 _LIMIT_IMAGE_PER_PROMPT = 4
 
-_PREFACE = (
-    "A chat between a curious human and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the human's "
-    "questions.")
-
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
     "stop_sign":
-    f"{_PREFACE} USER: <image>\nWhat's the content of the image? ASSISTANT:",
+    "[INST] <image>\nWhat's the content of the image? [/INST]",
     "cherry_blossom":
-    f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:",
+    "[INST] <image>\nWhat is the season? [/INST]",
 })
 
 models = ["llava-hf/llava-v1.6-mistral-7b-hf"]
@@ -256,10 +251,10 @@ def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
 
     inputs = [(
         [
-            f"{_PREFACE} USER: <image><image>\nDescribe the 2 images. ASSISTANT:",  # noqa: E501
-            f"{_PREFACE} USER: <image><image>\nDescribe the 2 images. ASSISTANT:",  # noqa: E501
-            f"{_PREFACE} USER: <image><image><image><image>\nDescribe the 4 images. ASSISTANT:",  # noqa: E501
-            f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:"
+            "[INST] <image><image>\nDescribe 2 images. [/INST]",
+            "[INST] <image><image>\nDescribe 2 images. [/INST]",
+            "[INST] <image><image><image><image>\nDescribe 4 images. [/INST]",
+            "[INST] <image>\nWhat is the season? [/INST]"
         ],
         [
             [stop_sign, cherry_blossom],
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 4dd6d42f24051..a91d93494f0db 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -77,11 +77,11 @@ def repeat_and_pad_image_tokens(
         prompt_parts = prompt.split(image_token_str,
                                     maxsplit=len(repeat_count))
         new_prompt = ""
-        for i in range(len(repeat_count)):
+        for i, repeat_count_item in enumerate(repeat_count):
             replacement_str = "".join(
                 repeat_and_pad_token(
                     image_token_str,
-                    repeat_count=repeat_count[i],
+                    repeat_count=repeat_count_item,
                     pad_token_left=pad_token_str_left,
                     pad_token_right=pad_token_str_right,
                 ))