forked from vllm-project/vllm
[Misc][VLM][Doc] Consolidate offline examples for vision language models (vllm-project#6858)
Co-authored-by: Cyrus Leung <[email protected]>
1 parent 0a06e4c · commit ab4258e
Showing 7 changed files with 174 additions and 212 deletions.
Six of the seven changed files were deleted; their contents are not shown. The seventh is a new file:
@@ -0,0 +1,174 @@
""" | ||
This example shows how to use vLLM for running offline inference | ||
with the correct prompt format on vision language models. | ||
For most models, the prompt format should follow corresponding examples | ||
on HuggingFace model repository. | ||
""" | ||
from transformers import AutoTokenizer | ||
|
||
from vllm import LLM, SamplingParams | ||
from vllm.assets.image import ImageAsset | ||
from vllm.utils import FlexibleArgumentParser | ||

# Input image and question
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
question = "What is the content of this image?"
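# (Note: `ImageAsset` provides a sample image bundled with vLLM, fetched
# and cached on first use, so no local image file is required.)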


# LLaVA-1.5
def run_llava(question):
    prompt = f"USER: <image>\n{question}\nASSISTANT:"
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
    return llm, prompt


# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question):
    prompt = f"[INST] <image>\n{question} [/INST]"
    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
    return llm, prompt


# Fuyu
def run_fuyu(question):
    prompt = f"{question}\n"
    llm = LLM(model="adept/fuyu-8b")
    return llm, prompt


# Phi-3-Vision
def run_phi3v(question):
    prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
    # Note: The default settings of max_num_seqs (256) and
    # max_model_len (128k) for this model may cause OOM.
    # You may lower either to run this example on lower-end GPUs.

    # In this example, we override max_num_seqs to 5 while
    # keeping the original context length of 128k.
    llm = LLM(
        model="microsoft/Phi-3-vision-128k-instruct",
        trust_remote_code=True,
        max_num_seqs=5,
    )
    return llm, prompt


# PaliGemma
def run_paligemma(question):
    prompt = question
    llm = LLM(model="google/paligemma-3b-mix-224")
    return llm, prompt


# Chameleon
def run_chameleon(question):
    prompt = f"{question}<image>"
    llm = LLM(model="facebook/chameleon-7b")
    return llm, prompt


# MiniCPM-V
def run_minicpmv(question):
    # MiniCPM-V 2.0
    # The official repo doesn't work yet, so we need to use a fork for now.
    # For more details, please see:
    # https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630
    # model_name = "HwwwH/MiniCPM-V-2"

    # MiniCPM-V 2.5
    model_name = "openbmb/MiniCPM-Llama3-V-2_5"
    tokenizer = AutoTokenizer.from_pretrained(model_name,
                                              trust_remote_code=True)
    llm = LLM(
        model=model_name,
        trust_remote_code=True,
    )

    # Build the prompt from the model's own chat template rather than
    # hard-coding the format.
    messages = [{
        'role': 'user',
        'content': f'(<image>./</image>)\n{question}'
    }]
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)
    return llm, prompt


model_example_map = {
    "llava": run_llava,
    "llava-next": run_llava_next,
    "fuyu": run_fuyu,
    "phi3_v": run_phi3v,
    "paligemma": run_paligemma,
    "chameleon": run_chameleon,
    "minicpmv": run_minicpmv,
}
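# (Note: each run_* helper returns an (llm, prompt) pair, so main() below
# stays model-agnostic; supporting another model only takes a new helper
# and an entry in this map.)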


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    llm, prompt = model_example_map[model](question)

    # We set temperature to 0.2 so that the outputs can differ even when
    # all prompts are identical during batch inference.
    sampling_params = SamplingParams(temperature=0.2, max_tokens=64)

    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        inputs = {
            "prompt": prompt,
            "multi_modal_data": {
                "image": image
            },
        }
    else:
        # Batch inference
        inputs = [{
            "prompt": prompt,
            "multi_modal_data": {
                "image": image
            },
        } for _ in range(args.num_prompts)]

    outputs = llm.generate(inputs, sampling_params=sampling_params)

    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models')
    parser.add_argument('--model-type',
                        '-m',
                        type=str,
                        default="llava",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument('--num-prompts',
                        type=int,
                        default=1,
                        help='Number of prompts to run.')

    args = parser.parse_args()
    main(args)
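For reference, a typical invocation of this consolidated example (the script's filename is assumed from the commit title, since it is not shown on this page):

    python offline_inference_vision_language.py --model-type llava --num-prompts 4

Passing --num-prompts greater than 1 exercises the batch-inference path above, in which the same prompt/image pair is replicated and each copy is sampled independently.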