[Model] Support E5-V (vllm-project#9576)
Signed-off-by: Sumit Dubey <[email protected]>
DarkLight1337 authored and sumitd2 committed Nov 14, 2024
1 parent f28b5c3 commit a0c8d79
Showing 12 changed files with 532 additions and 90 deletions.
14 changes: 14 additions & 0 deletions docs/source/models/supported_models.rst
@@ -334,6 +334,14 @@ The following modalities are supported depending on the model:
- **V**\ ideo
- **A**\ udio

Any combination of modalities joined by :code:`+` is supported.

- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs.

On the other hand, modalities separated by :code:`/` are mutually exclusive.

- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.

.. _supported_vlms:

Text Generation
@@ -484,6 +492,12 @@ Multimodal Embedding
    - Example HF Models
    - :ref:`LoRA <lora>`
    - :ref:`PP <distributed_serving>`
  * - :code:`LlavaNextForConditionalGeneration`
    - LLaVA-NeXT-based
    - T / I
    - :code:`royokong/e5-v`
    -
    - ✅︎
  * - :code:`Phi3VForCausalLM`
    - Phi-3-Vision-based
    - T + I
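
For quick reference, the new table row above corresponds to the E5-V example added later in this commit. The sketch below shows how the model might be served for embeddings; the Llama-3 template and the one-word-summary prompt are copied from run_e5_v in the updated example script, and the "T / I" marking means a prompt carries either text or an image, not both.

from vllm import LLM
from vllm.assets.image import ImageAsset

# Template and prompt wording taken from the example added in this commit.
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

llm = LLM(model="royokong/e5-v", task="embedding", max_model_len=4096)

# Image-only query; a text-only query works the same way, just without
# multi_modal_data.
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = llama3_template.format("<image>\nSummary above image in one word: ")

outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})
print(len(outputs[0].outputs.embedding))  # embedding dimensionality
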
6 changes: 3 additions & 3 deletions examples/offline_inference_vision_language.py
@@ -1,6 +1,6 @@
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on vision language models.
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
@@ -450,7 +450,7 @@ def main(args):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models')
'vision language models for text generation')
parser.add_argument('--model-type',
'-m',
type=str,
190 changes: 169 additions & 21 deletions examples/offline_inference_vision_language_embedding.py
@@ -1,22 +1,170 @@
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args

from PIL.Image import Image

from vllm import LLM
from vllm.assets.image import ImageAsset

image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501

# Create an LLM.
llm = LLM(
    model="TIGER-Lab/VLM2Vec-Full",
    task="embedding",
    trust_remote_code=True,
    max_model_len=4096,
    max_num_seqs=2,
    mm_processor_kwargs={"num_crops": 16},
)

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})

# Print the outputs.
for output in outputs:
    print(output.outputs.embedding)  # list of 3072 floats
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser


class TextQuery(TypedDict):
    modality: Literal["text"]
    text: str


class ImageQuery(TypedDict):
    modality: Literal["image"]
    image: Image


class TextImageQuery(TypedDict):
    modality: Literal["text+image"]
    text: str
    image: Image


QueryModality = Literal["text", "image", "text+image"]
Query = Union[TextQuery, ImageQuery, TextImageQuery]


class ModelRequestData(NamedTuple):
    llm: LLM
    prompt: str
    image: Optional[Image]


def run_e5_v(query: Query):
    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

    if query["modality"] == "text":
        text = query["text"]
        prompt = llama3_template.format(
            f"{text}\nSummary above sentence in one word: ")
        image = None
    elif query["modality"] == "image":
        prompt = llama3_template.format(
            "<image>\nSummary above image in one word: ")
        image = query["image"]
    else:
        modality = query['modality']
        raise ValueError(f"Unsupported query modality: '{modality}'")

    llm = LLM(
        model="royokong/e5-v",
        task="embedding",
        max_model_len=4096,
    )

    return ModelRequestData(
        llm=llm,
        prompt=prompt,
        image=image,
    )


def run_vlm2vec(query: Query):
    if query["modality"] == "text":
        text = query["text"]
        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
        image = None
    elif query["modality"] == "image":
        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
        image = query["image"]
    elif query["modality"] == "text+image":
        text = query["text"]
        prompt = f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
        image = query["image"]
    else:
        modality = query['modality']
        raise ValueError(f"Unsupported query modality: '{modality}'")

    llm = LLM(
        model="TIGER-Lab/VLM2Vec-Full",
        task="embedding",
        trust_remote_code=True,
        mm_processor_kwargs={"num_crops": 4},
    )

    return ModelRequestData(
        llm=llm,
        prompt=prompt,
        image=image,
    )


def get_query(modality: QueryModality):
    if modality == "text":
        return TextQuery(modality="text", text="A dog sitting in the grass")

    if modality == "image":
        return ImageQuery(
            modality="image",
            image=fetch_image(
                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
            ),
        )

    if modality == "text+image":
        return TextImageQuery(
            modality="text+image",
            text="A cat standing in the snow.",
            image=fetch_image(
                "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg"  # noqa: E501
            ),
        )

    msg = f"Modality {modality} is not supported."
    raise ValueError(msg)


def run_encode(model: str, modality: QueryModality):
    query = get_query(modality)
    req_data = model_example_map[model](query)

    mm_data = {}
    if req_data.image is not None:
        mm_data["image"] = req_data.image

    outputs = req_data.llm.encode({
        "prompt": req_data.prompt,
        "multi_modal_data": mm_data,
    })

    for output in outputs:
        print(output.outputs.embedding)


def main(args: Namespace):
    run_encode(args.model_name, args.modality)


model_example_map = {
    "e5_v": run_e5_v,
    "vlm2vec": run_vlm2vec,
}

if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for multimodal embedding')
    parser.add_argument('--model-name',
                        '-m',
                        type=str,
                        default="vlm2vec",
                        choices=model_example_map.keys(),
                        help='The name of the embedding model.')
    parser.add_argument('--modality',
                        type=str,
                        default="image",
                        choices=get_args(QueryModality),
                        help='Modality of the input.')
    args = parser.parse_args()
    main(args)
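
The script is driven from the command line, e.g. python examples/offline_inference_vision_language_embedding.py --model-name e5_v --modality image, and only prints the raw embedding vectors. The sketch below shows one way the text and image embeddings produced by E5-V might be compared for cross-modal retrieval; the cosine-similarity step is illustrative and is not part of this commit.

import torch
import torch.nn.functional as F

from vllm import LLM
from vllm.multimodal.utils import fetch_image

llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

llm = LLM(model="royokong/e5-v", task="embedding", max_model_len=4096)

# One modality per prompt: embed a caption and an image separately.
text_prompt = llama3_template.format(
    "A dog sitting in the grass\nSummary above sentence in one word: ")
image_prompt = llama3_template.format(
    "<image>\nSummary above image in one word: ")
image = fetch_image(
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
)

text_out = llm.encode({"prompt": text_prompt, "multi_modal_data": {}})
image_out = llm.encode({
    "prompt": image_prompt,
    "multi_modal_data": {"image": image},
})

# Higher cosine similarity means the caption and the image match more closely.
sim = F.cosine_similarity(torch.tensor(text_out[0].outputs.embedding),
                          torch.tensor(image_out[0].outputs.embedding),
                          dim=0)
print(f"text-image cosine similarity: {sim.item():.3f}")
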
7 changes: 4 additions & 3 deletions examples/offline_inference_vision_language_multi_image.py
@@ -1,7 +1,7 @@
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models, using the chat template defined
by the model.
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
from argparse import Namespace
from typing import List, NamedTuple, Optional
@@ -334,7 +334,8 @@ def main(args: Namespace):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models that support multi-image input')
'vision language models that support multi-image input for text '
'generation')
parser.add_argument('--model-type',
'-m',
type=str,
60 changes: 36 additions & 24 deletions tests/conftest.py
@@ -43,10 +43,12 @@
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
List[List[Tuple[np.ndarray, int]]]]
PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
_M = TypeVar("_M")
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]


def _read_prompts(filename: str) -> List[str]:
@@ -318,12 +320,12 @@ def get_inputs(
"text": prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
if videos is not None and videos[i] is not None:
processor_kwargs["videos"] = videos[i]
if audios is not None and audios[i] is not None:
audio, sr = audios[i]
if images is not None and (image := images[i]) is not None:
processor_kwargs["images"] = image
if videos is not None and (video := videos[i]) is not None:
processor_kwargs["videos"] = video
if audios is not None and (audio_tuple := audios[i]) is not None:
audio, sr = audio_tuple
processor_kwargs["audio"] = audio
processor_kwargs["sampling_rate"] = sr

@@ -338,7 +340,7 @@ def generate(
self,
prompts: List[str],
images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]:
@@ -368,7 +370,7 @@ def generate_greedy(
prompts: List[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str]]:
@@ -409,7 +411,7 @@ def generate_greedy_logprobs(
prompts: List[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[List[torch.Tensor]]:
@@ -488,7 +490,7 @@ def generate_greedy_logprobs_limit(
num_logprobs: int,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
**kwargs: Any,
) -> List[TokensTextLogprobs]:
all_inputs = self.get_inputs(prompts,
@@ -657,15 +659,18 @@ def get_inputs(
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = {"image": image}
if image is not None:
inputs[i]["multi_modal_data"] = {"image": image}

if videos is not None:
for i, video in enumerate(videos):
inputs[i]["multi_modal_data"] = {"video": video}
if video is not None:
inputs[i]["multi_modal_data"] = {"video": video}

if audios is not None:
for i, audio in enumerate(audios):
inputs[i]["multi_modal_data"] = {"audio": audio}
if audio is not None:
inputs[i]["multi_modal_data"] = {"audio": audio}

return inputs

@@ -837,13 +842,20 @@ def generate_beam_search(
returned_outputs.append((token_ids, texts))
return returned_outputs

    def encode(self, prompts: List[str]) -> List[List[float]]:
        req_outputs = self.model.encode(prompts)
        outputs = []
        for req_output in req_outputs:
            embedding = req_output.outputs.embedding
            outputs.append(embedding)
        return outputs
    def encode(
        self,
        prompts: List[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> List[List[float]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.encode(inputs)
        return [req_output.outputs.embedding for req_output in req_outputs]

def __enter__(self):
return self
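
These runner changes let model-correctness tests feed images (and other modalities) straight through encode() on the vLLM side. Below is a hedged sketch of a possible call site: the vllm_runner fixture, its constructor arguments, and the example_* names are assumptions modeled on vLLM's test conventions and are not part of this diff; only the extended encode(prompts, images=...) signature comes from the code above.

# Hypothetical test body; only encode(prompts, images=...) is confirmed by the
# diff above. Fixture names and constructor arguments are assumptions.
def test_multimodal_embedding(vllm_runner, example_prompts, example_images):
    with vllm_runner("royokong/e5-v", task="embedding",
                     max_model_len=4096) as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts,
                                         images=example_images)

    # One flat list of floats per prompt.
    assert len(vllm_outputs) == len(example_prompts)
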
3 changes: 2 additions & 1 deletion tests/models/embedding/utils.py
@@ -16,7 +16,8 @@ def check_embeddings_close(

for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
zip(embeddings_0_lst, embeddings_1_lst)):
assert len(embeddings_0) == len(embeddings_1)
assert len(embeddings_0) == len(embeddings_1), (
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")

sim = F.cosine_similarity(torch.tensor(embeddings_0),
torch.tensor(embeddings_1),
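
For context, a backend-comparison test would typically push the two sets of embeddings through this helper. A hedged sketch of such a call follows; only the embeddings_0_lst / embeddings_1_lst parameters are visible in this diff, and the variable names are illustrative.

# Hypothetical call site: hf_outputs and vllm_outputs are lists of per-prompt
# embedding vectors returned by the runners' encode() methods.
check_embeddings_close(
    embeddings_0_lst=hf_outputs,
    embeddings_1_lst=vllm_outputs,
)
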