[Model] Support E5-V (vllm-project#9576)
Signed-off-by: Sumit Dubey <[email protected]>
DarkLight1337 authored and sumitd2 committed Nov 14, 2024
1 parent f28b5c3 commit a0c8d79
Showing 12 changed files with 532 additions and 90 deletions.
14 changes: 14 additions & 0 deletions docs/source/models/supported_models.rst
@@ -334,6 +334,14 @@ The following modalities are supported depending on the model:
- **V**\ ideo
- **A**\ udio

Any combination of modalities joined by :code:`+` is supported.

- e.g.: :code:`T + I` means that the model supports text-only, image-only, and text-with-image inputs.

On the other hand, modalities separated by :code:`/` are mutually exclusive.

- e.g.: :code:`T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.

.. _supported_vlms:

Text Generation
@@ -484,6 +492,12 @@ Multimodal Embedding
    - Example HF Models
    - :ref:`LoRA <lora>`
    - :ref:`PP <distributed_serving>`
  * - :code:`LlavaNextForConditionalGeneration`
    - LLaVA-NeXT-based
    - T / I
    - :code:`royokong/e5-v`
    -
    - ✅︎
  * - :code:`Phi3VForCausalLM`
    - Phi-3-Vision-based
    - T + I
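
For quick reference, the new table row above corresponds to the E5-V example added later in this commit. The sketch below shows how the model might be served for embeddings; the Llama-3 template and the one-word-summary prompt are copied from run_e5_v in the updated example script, and the "T / I" marking means a prompt carries either text or an image, not both.

from vllm import LLM
from vllm.assets.image import ImageAsset

# Template and prompt wording taken from the example added in this commit.
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

llm = LLM(model="royokong/e5-v", task="embedding", max_model_len=4096)

# Image-only query; a text-only query works the same way, just without
# multi_modal_data.
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = llama3_template.format("<image>\nSummary above image in one word: ")

outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})
print(len(outputs[0].outputs.embedding))  # embedding dimensionality
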
6 changes: 3 additions & 3 deletions examples/offline_inference_vision_language.py
@@ -1,6 +1,6 @@
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on vision language models.
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for text generation.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
@@ -450,7 +450,7 @@ def main(args):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models')
'vision language models for text generation')
parser.add_argument('--model-type',
'-m',
type=str,
190 changes: 169 additions & 21 deletions examples/offline_inference_vision_language_embedding.py
@@ -1,22 +1,170 @@
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args

from PIL.Image import Image

from vllm import LLM
from vllm.assets.image import ImageAsset

image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
prompt = "<|image_1|> Represent the given image with the following question: What is in the image" # noqa: E501

# Create an LLM.
llm = LLM(
    model="TIGER-Lab/VLM2Vec-Full",
    task="embedding",
    trust_remote_code=True,
    max_model_len=4096,
    max_num_seqs=2,
    mm_processor_kwargs={"num_crops": 16},
)

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = llm.encode({"prompt": prompt, "multi_modal_data": {"image": image}})

# Print the outputs.
for output in outputs:
    print(output.outputs.embedding)  # list of 3072 floats
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser


class TextQuery(TypedDict):
    modality: Literal["text"]
    text: str


class ImageQuery(TypedDict):
    modality: Literal["image"]
    image: Image


class TextImageQuery(TypedDict):
    modality: Literal["text+image"]
    text: str
    image: Image


QueryModality = Literal["text", "image", "text+image"]
Query = Union[TextQuery, ImageQuery, TextImageQuery]


class ModelRequestData(NamedTuple):
    llm: LLM
    prompt: str
    image: Optional[Image]


def run_e5_v(query: Query):
    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

    if query["modality"] == "text":
        text = query["text"]
        prompt = llama3_template.format(
            f"{text}\nSummary above sentence in one word: ")
        image = None
    elif query["modality"] == "image":
        prompt = llama3_template.format(
            "<image>\nSummary above image in one word: ")
        image = query["image"]
    else:
        modality = query['modality']
        raise ValueError(f"Unsupported query modality: '{modality}'")

    llm = LLM(
        model="royokong/e5-v",
        task="embedding",
        max_model_len=4096,
    )

    return ModelRequestData(
        llm=llm,
        prompt=prompt,
        image=image,
    )


def run_vlm2vec(query: Query):
    if query["modality"] == "text":
        text = query["text"]
        prompt = f"Find me an everyday image that matches the given caption: {text}"  # noqa: E501
        image = None
    elif query["modality"] == "image":
        prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image."  # noqa: E501
        image = query["image"]
    elif query["modality"] == "text+image":
        text = query["text"]
        prompt = f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
        image = query["image"]
    else:
        modality = query['modality']
        raise ValueError(f"Unsupported query modality: '{modality}'")

    llm = LLM(
        model="TIGER-Lab/VLM2Vec-Full",
        task="embedding",
        trust_remote_code=True,
        mm_processor_kwargs={"num_crops": 4},
    )

    return ModelRequestData(
        llm=llm,
        prompt=prompt,
        image=image,
    )


def get_query(modality: QueryModality):
    if modality == "text":
        return TextQuery(modality="text", text="A dog sitting in the grass")

    if modality == "image":
        return ImageQuery(
            modality="image",
            image=fetch_image(
                "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
            ),
        )

    if modality == "text+image":
        return TextImageQuery(
            modality="text+image",
            text="A cat standing in the snow.",
            image=fetch_image(
                "https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg"  # noqa: E501
            ),
        )

    msg = f"Modality {modality} is not supported."
    raise ValueError(msg)


def run_encode(model: str, modality: QueryModality):
    query = get_query(modality)
    req_data = model_example_map[model](query)

    mm_data = {}
    if req_data.image is not None:
        mm_data["image"] = req_data.image

    outputs = req_data.llm.encode({
        "prompt": req_data.prompt,
        "multi_modal_data": mm_data,
    })

    for output in outputs:
        print(output.outputs.embedding)


def main(args: Namespace):
    run_encode(args.model_name, args.modality)


model_example_map = {
    "e5_v": run_e5_v,
    "vlm2vec": run_vlm2vec,
}

if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for multimodal embedding')
    parser.add_argument('--model-name',
                        '-m',
                        type=str,
                        default="vlm2vec",
                        choices=model_example_map.keys(),
                        help='The name of the embedding model.')
    parser.add_argument('--modality',
                        type=str,
                        default="image",
                        choices=get_args(QueryModality),
                        help='Modality of the input.')
    args = parser.parse_args()
    main(args)
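
The script is driven from the command line, e.g. python examples/offline_inference_vision_language_embedding.py --model-name e5_v --modality image, and only prints the raw embedding vectors. The sketch below shows one way the text and image embeddings produced by E5-V might be compared for cross-modal retrieval; the cosine-similarity step is illustrative and is not part of this commit.

import torch
import torch.nn.functional as F

from vllm import LLM
from vllm.multimodal.utils import fetch_image

llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501

llm = LLM(model="royokong/e5-v", task="embedding", max_model_len=4096)

# One modality per prompt: embed a caption and an image separately.
text_prompt = llama3_template.format(
    "A dog sitting in the grass\nSummary above sentence in one word: ")
image_prompt = llama3_template.format(
    "<image>\nSummary above image in one word: ")
image = fetch_image(
    "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg"  # noqa: E501
)

text_out = llm.encode({"prompt": text_prompt, "multi_modal_data": {}})
image_out = llm.encode({
    "prompt": image_prompt,
    "multi_modal_data": {"image": image},
})

# Higher cosine similarity means the caption and the image match more closely.
sim = F.cosine_similarity(torch.tensor(text_out[0].outputs.embedding),
                          torch.tensor(image_out[0].outputs.embedding),
                          dim=0)
print(f"text-image cosine similarity: {sim.item():.3f}")
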
7 changes: 4 additions & 3 deletions examples/offline_inference_vision_language_multi_image.py
@@ -1,7 +1,7 @@
"""
This example shows how to use vLLM for running offline inference with
multi-image input on vision language models, using the chat template defined
by the model.
multi-image input on vision language models for text generation,
using the chat template defined by the model.
"""
from argparse import Namespace
from typing import List, NamedTuple, Optional
@@ -334,7 +334,8 @@ def main(args: Namespace):
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models that support multi-image input')
'vision language models that support multi-image input for text '
'generation')
parser.add_argument('--model-type',
'-m',
type=str,
60 changes: 36 additions & 24 deletions tests/conftest.py
@@ -43,10 +43,12 @@
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]

PromptImageInput = Union[List[Image.Image], List[List[Image.Image]]]
PromptAudioInput = Union[List[Tuple[np.ndarray, int]],
List[List[Tuple[np.ndarray, int]]]]
PromptVideoInput = Union[List[np.ndarray], List[List[np.ndarray]]]
_M = TypeVar("_M")
_PromptMultiModalInput = Union[List[_M], List[List[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
PromptVideoInput = _PromptMultiModalInput[np.ndarray]


def _read_prompts(filename: str) -> List[str]:
@@ -318,12 +320,12 @@ def get_inputs(
"text": prompt,
"return_tensors": "pt",
}
if images is not None and images[i] is not None:
processor_kwargs["images"] = images[i]
if videos is not None and videos[i] is not None:
processor_kwargs["videos"] = videos[i]
if audios is not None and audios[i] is not None:
audio, sr = audios[i]
if images is not None and (image := images[i]) is not None:
processor_kwargs["images"] = image
if videos is not None and (video := videos[i]) is not None:
processor_kwargs["videos"] = video
if audios is not None and (audio_tuple := audios[i]) is not None:
audio, sr = audio_tuple
processor_kwargs["audio"] = audio
processor_kwargs["sampling_rate"] = sr

@@ -338,7 +340,7 @@ def generate(
self,
prompts: List[str],
images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[List[int]], List[str]]]:
@@ -368,7 +370,7 @@ def generate_greedy(
prompts: List[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[Tuple[List[int], str]]:
@@ -409,7 +411,7 @@ def generate_greedy_logprobs(
prompts: List[str],
max_tokens: int,
images: Optional[PromptImageInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
audios: Optional[PromptAudioInput] = None,
**kwargs: Any,
) -> List[List[torch.Tensor]]:
@@ -488,7 +490,7 @@ def generate_greedy_logprobs_limit(
num_logprobs: int,
images: Optional[PromptImageInput] = None,
audios: Optional[PromptAudioInput] = None,
videos: Optional[List[np.ndarray]] = None,
videos: Optional[PromptVideoInput] = None,
**kwargs: Any,
) -> List[TokensTextLogprobs]:
all_inputs = self.get_inputs(prompts,
@@ -657,15 +659,18 @@ def get_inputs(
inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
if images is not None:
for i, image in enumerate(images):
inputs[i]["multi_modal_data"] = {"image": image}
if image is not None:
inputs[i]["multi_modal_data"] = {"image": image}

if videos is not None:
for i, video in enumerate(videos):
inputs[i]["multi_modal_data"] = {"video": video}
if video is not None:
inputs[i]["multi_modal_data"] = {"video": video}

if audios is not None:
for i, audio in enumerate(audios):
inputs[i]["multi_modal_data"] = {"audio": audio}
if audio is not None:
inputs[i]["multi_modal_data"] = {"audio": audio}

return inputs

@@ -837,13 +842,20 @@ def generate_beam_search(
returned_outputs.append((token_ids, texts))
return returned_outputs

    def encode(self, prompts: List[str]) -> List[List[float]]:
        req_outputs = self.model.encode(prompts)
        outputs = []
        for req_output in req_outputs:
            embedding = req_output.outputs.embedding
            outputs.append(embedding)
        return outputs
    def encode(
        self,
        prompts: List[str],
        images: Optional[PromptImageInput] = None,
        videos: Optional[PromptVideoInput] = None,
        audios: Optional[PromptAudioInput] = None,
    ) -> List[List[float]]:
        inputs = self.get_inputs(prompts,
                                 images=images,
                                 videos=videos,
                                 audios=audios)

        req_outputs = self.model.encode(inputs)
        return [req_output.outputs.embedding for req_output in req_outputs]

def __enter__(self):
return self
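
These runner changes let model-correctness tests feed images (and other modalities) straight through encode() on the vLLM side. Below is a hedged sketch of a possible call site: the vllm_runner fixture, its constructor arguments, and the example_* names are assumptions modeled on vLLM's test conventions and are not part of this diff; only the extended encode(prompts, images=...) signature comes from the code above.

# Hypothetical test body; only encode(prompts, images=...) is confirmed by the
# diff above. Fixture names and constructor arguments are assumptions.
def test_multimodal_embedding(vllm_runner, example_prompts, example_images):
    with vllm_runner("royokong/e5-v", task="embedding",
                     max_model_len=4096) as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts,
                                         images=example_images)

    # One flat list of floats per prompt.
    assert len(vllm_outputs) == len(example_prompts)
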
3 changes: 2 additions & 1 deletion tests/models/embedding/utils.py
@@ -16,7 +16,8 @@ def check_embeddings_close(

for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
zip(embeddings_0_lst, embeddings_1_lst)):
assert len(embeddings_0) == len(embeddings_1)
assert len(embeddings_0) == len(embeddings_1), (
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")

sim = F.cosine_similarity(torch.tensor(embeddings_0),
torch.tensor(embeddings_1),
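
For context, a backend-comparison test would typically push the two sets of embeddings through this helper. A hedged sketch of such a call follows; only the embeddings_0_lst / embeddings_1_lst parameters are visible in this diff, and the variable names are illustrative.

# Hypothetical call site: hf_outputs and vllm_outputs are lists of per-prompt
# embedding vectors returned by the runners' encode() methods.
check_embeddings_close(
    embeddings_0_lst=hf_outputs,
    embeddings_1_lst=vllm_outputs,
)
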