diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index 3633214c17a7a..c5e905731d763 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -6,9 +6,9 @@
 
 from vllm.assets.image import ImageAsset
 from vllm.config import ModelConfig
-from vllm.entrypoints.llm import apply_hf_chat_template
 from vllm.entrypoints.chat_utils import (parse_chat_messages,
                                          parse_chat_messages_futures)
+from vllm.entrypoints.llm import apply_hf_chat_template
 from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import encode_image_base64
 from vllm.transformers_utils.tokenizer_group import TokenizerGroup
@@ -515,27 +515,38 @@ def test_mllama_interleaved_images(
         }]
     }]
 
+
 def test_mllama_parse_matches_hf(
     mllama_model_config,
     mllama_tokenizer,
     image_url,
 ):
-    """Checks end to end correctness of hf allignment for mllama parsing."""
+    """Checks end to end correctness of hf alignment for mllama parsing."""
+
     def get_conversation(is_hf: bool):
         img_part = {"type": "image_url", "image_url": {"url": image_url}}
         if is_hf:
             img_part = {'type': 'image'}
-        return [
-            {
-                'role': 'user', 'content': [
-                    {'type': 'text', 'text': 'The content of the first image is:'},
-                    img_part,
-                    {'type': 'text', 'text': 'The content of the second image is:'},
-                    img_part,
-                    {'type': 'text', 'text': 'What animal is in the first image?'},
-                ]
-            }
-        ]
+        return [{
+            'role':
+            'user',
+            'content': [
+                {
+                    'type': 'text',
+                    'text': 'The content of the first image is:'
+                },
+                img_part,
+                {
+                    'type': 'text',
+                    'text': 'The content of the second image is:'
+                },
+                img_part,
+                {
+                    'type': 'text',
+                    'text': 'What animal is in the first image?'
+                },
+            ]
+        }]
 
     tokenizer = mllama_tokenizer.tokenizer
     # Build and parse a conversation with {"type": "image"} using the tokenizer
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 90e2a9000e0e6..0165f50a8354e 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -410,6 +410,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int],
 _ImageParser = partial(cast, ChatCompletionContentPartImageParam)
 _AudioParser = partial(cast, ChatCompletionContentPartAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
+MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'}
 
 # Define a mapping from part types to their corresponding parsing functions.
 MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = {
@@ -482,44 +483,71 @@ def _parse_chat_message_content_parts(
     parts: Iterable[ChatCompletionContentPartParam],
     mm_tracker: BaseMultiModalItemTracker,
 ) -> List[ConversationMessage]:
-    texts: List[str] = []
+    content = []
 
     mm_parser = mm_tracker.create_parser()
+    keep_multimodal_content = \
+        mm_tracker._model_config.hf_config.model_type in \
+        MODEL_KEEP_MULTI_MODAL_CONTENT
 
     for part in parts:
-        if isinstance(part, str):  # Handle plain text parts
-            text = _TextParser(part)
-            texts.append(text)
-        else:  # Handle structured dictionary parts
-            part_type, content = _parse_chat_message_content_mm_part(part)
-
-            # if part_type is text/refusal/image_url/audio_url but
-            # content is empty, logg a warning and skip
-            if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
-                logger.warning("Skipping multimodal part "
-                               "with empty / unparsable content.")
-                continue
-
-            if part_type in ("text", "refusal"):
-                texts.append(content)
-            elif part_type == "image_url":
-                mm_parser.parse_image(content)
-                # has_image = True
-            elif part_type == "audio_url":
-                mm_parser.parse_audio(content)
-            else:
-                raise NotImplementedError(f"Unknown part type: {part_type}")
-
+        parse_res = _parse_chat_message_content_part(
+            part, mm_parser, wrap_dicts=keep_multimodal_content)
+        if parse_res:
+            content.append(parse_res)
+
+    if keep_multimodal_content:
+        # Parsing wraps images and texts as interleaved dictionaries
+        return [ConversationMessage(role=role,
+                                    content=content)]  # type: ignore
+    texts = cast(List[str], content)
     text_prompt = "\n".join(texts)
     mm_placeholder_counts = mm_parser.mm_placeholder_counts()
     if mm_placeholder_counts:
-        text_prompt = _get_full_multimodal_text_prompt(
-            mm_placeholder_counts,
-            text_prompt,
-        )
+        text_prompt = _get_full_multimodal_text_prompt(mm_placeholder_counts,
+                                                       text_prompt)
 
     return [ConversationMessage(role=role, content=text_prompt)]
 
 
+def _parse_chat_message_content_part(
+        part: ChatCompletionContentPartParam,
+        mm_parser: BaseMultiModalContentParser,
+        wrap_dicts: bool) -> Optional[Union[str, Dict[str, str]]]:
+    """Parses a single part of a conversation. If wrap_dicts is True,
+    structured dictionary pieces for texts and images will be
+    wrapped in dictionaries, i.e., {"type": "text", "text": ...} and
+    {"type": "image"}, respectively. Otherwise multimodal data will be
+    handled by mm_parser, and texts will be returned as strings to be joined
+    with multimodal placeholders.
+    """
+    if isinstance(part, str):  # Handle plain text parts
+        text = _TextParser(part)
+        return text
+    else:  # Handle structured dictionary parts
+        part_type, content = _parse_chat_message_content_mm_part(part)
+
+        # if part_type is text/refusal/image_url/audio_url but
+        # content is empty, log a warning and skip
+        if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content:
+            logger.warning("Skipping multimodal part "
+                           "with empty / unparsable content.")
+            return None
+
+        if part_type in ("text", "refusal"):
+            if wrap_dicts:
+                return {'type': 'text', 'text': content}
+            return content
+        elif part_type == "image_url":
+            mm_parser.parse_image(content)
+            if wrap_dicts:
+                return {'type': 'image'}
+        elif part_type == "audio_url":
+            mm_parser.parse_audio(content)
+        else:
+            raise NotImplementedError(f"Unknown part type: {part_type}")
+        return None
+
+
 # No need to validate using Pydantic again
 _AssistantParser = partial(cast, ChatCompletionAssistantMessageParam)
 _ToolParser = partial(cast, ChatCompletionToolMessageParam)
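For review context, here is a minimal, self-contained sketch of the behavior this patch introduces. `StubMultiModalParser` and `parse_part` below are hypothetical stand-ins, not vLLM's actual classes; they only mirror the `wrap_dicts` branching added in `_parse_chat_message_content_part`, so the two output shapes can be compared side by side.

```python
from typing import Dict, List, Optional, Union

Part = Union[str, Dict[str, str]]


class StubMultiModalParser:
    """Hypothetical stand-in for vLLM's multimodal content parser;
    it only counts images so each code path can be inspected."""

    def __init__(self) -> None:
        self.num_images = 0

    def parse_image(self, image_url: str) -> None:
        self.num_images += 1


def parse_part(part: Part, mm_parser: StubMultiModalParser,
               wrap_dicts: bool) -> Optional[Part]:
    """Mirrors the wrap_dicts branching of the patch (sketch only)."""
    if isinstance(part, str):  # plain text passes through unchanged
        return part
    if part["type"] == "text":
        # wrap_dicts=True keeps HF-style dicts instead of bare strings
        return {"type": "text", "text": part["text"]} if wrap_dicts \
            else part["text"]
    if part["type"] == "image_url":
        mm_parser.parse_image(part["image_url"])
        # Models in MODEL_KEEP_MULTI_MODAL_CONTENT get an inline
        # {"type": "image"} marker; other models return nothing here and
        # rely on placeholder text prepended to the prompt instead
        return {"type": "image"} if wrap_dicts else None
    raise NotImplementedError(f"Unknown part type: {part['type']}")


parts: List[Part] = [
    {"type": "text", "text": "The content of the first image is:"},
    {"type": "image_url", "image_url": "https://example.com/cat.png"},
]

# mllama-style path: interleaved dicts, ready for the HF chat template
mm = StubMultiModalParser()
print([r for p in parts if (r := parse_part(p, mm, True)) is not None])
# -> [{'type': 'text', 'text': '...'}, {'type': 'image'}]  (abbreviated)

# default path: only texts survive as strings; the image is tracked by
# the parser and later surfaces as placeholder text in the prompt
mm = StubMultiModalParser()
print([r for p in parts if (r := parse_part(p, mm, False)) is not None])
# -> ['The content of the first image is:'], with mm.num_images == 1
```

Note that the patch itself filters results with a truthiness check (`if parse_res:`), so empty text parts are dropped as well; the sketch uses an explicit `is not None` only to keep the comprehensions readable.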