diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 0833a5fe6a6d5..daed2979fdb16 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -106,6 +106,12 @@ def _call_hf_processor(
         audios = mm_data.pop("audios", [])
 
         if not audios:
+            if not mm_data:
+                # Text-only input not supported in composite processor
+                prompt_ids = self._get_tokenizer().encode(prompt)
+                return BatchFeature(dict(input_ids=[prompt_ids]),
+                                    tensor_type="pt")
+
             return super()._call_hf_processor(
                 prompt=prompt,
                 mm_data=mm_data,
@@ -153,6 +159,7 @@ def _get_mm_field_tags(
     ) -> Mapping[str, MultiModalFieldTag]:
         return dict(
             audio_features=MultiModalFieldTags.indexed("audio"),
+            audio_token_len=MultiModalFieldTags.indexed("audio"),
             audio_embeds=MultiModalFieldTags.indexed("audio"),
         )
 
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index d8f1d65d1115f..3780e764a18ec 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -803,14 +803,14 @@ def _call_hf_processor(
 
     def _apply_hf_processor(
         self,
-        prompt: str,
+        prompt_text: str,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
         processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
 
         processed_data = self._call_hf_processor(
-            prompt=prompt,
+            prompt=prompt_text,
             mm_data=processor_data,
             mm_kwargs=hf_processor_mm_kwargs,
         )
@@ -827,7 +827,7 @@ def _apply_hf_processor(
 
     def _cached_apply_hf_processor(
         self,
-        prompt: str,
+        prompt_text: str,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
@@ -835,7 +835,7 @@ def _cached_apply_hf_processor(
 
         if cache is None:
             return self._apply_hf_processor(
-                prompt=prompt,
+                prompt_text=prompt_text,
                 mm_items=mm_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             )
@@ -860,15 +860,17 @@ def _cached_apply_hf_processor(
 
         # Rely on our placeholder replacement logic instead of HF
         # to insert the placeholder tokens
-        prompt_ids = _encode(self._get_tokenizer(),
-                             prompt,
-                             add_special_tokens=True)
-
-        _, mm_missing_kwargs = self._apply_hf_processor(
-            prompt=prompt,
-            mm_items=mm_missing_items,
-            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
-        )
+        prompt_ids = self._get_tokenizer().encode(prompt_text)
+
+        if mm_missing_items:
+            _, mm_missing_kwargs = self._apply_hf_processor(
+                prompt_text=prompt_text,
+                mm_items=mm_missing_items,
+                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            )
+        else:
+            # Avoid unnecessary tokenization of the prompt text
+            mm_missing_kwargs = MultiModalKwargs({})
 
         mm_missing_next_idx = {modality: 0 for modality in mm_missing_items}
 