diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py
index 0833a5fe6a6d5..daed2979fdb16 100644
--- a/vllm/model_executor/models/ultravox.py
+++ b/vllm/model_executor/models/ultravox.py
@@ -106,6 +106,12 @@ def _call_hf_processor(
         audios = mm_data.pop("audios", [])
 
         if not audios:
+            if not mm_data:
+                # Text-only input not supported in composite processor
+                prompt_ids = self._get_tokenizer().encode(prompt)
+                return BatchFeature(dict(input_ids=[prompt_ids]),
+                                    tensor_type="pt")
+
             return super()._call_hf_processor(
                 prompt=prompt,
                 mm_data=mm_data,
@@ -153,6 +159,7 @@ def _get_mm_field_tags(
     ) -> Mapping[str, MultiModalFieldTag]:
         return dict(
             audio_features=MultiModalFieldTags.indexed("audio"),
+            audio_token_len=MultiModalFieldTags.indexed("audio"),
             audio_embeds=MultiModalFieldTags.indexed("audio"),
         )
 
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index d8f1d65d1115f..3780e764a18ec 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -803,14 +803,14 @@ def _call_hf_processor(
 
     def _apply_hf_processor(
         self,
-        prompt: str,
+        prompt_text: str,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
         processor_data, passthrough_data = self._get_hf_mm_data(mm_items)
 
         processed_data = self._call_hf_processor(
-            prompt=prompt,
+            prompt=prompt_text,
             mm_data=processor_data,
             mm_kwargs=hf_processor_mm_kwargs,
         )
@@ -827,7 +827,7 @@ def _apply_hf_processor(
 
     def _cached_apply_hf_processor(
         self,
-        prompt: str,
+        prompt_text: str,
         mm_items: MultiModalDataItems,
         hf_processor_mm_kwargs: Mapping[str, object],
     ) -> tuple[list[int], MultiModalKwargs]:
@@ -835,7 +835,7 @@ def _cached_apply_hf_processor(
 
         if cache is None:
             return self._apply_hf_processor(
-                prompt=prompt,
+                prompt_text=prompt_text,
                 mm_items=mm_items,
                 hf_processor_mm_kwargs=hf_processor_mm_kwargs,
             )
@@ -860,15 +860,17 @@ def _cached_apply_hf_processor(
 
         # Rely on our placeholder replacement logic instead of HF
         # to insert the placeholder tokens
-        prompt_ids = _encode(self._get_tokenizer(),
-                             prompt,
-                             add_special_tokens=True)
-
-        _, mm_missing_kwargs = self._apply_hf_processor(
-            prompt=prompt,
-            mm_items=mm_missing_items,
-            hf_processor_mm_kwargs=hf_processor_mm_kwargs,
-        )
+        prompt_ids = self._get_tokenizer().encode(prompt_text)
+
+        if mm_missing_items:
+            _, mm_missing_kwargs = self._apply_hf_processor(
+                prompt_text=prompt_text,
+                mm_items=mm_missing_items,
+                hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+            )
+        else:
+            # Avoid unnecessary tokenization of the prompt text
+            mm_missing_kwargs = MultiModalKwargs({})
 
         mm_missing_next_idx = {modality: 0 for modality in mm_missing_items}
 