Update

Signed-off-by: DarkLight1337 <[email protected]>
vllm-project · Dec 19, 2024 · 32e5197 · 32e5197
1 parent c3f1bde
commit 32e5197
Showing 1 changed file with 17 additions and 11 deletions.
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
@@ -599,8 +599,8 @@ def __init__(self, capacity: int) -> None:
         # DEBUG: Set to None to disable
         self.debug_cache_hit_ratio_steps: Optional[int] = None
 
-        self._text_cache = LRUCache[str, BatchFeature](capacity)
-        self._mm_cache = LRUCache[str, BatchFeature](capacity)
+        self._fine_text_cache = LRUCache[str, BatchFeature](capacity)
+        self._fine_mm_cache = LRUCache[str, BatchFeature](capacity)
         self._coarse_cache = LRUCache[str, BatchFeature](capacity)
 
     def maybe_log_cache_stats(self, cache: LRUCache, name: str) -> None:
@@ -674,9 +674,9 @@ def _cached_call_fine(
         for idx in range(num_items):
             mm_item = {k: [v[idx]] for k, v in mm_data.items()}
 
-            self.maybe_log_cache_stats(self._mm_cache, "mm_cache")
+            self.maybe_log_cache_stats(self._fine_mm_cache, "fine_mm_cache")
 
-            processed_mm_item = self._mm_cache.get_or_put(
+            processed_mm_item = self._fine_mm_cache.get_or_put(
                 self._hash_kwargs(**mm_item, **mm_kwargs),
                 default_factory=partial(
                     ctx.call_hf_processor,
@@ -687,15 +687,16 @@ def _cached_call_fine(
             )
 
             for k, v in processed_mm_item.items():
+                # Remove the extra batch dimension
                 processed_mm_items[k].append(v[0])
 
-        # NOTE: Some processors do not accept mm-only input, in which case
-        # we have to fallback to processing `prompt` and `mm_data` together
-        # Therefore, we place the text processing last to avoid redundant
-        # computation
-        self.maybe_log_cache_stats(self._text_cache, "text_cache")
+        # NOTE: Some processors (e.g. llava) do not accept mm-only input,
+        # in which case we have to fallback to processing `prompt` and `mm_data`
+        # together. Therefore, we place the text processing last to avoid
+        # redundant computation
+        self.maybe_log_cache_stats(self._fine_text_cache, "fine_text_cache")
 
-        processed_text = self._text_cache.get_or_put(
+        processed_text = self._fine_text_cache.get_or_put(
             prompt,
             default_factory=partial(
                 ctx.call_hf_processor,
@@ -751,7 +752,12 @@ def call_hf_processor(
                     mm_kwargs,
                 )
             except Exception:
-                pass  # See NOTE in `_cached_call_fine`
+                # Failures are expected; see NOTE in `_cached_call_fine`
+                logger.debug(
+                    "Failed to apply processor on each item separately",
+                    stack_info=True,
+                )
+                pass
 
         return self._cached_call_coarse(
             ctx,