Commit
remove disable arg
alexm-redhat committed Dec 2, 2024
1 parent 4e8dac4 commit f340a9d
Showing 6 changed files with 7 additions and 30 deletions.
1 change: 0 additions & 1 deletion examples/offline_inference_vision_language.py
@@ -26,7 +26,6 @@ def run_llava(question: str, modality: str):
 
     llm = LLM(model="llava-hf/llava-1.5-7b-hf",
               max_model_len=4096)
-              #mm_disable_frontend_processor=True)
 
    stop_token_ids = None
    return llm, prompt, stop_token_ids
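With the flag removed, the example builds the LLaVA model exactly as the context lines show; multi-modal preprocessing always runs in the frontend. A minimal usage sketch of this path (the image file, prompt wording, and sampling settings are illustrative assumptions, not part of the diff):

# Sketch only: there is nothing to disable when building the LLM anymore.
from PIL import Image
from vllm import LLM, SamplingParams

llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)

image = Image.open("example.jpg")  # hypothetical local image
outputs = llm.generate(
    {
        "prompt": "USER: <image>\nWhat is in this picture? ASSISTANT:",
        "multi_modal_data": {"image": image},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)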
10 changes: 2 additions & 8 deletions vllm/config.py
@@ -125,8 +125,6 @@ class ModelConfig:
             HuggingFace config.
         mm_processor_kwargs: Arguments to be forwarded to the model's processor
             for multi-modal data, e.g., image processor.
-        mm_disable_frontend_processor: Disables multi-modal HF preprocessor/mapper
-            execution in the frontend process (not recommended)
         override_neuron_config: Initialize non default neuron config or
             override default neuron config that are specific to Neuron devices,
             this argument will be used to configure the neuron config that
@@ -165,7 +163,6 @@ def __init__(
         config_format: ConfigFormat = ConfigFormat.AUTO,
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-        mm_disable_frontend_processor: bool = False,
         override_neuron_config: Optional[Dict[str, Any]] = None,
         override_pooler_config: Optional["PoolerConfig"] = None) -> None:
         self.model = model
@@ -225,7 +222,6 @@ def __init__(
         self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
         self.use_async_output_proc = use_async_output_proc
         self.mm_processor_kwargs = mm_processor_kwargs
-        self.mm_disable_frontend_processor = mm_disable_frontend_processor
 
         # Set enforce_eager to False if the value is unset.
         if self.enforce_eager is None:
@@ -2397,8 +2393,7 @@ def __str__(self):
                 "decoding_config=%r, observability_config=%r, "
                 "seed=%d, served_model_name=%s, "
                 "num_scheduler_steps=%d, enable_prefix_caching=%s, "
-                "use_async_output_proc=%s, mm_processor_kwargs=%s, "
-                "mm_disable_frontend_processor=%s") % \
+                "use_async_output_proc=%s, mm_processor_kwargs=%s") % \
             (self.model_config.model, self.speculative_config,
              self.model_config.tokenizer,
              self.model_config.skip_tokenizer_init,
@@ -2424,5 +2419,4 @@ def __str__(self):
              self.scheduler_config.num_scheduler_steps,
              self.cache_config.enable_prefix_caching,
              self.model_config.use_async_output_proc,
-             self.model_config.mm_processor_kwargs,
-             self.model_config.mm_disable_frontend_processor)
+             self.model_config.mm_processor_kwargs)
8 changes: 0 additions & 8 deletions vllm/engine/arg_utils.py
@@ -141,7 +141,6 @@ class EngineArgs:
     tokenizer_pool_extra_config: Optional[Dict[str, Any]] = None
     limit_mm_per_prompt: Optional[Mapping[str, int]] = None
     mm_processor_kwargs: Optional[Dict[str, Any]] = None
-    mm_disable_frontend_processor: bool = False
     enable_lora: bool = False
     enable_lora_bias: bool = False
     max_loras: int = 1
@@ -570,12 +569,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             help=('Overrides for the multimodal input mapping/processing, '
                   'e.g., image processor. For example: {"num_crops": 4}.'))
 
-        parser.add_argument(
-            '--mm-disable-frontend-processor',
-            action='store_true',
-            default=EngineArgs.mm_disable_frontend_processor,
-            help="Disable multi-modal frontend processing (not recommended)")
-
         # LoRA related configs
         parser.add_argument('--enable-lora',
                             action='store_true',
@@ -941,7 +934,6 @@ def create_model_config(self) -> ModelConfig:
             use_async_output_proc=not self.disable_async_output_proc,
             config_format=self.config_format,
             mm_processor_kwargs=self.mm_processor_kwargs,
-            mm_disable_frontend_processor=self.mm_disable_frontend_processor,
             override_neuron_config=self.override_neuron_config,
             override_pooler_config=self.override_pooler_config,
         )
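On the CLI and EngineArgs side, only --mm-processor-kwargs remains for influencing frontend multi-modal processing. A small sketch of the resulting surface (the model name is illustrative, and num_crops is just the example value from the help text above, only meaningful for processors that accept it):

# Sketch: EngineArgs no longer defines mm_disable_frontend_processor.
from vllm.engine.arg_utils import EngineArgs

args = EngineArgs(
    model="llava-hf/llava-1.5-7b-hf",
    mm_processor_kwargs={"num_crops": 4},  # example value from the --mm-processor-kwargs help text
)

# Passing the removed field is now rejected at construction time:
#   EngineArgs(model="...", mm_disable_frontend_processor=True)  -> TypeError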
7 changes: 1 addition & 6 deletions vllm/entrypoints/llm.py
@@ -168,7 +168,6 @@ def __init__(
         disable_async_output_proc: bool = False,
         hf_overrides: Optional[HfOverrides] = None,
         mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-        mm_disable_frontend_processor: bool = False,
         # After positional args are removed, move this right below `model`
         task: TaskOption = "auto",
         override_pooler_config: Optional[PoolerConfig] = None,
@@ -214,7 +213,6 @@ def __init__(
             disable_async_output_proc=disable_async_output_proc,
             hf_overrides=hf_overrides,
             mm_processor_kwargs=mm_processor_kwargs,
-            mm_disable_frontend_processor=mm_disable_frontend_processor,
             override_pooler_config=override_pooler_config,
             compilation_config=compilation_config_instance,
             **kwargs,
@@ -545,7 +543,6 @@ def chat(
         continue_final_message: bool = False,
         tools: Optional[List[Dict[str, Any]]] = None,
-        mm_processor_kwargs: Optional[Dict[str, Any]] = None,
-        mm_disable_frontend_processor: bool = False
+        mm_processor_kwargs: Optional[Dict[str, Any]] = None
     ) -> List[RequestOutput]:
         """
         Generate responses for a chat conversation.
@@ -587,9 +584,7 @@ def chat(
             ``True`` if ``add_generation_prompt`` is also ``True``.
             mm_processor_kwargs: Multimodal processor kwarg overrides for this
                 chat request. Only used for offline requests.
-            mm_disable_frontend_processor: Disable multi-modal frontend
-                processing (not recommended)
 
         Returns:
             A list of ``RequestOutput`` objects containing the generated
             responses in the same order as the input messages.
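LLM() and LLM.chat() lose the keyword as well; per-request processor overrides continue to go through mm_processor_kwargs. A minimal chat sketch, assuming an OpenAI-style message list (the message content is illustrative):

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096)

messages = [{"role": "user", "content": "Summarize why preprocessing moved to the frontend."}]
outputs = llm.chat(
    messages,
    mm_processor_kwargs=None,  # still accepted for offline requests; the disable flag is gone
)
print(outputs[0].outputs[0].text)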
8 changes: 3 additions & 5 deletions vllm/v1/engine/core.py
@@ -98,11 +98,9 @@ def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
         req = Request.from_engine_core_request(request)
 
-        # Apply multi-modal mapper (if necessary)
-        if req.mm_data:
-            assert req.mm_inputs is None or req.mm_inputs == []
-            req.mm_inputs = self.mm_input_mapper.process_inputs(
-                req.mm_data, req.mm_processor_kwargs)
+        # Sanity check to verify that the multi-modal preprocessor
+        # ran in the frontend P0 process
+        assert req.mm_data is None or req.mm_data == {}
 
         self.scheduler.add_request(req)
 
3 changes: 1 addition & 2 deletions vllm/v1/engine/processor.py
@@ -104,8 +104,7 @@ def process_inputs(
         # here in the frontend process (if enabled)
         mm_data = decoder_inputs.multi_modal_data
         mm_inputs = None
-        if (not self.model_config.mm_disable_frontend_processor
-                and mm_data is not None):
+        if mm_data is not None:
             mm_inputs = self.mm_input_mapper.process_inputs(
                 decoder_inputs.multi_modal_data,
                 decoder_inputs.mm_processor_kwargs)
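Together with the core.py change above, the division of labor is now fixed: the frontend (P0) processor always runs the multi-modal input mapper when raw data is present, and the engine core only checks that nothing raw slipped through. A simplified sketch of that hand-off (attribute and method names follow the diff; the surrounding plumbing is omitted):

# Simplified sketch of the P0 -> engine-core hand-off after this commit.
def frontend_process(decoder_inputs, mm_input_mapper):
    # Frontend (P0): unconditionally map raw multi-modal data to model inputs.
    mm_inputs = None
    if decoder_inputs.multi_modal_data is not None:
        mm_inputs = mm_input_mapper.process_inputs(
            decoder_inputs.multi_modal_data,
            decoder_inputs.mm_processor_kwargs)
    return mm_inputs

def core_add_request(req, scheduler):
    # Engine core: the frontend must already have consumed req.mm_data.
    assert req.mm_data is None or req.mm_data == {}
    scheduler.add_request(req)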
