[perf] fix perf regression from vllm-project#12253 (vllm-project#12380)

Signed-off-by: youkaichao <[email protected]>
ShangmingCai · Jan 24, 2025 · 6dd94db · 6dd94db
1 parent 0e74d79
commit 6dd94db
Showing 1 changed file with 4 additions and 1 deletion.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
@@ -455,7 +455,6 @@ def __init__(self,
         self.enable_prompt_adapter = (self.runner.prompt_adapter_config
                                       is not None)
         self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper
-        self.decode_only = True
 
         # Attention metadata inputs.
         if self.attn_backend is not None:
@@ -477,6 +476,10 @@ def prepare(self,
                 finished_requests_ids: Optional[List[str]] = None) -> None:
         self.finished_requests_ids = finished_requests_ids
 
+        # Whether the current batch is decode-only. Reset to True on every
+        # prepare() call; set to False if there is any non-decode request.
+        self.decode_only = True
+
         # Intermediate data (data in CPU before going to GPU) for
         # the current sequence group.
         self.inter_data_list: List[