diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py
index 3583f87779b2a..d1052587ef87b 100644
--- a/vllm/v1/core/scheduler.py
+++ b/vllm/v1/core/scheduler.py
@@ -99,7 +99,6 @@ def schedule(self) -> "SchedulerOutput":
         req_to_new_block_ids: Dict[str, List[int]] = {}
         num_scheduled_tokens: Dict[str, int] = {}
         token_budget = self.max_num_scheduled_tokens
-
         # Encoder-related.
         scheduled_encoder_inputs: Dict[str, List[int]] = {}
         encoder_budget = self.max_num_encoder_input_tokens
@@ -344,7 +343,6 @@ def _try_schedule_encoder_inputs(
         assert mm_positions is not None
         assert len(mm_positions) > 0
         for i, pos_info in enumerate(mm_positions):
-
             start_pos = pos_info["offset"]
             num_encoder_tokens = pos_info["length"]
 
@@ -367,7 +365,6 @@ def _try_schedule_encoder_inputs(
                 # tokens just before the encoder input.
                 num_new_tokens = start_pos - num_computed_tokens
                 break
-
             if num_encoder_tokens > encoder_budget:
                 # The encoder budget is exhausted. We can only schedule the
                 # decoder tokens up until the encoder input.
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 55b96748f38a6..06d6a2dfa3ed5 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -83,11 +83,13 @@ def _initialize_kv_caches(self,
 
     def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
-        req = Request.from_engine_core_request(request)
-        if req.mm_hashes is not None:
-            req.mm_inputs = self.mm_input_mapper_server.process_inputs(
-                req.mm_inputs, req.mm_hashes)
+        # Process the multi-modal inputs before creating the Request.
+        if request.mm_hashes is not None:
+            request.mm_inputs = self.mm_input_mapper_server.process_inputs(
+                request.mm_inputs, request.mm_hashes)
+
+        req = Request.from_engine_core_request(request)
         self.scheduler.add_request(req)
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 0ef46dd6487e6..6bc1e4d5c769f 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -20,7 +20,6 @@ def __init__(
         eos_token_id: Optional[int],
         arrival_time: float,
         lora_request: Optional[LoRARequest] = None,
-        mm_hashes: Optional[List[str]] = None,
     ) -> None:
         self.request_id = request_id
         self.inputs = SingletonInputsAdapter(inputs)
@@ -57,7 +56,6 @@ def __init__(
             self.mm_inputs = self.inputs.multi_modal_inputs
         else:
             self.mm_inputs: List[MultiModalKwargs] = []
-        self.mm_hashes = mm_hashes
 
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
@@ -75,7 +73,6 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
             eos_token_id=request.eos_token_id,
             arrival_time=request.arrival_time,
             lora_request=request.lora_request,
-            mm_hashes=request.mm_hashes,
         )
 
     @property
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c35888580faa4..4692762493f00 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -352,11 +352,9 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"):
         req_input_ids: List[Tuple[int, int]] = []
         for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
             req_state = self.requests[req_id]
-
             for input_id in encoder_input_ids:
                 mm_inputs.append(req_state.mm_inputs[input_id])
                 req_input_ids.append((req_id, input_id))
-
         batched_mm_inputs = MultiModalKwargs.batch(mm_inputs)
         batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
                                                        device=self.device)