more cleanups
alexm-neuralmagic committed Dec 9, 2024
1 parent 800f8d6 commit 545a40a
Showing 4 changed files with 6 additions and 12 deletions.
3 changes: 0 additions & 3 deletions vllm/v1/core/scheduler.py
@@ -99,7 +99,6 @@ def schedule(self) -> "SchedulerOutput":
         req_to_new_block_ids: Dict[str, List[int]] = {}
         num_scheduled_tokens: Dict[str, int] = {}
         token_budget = self.max_num_scheduled_tokens
-
         # Encoder-related.
         scheduled_encoder_inputs: Dict[str, List[int]] = {}
         encoder_budget = self.max_num_encoder_input_tokens
@@ -344,7 +343,6 @@ def _try_schedule_encoder_inputs(
         assert mm_positions is not None
         assert len(mm_positions) > 0
         for i, pos_info in enumerate(mm_positions):
-
             start_pos = pos_info["offset"]
             num_encoder_tokens = pos_info["length"]
 
@@ -367,7 +365,6 @@ def _try_schedule_encoder_inputs(
                 # tokens just before the encoder input.
                 num_new_tokens = start_pos - num_computed_tokens
                 break
-
             if num_encoder_tokens > encoder_budget:
                 # The encoder budget is exhausted. We can only schedule the
                 # decoder tokens up until the encoder input.
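For context on the hunks above: _try_schedule_encoder_inputs walks a request's multimodal positions and, when an encoder input no longer fits the per-step encoder budget, truncates the decoder tokens scheduled for this step so they stop just before that input. A minimal, self-contained sketch of that loop; the helper name and the overlap checks are illustrative paraphrases, not vllm's exact code:

from typing import Dict, List, Tuple

def plan_encoder_inputs(
    mm_positions: List[Dict[str, int]],  # each: {"offset": ..., "length": ...}
    num_computed_tokens: int,
    num_new_tokens: int,
    encoder_budget: int,
) -> Tuple[List[int], int, int]:
    """Return (scheduled encoder input indices, possibly truncated
    num_new_tokens, remaining encoder budget)."""
    scheduled: List[int] = []
    for i, pos_info in enumerate(mm_positions):
        start_pos = pos_info["offset"]
        num_encoder_tokens = pos_info["length"]
        if start_pos >= num_computed_tokens + num_new_tokens:
            # Encoder input lies beyond this step's tokens: nothing to do yet.
            break
        if start_pos + num_encoder_tokens <= num_computed_tokens:
            # Encoder input was already consumed in an earlier step.
            continue
        if num_encoder_tokens > encoder_budget:
            # Budget exhausted: only schedule the decoder tokens up until
            # the encoder input, as in the hunk above.
            num_new_tokens = start_pos - num_computed_tokens
            break
        encoder_budget -= num_encoder_tokens
        scheduled.append(i)
    return scheduled, num_new_tokens, encoder_budget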
10 changes: 6 additions & 4 deletions vllm/v1/engine/core.py
@@ -83,11 +83,13 @@ def _initialize_kv_caches(self,
 
     def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
-        req = Request.from_engine_core_request(request)
 
-        if req.mm_hashes is not None:
-            req.mm_inputs = self.mm_input_mapper_server.process_inputs(
-                req.mm_inputs, req.mm_hashes)
+        # Add doc
+        if request.mm_hashes is not None:
+            request.mm_inputs = self.mm_input_mapper_server.process_inputs(
+                request.mm_inputs, request.mm_hashes)
+
+        req = Request.from_engine_core_request(request)
 
         self.scheduler.add_request(req)

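The functional change in this file: the hash-to-input resolution now runs on the raw EngineCoreRequest before Request.from_engine_core_request is called, so the Request object is built with fully materialized multimodal inputs and never needs to carry mm_hashes itself (request.py below drops the field accordingly). A toy sketch of the server-side mapper idea, with a hypothetical cache and stand-in types; the real MMInputMapperServer protocol may differ:

from typing import Any, Dict, List, Optional

class MMInputMapperSketch:
    """Hypothetical stand-in: resolves multimodal content hashes to
    previously transmitted inputs so repeated media is sent only once."""

    def __init__(self) -> None:
        self._cache: Dict[str, Any] = {}

    def process_inputs(self, mm_inputs: List[Optional[Any]],
                       mm_hashes: List[str]) -> List[Any]:
        assert len(mm_inputs) == len(mm_hashes)
        resolved: List[Any] = []
        for inp, h in zip(mm_inputs, mm_hashes):
            if inp is not None:
                self._cache[h] = inp  # fresh input: remember it
                resolved.append(inp)
            else:
                resolved.append(self._cache[h])  # sender signalled a cache hit
        return resolved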
3 changes: 0 additions & 3 deletions vllm/v1/request.py
@@ -20,7 +20,6 @@ def __init__(
         eos_token_id: Optional[int],
         arrival_time: float,
         lora_request: Optional[LoRARequest] = None,
-        mm_hashes: Optional[List[str]] = None,
     ) -> None:
         self.request_id = request_id
         self.inputs = SingletonInputsAdapter(inputs)
@@ -57,7 +56,6 @@ def __init__(
             self.mm_inputs = self.inputs.multi_modal_inputs
         else:
             self.mm_inputs: List[MultiModalKwargs] = []
-        self.mm_hashes = mm_hashes
 
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
@@ -75,7 +73,6 @@ def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
             eos_token_id=request.eos_token_id,
             arrival_time=request.arrival_time,
             lora_request=request.lora_request,
-            mm_hashes=request.mm_hashes,
         )
 
     @property
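Net effect of the three deletions: mm_hashes disappears from Request's constructor, its attributes, and from_engine_core_request, leaving the class a pure view over already-resolved inputs. A toy reduction of the resulting shape (stub classes, not vllm's real signatures):

from typing import Any, List

class EngineCoreRequestStub:
    # Stand-in for EngineCoreRequest; mm_hashes lives only on this
    # wire-level object and is consumed in EngineCore.add_request.
    def __init__(self, request_id: str, mm_inputs: List[Any],
                 mm_hashes: Any = None) -> None:
        self.request_id = request_id
        self.mm_inputs = mm_inputs
        self.mm_hashes = mm_hashes

class RequestStub:
    def __init__(self, request_id: str, mm_inputs: List[Any]) -> None:
        # No mm_hashes parameter or attribute anymore.
        self.request_id = request_id
        self.mm_inputs = mm_inputs

    @classmethod
    def from_engine_core_request(
            cls, request: EngineCoreRequestStub) -> "RequestStub":
        return cls(request.request_id, request.mm_inputs)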
2 changes: 0 additions & 2 deletions vllm/v1/worker/gpu_model_runner.py
@@ -352,11 +352,9 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"):
         req_input_ids: List[Tuple[int, int]] = []
         for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
             req_state = self.requests[req_id]
-
             for input_id in encoder_input_ids:
                 mm_inputs.append(req_state.mm_inputs[input_id])
                 req_input_ids.append((req_id, input_id))
-
         batched_mm_inputs = MultiModalKwargs.batch(mm_inputs)
         batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
                                                        device=self.device)
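The loop above gathers every scheduled encoder input into one flat list, keeping (req_id, input_id) pairs so encoder outputs can be scattered back per request, then hands the list to MultiModalKwargs.batch. A self-contained sketch of that gather step with plain objects (note req_id is a string key, so the pairs are typed Tuple[str, int] here, unlike the List[Tuple[int, int]] annotation in the hunk):

from typing import Any, Dict, List, Tuple

def gather_encoder_inputs(
    scheduled_encoder_inputs: Dict[str, List[int]],
    requests: Dict[str, Any],
) -> Tuple[List[Any], List[Tuple[str, int]]]:
    """Flatten per-request encoder inputs for one batched encoder call."""
    mm_inputs: List[Any] = []
    req_input_ids: List[Tuple[str, int]] = []
    for req_id, encoder_input_ids in scheduled_encoder_inputs.items():
        req_state = requests[req_id]
        for input_id in encoder_input_ids:
            mm_inputs.append(req_state.mm_inputs[input_id])
            req_input_ids.append((req_id, input_id))
    return mm_inputs, req_input_ids

# Example: two requests, three scheduled encoder inputs in total.
class _ReqState:
    def __init__(self, mm_inputs: List[Any]) -> None:
        self.mm_inputs = mm_inputs

reqs = {"a": _ReqState(["img0", "img1"]), "b": _ReqState(["img2"])}
inputs, ids = gather_encoder_inputs({"a": [0, 1], "b": [0]}, reqs)
assert inputs == ["img0", "img1", "img2"]
assert ids == [("a", 0), ("a", 1), ("b", 0)]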
