
Commit 28fc47d

WoosukKwon authored and weilong.yu committed
[V1] Use input_ids as input for text-only models (vllm-project#11032)
Signed-off-by: Woosuk Kwon <[email protected]>
1 parent 782521e · commit 28fc47d

File tree

1 file changed: +47 −21 lines


vllm/v1/worker/gpu_model_runner.py

Lines changed: 47 additions & 21 deletions
@@ -61,6 +61,7 @@ def __init__(
         self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
             cache_config.cache_dtype]
 
+        self.is_multimodal_model = model_config.is_multimodal_model
         self.sliding_window = model_config.get_sliding_window()
         self.block_size = cache_config.block_size
         self.max_model_len = model_config.max_model_len
@@ -103,6 +104,11 @@ def __init__(
         # The batch sizes in the config are in descending order.
         self.cudagraph_batch_sizes = list(
             reversed(self.vllm_config.compilation_config.capture_sizes))
+
+        # Persistent buffers for CUDA graphs.
+        self.input_ids = torch.zeros(self.max_num_tokens,
+                                     dtype=torch.int32,
+                                     device=self.device)
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,
                                      device=self.device)
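The new self.input_ids buffer follows the same pattern as self.positions below it: CUDA graph replay re-executes work against fixed memory addresses, so every graph input must live in a tensor that is allocated once and updated in place. A minimal standalone sketch of that pattern (toy sizes, not vLLM code):

import torch

# Allocate the persistent buffer once, at the maximum size it may need.
static_ids = torch.zeros(8192, dtype=torch.int32, device="cuda")
embed = torch.nn.Embedding(32000, 512, device="cuda")

# Warm-up on a side stream (lazy kernel init must happen before capture).
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    embed(static_ids)
torch.cuda.current_stream().wait_stream(s)

# Capture a graph whose input is the persistent buffer.
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_out = embed(static_ids)

# Per step: copy new token ids into the buffer in place, then replay.
new_ids = torch.randint(0, 32000, (100,), dtype=torch.int32, device="cuda")
static_ids[:100].copy_(new_ids, non_blocking=True)
g.replay()  # reads the updated buffer contents at the captured addresses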
@@ -310,7 +316,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         seq_start_loc_np[0] = 0
         np.cumsum(seq_lens, out=seq_start_loc_np[1:])
 
-        input_ids = input_ids.to(self.device, non_blocking=True)
+        self.input_ids[:total_num_scheduled_tokens].copy_(input_ids,
+                                                          non_blocking=True)
         self.positions[:total_num_scheduled_tokens].copy_(positions,
                                                           non_blocking=True)
         query_start_loc = query_start_loc.to(self.device, non_blocking=True)
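Instead of allocating a fresh device tensor each step with .to(self.device), _prepare_inputs now writes into a prefix slice of the persistent buffer. One detail worth noting: non_blocking=True only overlaps a host-to-device copy with compute when the CPU source tensor is in pinned memory. A small illustration (assumed shapes, not vLLM's buffers):

import torch

# Persistent device buffer, sized for the worst case.
input_ids_gpu = torch.zeros(8192, dtype=torch.int32, device="cuda")

# CPU staging tensor; pinning makes non_blocking copies truly async.
input_ids_cpu = torch.arange(100, dtype=torch.int32).pin_memory()

# Old pattern: a new device allocation every step.
#   input_ids = input_ids_cpu.to("cuda", non_blocking=True)
# New pattern: in-place copy into the persistent buffer's prefix.
input_ids_gpu[:100].copy_(input_ids_cpu, non_blocking=True)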
@@ -331,7 +338,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # token from the partial request.
         # TODO: Support prompt logprobs.
         logits_indices = query_start_loc[1:] - 1
-        return input_ids, attn_metadata, logits_indices
+        return attn_metadata, logits_indices
 
     def _prepare_sampling(
         self,
@@ -427,13 +434,15 @@ def execute_model(
     ) -> ModelRunnerOutput:
         self._update_states(scheduler_output)
 
-        # Run the encoder.
-        self._execute_encoder(scheduler_output)
-        encoder_outputs = self._gather_encoder_outputs(scheduler_output)
+        if self.is_multimodal_model:
+            # Run the multimodal encoder if any.
+            self._execute_encoder(scheduler_output)
+            encoder_outputs = self._gather_encoder_outputs(scheduler_output)
+        else:
+            encoder_outputs = []
 
         # Prepare the decoder inputs.
-        input_ids, attn_metadata, logits_indices = self._prepare_inputs(
-            scheduler_output)
+        attn_metadata, logits_indices = self._prepare_inputs(scheduler_output)
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.use_cuda_graph
                 and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
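The guard on the last line above decides between graph replay and eager mode: the capture sizes are stored in ascending order, so [-1] is the largest captured batch size, and a token count that fits is presumably padded up to the nearest captured size in the lines elided between this hunk and the next. A hypothetical sketch of that bucketing rule (toy sizes, not vLLM's padding code):

import bisect

# Graphs are captured for a fixed ascending list of batch sizes; a step
# is padded up to the smallest captured size that fits, else runs eager.
cudagraph_batch_sizes = [1, 2, 4, 8, 16, 32]  # ascending, as in the diff

def pad_to_captured_size(num_tokens: int) -> int:
    if num_tokens > cudagraph_batch_sizes[-1]:
        return num_tokens  # too large for any captured graph: eager mode
    i = bisect.bisect_left(cudagraph_batch_sizes, num_tokens)
    return cudagraph_batch_sizes[i]

assert pad_to_captured_size(3) == 4    # replayed with the size-4 graph
assert pad_to_captured_size(33) == 33  # eager, no padding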
@@ -444,29 +453,39 @@ def execute_model(
         else:
             # Eager mode.
             num_input_tokens = num_scheduled_tokens
-
         attn_metadata.num_input_tokens = num_input_tokens
 
-        # Get the inputs embeds.
-        if encoder_outputs:
-            inputs_embeds = self.model.get_input_embeddings(
-                input_ids, encoder_outputs)
+        if self.is_multimodal_model:
+            # NOTE(woosuk): To unify token ids and soft tokens (vision
+            # embeddings), we always use embeddings (rather than token ids)
+            # as input to the multimodal model, even when the input is text.
+            input_ids = self.input_ids[:num_scheduled_tokens]
+            if encoder_outputs:
+                inputs_embeds = self.model.get_input_embeddings(
+                    input_ids, encoder_outputs)
+            else:
+                inputs_embeds = self.model.get_input_embeddings(input_ids)
+            # TODO(woosuk): Avoid the copy. Optimize.
+            self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
+            inputs_embeds = self.inputs_embeds[:num_input_tokens]
+            input_ids = None
         else:
-            inputs_embeds = self.model.get_input_embeddings(input_ids)
-        # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings),
-        # always use embeddings (rather than token ids) as input to the model.
-        # TODO(woosuk): Avoid the copy. Optimize.
-        self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
+            # For text-only models, we use token ids as input.
+            # While it is possible to use embeddings as input just like the
+            # multimodal models, it is not desirable for performance since
+            # then the embedding layer is not included in the CUDA graph.
+            input_ids = self.input_ids[:num_input_tokens]
+            inputs_embeds = None
 
         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata, self.vllm_config):
             hidden_states = self.model(
-                input_ids=None,
+                input_ids=input_ids,
                 positions=self.positions[:num_input_tokens],
                 kv_caches=self.kv_caches,
                 attn_metadata=None,
-                inputs_embeds=self.inputs_embeds[:num_input_tokens],
+                inputs_embeds=inputs_embeds,
             )
         hidden_states = hidden_states[:num_scheduled_tokens]
         hidden_states = hidden_states[logits_indices]
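The branch above hands self.model exactly one of input_ids or inputs_embeds. A condensed toy model (assumed names, not the vLLM model interface) shows why the text-only path helps CUDA graphs: when token ids go in directly, the embedding lookup runs inside forward and is therefore included in whatever graph wraps the call; when embeddings are precomputed outside (so vision embeddings can be merged in), the lookup stays outside the graph.

import torch

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Embedding(1000, 64)
        self.proj = torch.nn.Linear(64, 64)

    def get_input_embeddings(self, input_ids):
        return self.embed(input_ids)

    def forward(self, input_ids=None, inputs_embeds=None):
        # Exactly one of the two is expected to be set, as in the diff.
        if inputs_embeds is None:
            inputs_embeds = self.embed(input_ids)  # captured with the graph
        return self.proj(inputs_embeds)

model = ToyModel().cuda()
ids = torch.randint(0, 1000, (16,), device="cuda")

# Text-only path: ids in, embedding lookup inside the forward pass.
out_text = model(input_ids=ids)

# Multimodal path: embeddings computed (and mergeable with encoder
# outputs) outside the forward pass, then passed in.
out_mm = model(inputs_embeds=model.get_input_embeddings(ids))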
@@ -534,13 +553,20 @@ def _dummy_run(
         num_tokens: int,
         kv_caches: List[torch.Tensor],
     ) -> torch.Tensor:
+        if self.is_multimodal_model:
+            input_ids = None
+            inputs_embeds = self.inputs_embeds[:num_tokens]
+        else:
+            input_ids = self.input_ids[:num_tokens]
+            inputs_embeds = None
         with set_forward_context(None, self.vllm_config):
             hidden_states = model(
-                input_ids=None,
+                input_ids=input_ids,
                 positions=self.positions[:num_tokens],
                 kv_caches=kv_caches,
                 attn_metadata=None,
-                inputs_embeds=self.inputs_embeds[:num_tokens])
+                inputs_embeds=inputs_embeds,
+            )
         return hidden_states
 
     def profile_run(self) -> None:
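_dummy_run must mirror the same choice, since a graph captured during warm-up has to match the input signature used at execution time: a graph captured with inputs_embeds cannot be replayed with input_ids. A hypothetical helper expressing that rule (illustrative only, not vLLM's API):

from typing import Optional, Tuple
import torch

def pick_dummy_inputs(
    is_multimodal: bool,
    num_tokens: int,
    input_ids_buf: torch.Tensor,      # persistent token-id buffer
    inputs_embeds_buf: torch.Tensor,  # persistent embedding buffer
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]:
    """Return (input_ids, inputs_embeds) for a warm-up/capture run,
    matching whichever input kind the real forward pass will use."""
    if is_multimodal:
        return None, inputs_embeds_buf[:num_tokens]
    return input_ids_buf[:num_tokens], None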
