@@ -61,6 +61,7 @@ def __init__(
         self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
             cache_config.cache_dtype]

+        self.is_multimodal_model = model_config.is_multimodal_model
         self.sliding_window = model_config.get_sliding_window()
         self.block_size = cache_config.block_size
         self.max_model_len = model_config.max_model_len
@@ -103,6 +104,11 @@ def __init__(
         # The batch sizes in the config are in descending order.
         self.cudagraph_batch_sizes = list(
             reversed(self.vllm_config.compilation_config.capture_sizes))
+
+        # Persistent buffers for CUDA graphs.
+        self.input_ids = torch.zeros(self.max_num_tokens,
+                                     dtype=torch.int32,
+                                     device=self.device)
         self.positions = torch.zeros(self.max_num_tokens,
                                      dtype=torch.int64,
                                      device=self.device)
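As context for the hunk above, here is a minimal, hedged sketch (standalone, not this file's code) of why persistent pre-allocated buffers matter for CUDA graphs: a captured graph bakes in device pointers, so new data must be copied into the same tensors before each replay.

```python
import torch

# Minimal sketch: CUDA graph capture records fixed device addresses, so the
# input must live in a persistent buffer that is refilled before each replay.
device = torch.device("cuda")
model = torch.nn.Linear(16, 16).to(device)

static_input = torch.zeros(8, 16, device=device)  # persistent buffer

# Warm-up on a side stream before capture, as recommended by PyTorch.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        model(static_input)
torch.cuda.current_stream().wait_stream(s)

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_output = model(static_input)  # addresses are baked into the graph

# New batches are fed by copying into the same storage, then replaying.
static_input.copy_(torch.randn(8, 16, device=device))
graph.replay()  # static_output now holds the result for the new batch
```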
@@ -310,7 +316,8 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         seq_start_loc_np[0] = 0
         np.cumsum(seq_lens, out=seq_start_loc_np[1:])

-        input_ids = input_ids.to(self.device, non_blocking=True)
+        self.input_ids[:total_num_scheduled_tokens].copy_(input_ids,
+                                                          non_blocking=True)
         self.positions[:total_num_scheduled_tokens].copy_(positions,
                                                           non_blocking=True)
         query_start_loc = query_start_loc.to(self.device, non_blocking=True)
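The `.copy_(..., non_blocking=True)` pattern above writes into a slice of the persistent buffer instead of allocating a new device tensor with `.to(...)`. A small illustrative sketch of the same pattern (names are made up); the asynchronous overlap only materializes when the CPU source is in pinned memory:

```python
import torch

# Illustrative only: asynchronous host-to-device copy into a slice of a
# pre-allocated buffer, mirroring the persistent-buffer update above.
device = torch.device("cuda")
persistent_ids = torch.zeros(8192, dtype=torch.int32, device=device)

cpu_ids = torch.randint(0, 32_000, (512,), dtype=torch.int32).pin_memory()
persistent_ids[:cpu_ids.numel()].copy_(cpu_ids, non_blocking=True)
# The copy is enqueued on the current CUDA stream; kernels launched later on
# the same stream observe the updated values without an explicit sync.
```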
@@ -331,7 +338,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
         # token from the partial request.
         # TODO: Support prompt logprobs.
         logits_indices = query_start_loc[1:] - 1
-        return input_ids, attn_metadata, logits_indices
+        return attn_metadata, logits_indices

     def _prepare_sampling(
         self,
@@ -427,13 +434,15 @@ def execute_model(
     ) -> ModelRunnerOutput:
         self._update_states(scheduler_output)

-        # Run the encoder.
-        self._execute_encoder(scheduler_output)
-        encoder_outputs = self._gather_encoder_outputs(scheduler_output)
+        if self.is_multimodal_model:
+            # Run the multimodal encoder if any.
+            self._execute_encoder(scheduler_output)
+            encoder_outputs = self._gather_encoder_outputs(scheduler_output)
+        else:
+            encoder_outputs = []

         # Prepare the decoder inputs.
-        input_ids, attn_metadata, logits_indices = self._prepare_inputs(
-            scheduler_output)
+        attn_metadata, logits_indices = self._prepare_inputs(scheduler_output)
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.use_cuda_graph
                 and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
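The body of this `if` is outside the hunk. As a hedged sketch of the idea only (not the file's actual code): when graphs are used, the token count is padded up to the nearest captured size, which works because `cudagraph_batch_sizes` is ascending after the `reversed(...)` above.

```python
# Hypothetical helper illustrating the padding idea; the real branch body is
# not shown in this diff. `cudagraph_batch_sizes` is sorted ascending.
def pad_to_captured_size(num_tokens: int, cudagraph_batch_sizes: list) -> int:
    for size in cudagraph_batch_sizes:
        if num_tokens <= size:
            return size
    # The caller already checked num_tokens <= cudagraph_batch_sizes[-1].
    return num_tokens
```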
@@ -444,29 +453,39 @@ def execute_model(
         else:
             # Eager mode.
             num_input_tokens = num_scheduled_tokens
-
         attn_metadata.num_input_tokens = num_input_tokens

-        # Get the inputs embeds.
-        if encoder_outputs:
-            inputs_embeds = self.model.get_input_embeddings(
-                input_ids, encoder_outputs)
+        if self.is_multimodal_model:
+            # NOTE(woosuk): To unify token ids and soft tokens (vision
+            # embeddings), we always use embeddings (rather than token ids)
+            # as input to the multimodal model, even when the input is text.
+            input_ids = self.input_ids[:num_scheduled_tokens]
+            if encoder_outputs:
+                inputs_embeds = self.model.get_input_embeddings(
+                    input_ids, encoder_outputs)
+            else:
+                inputs_embeds = self.model.get_input_embeddings(input_ids)
+            # TODO(woosuk): Avoid the copy. Optimize.
+            self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
+            inputs_embeds = self.inputs_embeds[:num_input_tokens]
+            input_ids = None
         else:
-            inputs_embeds = self.model.get_input_embeddings(input_ids)
-        # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings),
-        # always use embeddings (rather than token ids) as input to the model.
-        # TODO(woosuk): Avoid the copy. Optimize.
-        self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds)
+            # For text-only models, we use token ids as input.
+            # While it is possible to use embeddings as input just like the
+            # multimodal models, it is not desirable for performance since
+            # then the embedding layer is not included in the CUDA graph.
+            input_ids = self.input_ids[:num_input_tokens]
+            inputs_embeds = None

         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
         with set_forward_context(attn_metadata, self.vllm_config):
             hidden_states = self.model(
-                input_ids=None,
+                input_ids=input_ids,
                 positions=self.positions[:num_input_tokens],
                 kv_caches=self.kv_caches,
                 attn_metadata=None,
-                inputs_embeds=self.inputs_embeds[:num_input_tokens],
+                inputs_embeds=inputs_embeds,
             )
         hidden_states = hidden_states[:num_scheduled_tokens]
         hidden_states = hidden_states[logits_indices]
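A hedged sketch of the calling convention this hunk relies on (an illustrative skeleton, not a model from this repo): the decoder accepts either `input_ids` or precomputed `inputs_embeds`, embedding token ids inside `forward` when no embeddings are supplied, so text-only models keep the embedding lookup inside the captured graph.

```python
import torch
from torch import nn

# Illustrative decoder skeleton for the input_ids / inputs_embeds contract
# assumed above; real vLLM model signatures may differ.
class TinyDecoder(nn.Module):
    def __init__(self, vocab_size: int = 1000, hidden: int = 64):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden)
        self.layer = nn.Linear(hidden, hidden)

    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
        return self.embed(input_ids)

    def forward(self, input_ids, positions, inputs_embeds=None):
        # Text-only path: embed inside the model, so the lookup is part of
        # the same forward (and thus of any captured CUDA graph).
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings(input_ids)
        # Multimodal path: token + vision embeddings were merged outside.
        return self.layer(inputs_embeds)
```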
@@ -534,13 +553,20 @@ def _dummy_run(
         num_tokens: int,
         kv_caches: List[torch.Tensor],
     ) -> torch.Tensor:
+        if self.is_multimodal_model:
+            input_ids = None
+            inputs_embeds = self.inputs_embeds[:num_tokens]
+        else:
+            input_ids = self.input_ids[:num_tokens]
+            inputs_embeds = None
         with set_forward_context(None, self.vllm_config):
             hidden_states = model(
-                input_ids=None,
+                input_ids=input_ids,
                 positions=self.positions[:num_tokens],
                 kv_caches=kv_caches,
                 attn_metadata=None,
-                inputs_embeds=self.inputs_embeds[:num_tokens])
+                inputs_embeds=inputs_embeds,
+            )
         return hidden_states

     def profile_run(self) -> None: