@@ -11,7 +11,8 @@
 from vllm.multimodal import MultiModalPlaceholderMap
 
 if TYPE_CHECKING:
-    from vllm.worker.model_runner import ModelInputForGPUBuilder
+    from vllm.worker.model_runner import (ModelInputForGPUBuilder,
+                                          ModelInputForGPUWithSamplingMetadata)
 
 # Placeholder attention backend for models like Mamba and embedding models that
 # lack attention.
@@ -186,6 +187,67 @@ def decode_metadata(self) -> Optional["PlaceholderAttentionMetadata"]:
         )
         return self._cached_decode_metadata
 
+    def advance_step(self,
+                     model_input: "ModelInputForGPUWithSamplingMetadata",
+                     sampled_token_ids: Optional[torch.Tensor],
+                     block_size: int,
+                     num_seqs: int,
+                     num_queries: int,
+                     turn_prefills_into_decodes: bool = False):
+        """
+        Update metadata in-place to advance one decode step.
+        """
+        # When using cudagraph, num_seqs is padded to the next captured
+        # batch size, but num_queries tracks the actual number of requests
+        # in the batch. For --enforce-eager mode, num_seqs == num_queries.
+        if num_seqs != num_queries:
+            assert num_seqs > num_queries
+            assert self.use_cuda_graph
+
+        assert not turn_prefills_into_decodes, \
+            ("Multi-Step + Chunked-Prefill is not supported for "
+             "attention-free models. turn_prefills_into_decodes is a "
+             "Multi-Step + Chunked-Prefill specific parameter.")
+
+        assert self.seq_lens is not None
+        assert self.max_decode_seq_len == max(self.seq_lens)
+
+        assert self.num_prefills == 0
+        assert self.num_prefill_tokens == 0
+        assert self.num_decode_tokens == num_seqs
+
+        assert self.seq_lens is not None
+        assert len(self.seq_lens) == num_seqs
+        assert self.seq_lens_tensor is not None
+        assert self.seq_lens_tensor.shape == (num_seqs, )
+        assert self.max_query_len == 1
+        assert self.max_prefill_seq_len == 0
+
+        assert self.query_start_loc is not None
+        assert self.query_start_loc.shape == (num_queries + 1, )
+        assert self.seq_start_loc is not None
+        assert self.seq_start_loc.shape == (num_seqs + 1, )
+
+        assert self.context_lens_tensor is not None
+        assert self.context_lens_tensor.shape == (num_queries, )
+
+        assert self.block_tables is not None
+
+        # Update query lengths. Note that we update only queries and not
+        # seqs, since tensors may be padded due to the captured cuda graph
+        # batch size.
+        for i in range(num_queries):
+            self.seq_lens[i] += 1
+        self.max_decode_seq_len = max(self.seq_lens)
+
+        # Update sequences, masking off entries greater than num_queries.
+        device = self.seq_lens_tensor.device
+        mask = torch.arange(self.seq_lens_tensor.size(0),
+                            device=device) < num_queries
+        self.seq_lens_tensor += mask.to(self.seq_lens_tensor.dtype)
+        if sampled_token_ids is not None:
+            model_input.input_tokens.masked_scatter_(
+                mask, sampled_token_ids[:num_queries])
+
 
 class PlaceholderAttentionMetadataBuilder(
         AttentionMetadataBuilder[PlaceholderAttentionMetadata]):
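For illustration only, here is a minimal standalone sketch (not part of the diff) of the padded-batch update that `advance_step` performs: with CUDA graphs the buffers are padded to the captured batch size, so only the first `num_queries` entries correspond to real requests, and a boolean mask advances their sequence lengths and scatters the newly sampled tokens while leaving the padded tail untouched. The sizes and values below are hypothetical.

```python
import torch

# Hypothetical sizes: batch padded to 8 for a captured CUDA graph,
# but only 5 real requests are in flight.
num_seqs, num_queries = 8, 5

seq_lens_tensor = torch.tensor([3, 4, 2, 6, 5, 0, 0, 0], dtype=torch.int32)
input_tokens = torch.zeros(num_seqs, dtype=torch.long)
sampled_token_ids = torch.tensor([11, 12, 13, 14, 15])  # one new token per real request

# True for the first num_queries entries, False for the padded tail.
mask = torch.arange(seq_lens_tensor.size(0)) < num_queries

# Advance sequence lengths only for the real requests.
seq_lens_tensor += mask.to(seq_lens_tensor.dtype)

# Scatter the newly sampled tokens into the padded input buffer.
input_tokens.masked_scatter_(mask, sampled_token_ids[:num_queries])

print(seq_lens_tensor)  # tensor([4, 5, 3, 7, 6, 0, 0, 0], dtype=torch.int32)
print(input_tokens)     # tensor([11, 12, 13, 14, 15,  0,  0,  0])
```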