From 433767966d15d6cb2223a48cd3e4d78eb40e109c Mon Sep 17 00:00:00 2001
From: Chendi Xue
Date: Fri, 8 Nov 2024 20:38:54 -0500
Subject: [PATCH] Forgot to submit cpu_draft_model_runner; add it here

Signed-off-by: Chendi Xue
---
 vllm/spec_decode/cpu_draft_model_runner.py | 43 ++++++++++++++++++++++
 1 file changed, 43 insertions(+)
 create mode 100644 vllm/spec_decode/cpu_draft_model_runner.py

diff --git a/vllm/spec_decode/cpu_draft_model_runner.py b/vllm/spec_decode/cpu_draft_model_runner.py
new file mode 100644
index 0000000000000..adbf247c5c557
--- /dev/null
+++ b/vllm/spec_decode/cpu_draft_model_runner.py
@@ -0,0 +1,43 @@
+from typing import List, Optional
+import torch
+from vllm.logger import init_logger
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import IntermediateTensors
+from vllm.worker.cpu_model_runner import CPUModelRunner as ModelRunnerBaseCls
+from vllm.worker.cpu_model_runner import ModelInputForCPUWithSamplingMetadata
+logger = init_logger(__name__)
+
+class CPUTP1DraftModelRunner(ModelRunnerBaseCls):
+    """Specialized model runner for the speculative decoding draft model.
+    Since the draft model always executes k forward passes consecutively to
+    generate k speculative tokens in a single speculative decoding step,
+    we can avoid most of the per-step synchronization and data transfer
+    overheads by keeping model input and output tensors in place across steps.
+    TODOs:
+    1. Support TP > 1 (this requires some design work because we do not expect
+       any broadcasting inside execute_model).
+    """
+    def __init__(self, *args, **kwargs):
+        if kwargs.get("return_hidden_states"):
+            raise ValueError(
+                "return_hidden_states is not supported for CPUTP1DraftModelRunner."
+            )
+        super().__init__(*args, **kwargs)
+        self.indices_of_seq_with_bonus_tokens = None
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        model_input: ModelInputForCPUWithSamplingMetadata,
+        kv_caches: List[torch.Tensor],
+        previous_hidden_states: Optional[torch.Tensor] = None,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        num_steps: int = 1,
+    ) -> Optional[List[SamplerOutput]]:
+        return super().execute_model(
+            model_input=model_input,
+            kv_caches=kv_caches,
+            previous_hidden_states=previous_hidden_states,
+            intermediate_tensors=intermediate_tensors,
+            num_steps=num_steps,
+        )
\ No newline at end of file
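
P.S. (not part of the patch): a minimal usage sketch of how this draft runner
is expected to be exercised once the CPU spec-decode wiring from the rest of
this series lands. The model names are placeholders, and the speculative_model
/ num_speculative_tokens / device engine arguments follow the vLLM speculative
decoding API as of this patch; treat it as an illustration, not a test.

    from vllm import LLM, SamplingParams

    # Target/draft pair; any two models sharing a tokenizer should work.
    # Both model names here are placeholders.
    llm = LLM(
        model="facebook/opt-6.7b",              # target model
        speculative_model="facebook/opt-125m",  # draft model, driven by
                                                # CPUTP1DraftModelRunner
        num_speculative_tokens=5,               # k draft passes per step
        device="cpu",                           # select the CPU backend
    )

    params = SamplingParams(temperature=0.0, max_tokens=32)
    outputs = llm.generate(["The capital of France is"], params)
    print(outputs[0].outputs[0].text)

With num_speculative_tokens=5, each speculative decoding step runs five
consecutive draft-model forward passes through execute_model above, which is
the loop this runner is meant to keep cheap.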