[4.5/N] bugfix for quant config in speculative decode (vllm-project#10007)

Signed-off-by: youkaichao <[email protected]>
Signed-off-by: Maxime Fournioux <[email protected]>
mfournioux · Nov 20, 2024 · 220d000 · 220d000
1 parent b4408d1
commit 220d000
Showing 1 changed file with 4 additions and 0 deletions.
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
@@ -61,6 +61,10 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
 
     draft_worker_config = copy.deepcopy(vllm_config)
     draft_worker_config.model_config = speculative_config.draft_model_config
+    draft_worker_config.quant_config = VllmConfig._get_quantization_config(
+        draft_worker_config.model_config,
+        vllm_config.load_config,
+    )
     draft_worker_config.parallel_config = speculative_config.draft_parallel_config  # noqa
     # TODO allow draft-model specific load config.