
Commit

fix test
Signed-off-by: jiang1.li <[email protected]>
bigPYJ1151 committed Nov 15, 2024
1 parent ba2575b commit 5980981
Showing 2 changed files with 8 additions and 2 deletions.
8 changes: 8 additions & 0 deletions vllm/executor/cpu_executor.py
@@ -2,6 +2,8 @@
 from functools import partial
 from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 
+import torch
+
 import vllm.envs as envs
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
@@ -64,6 +66,12 @@ def _init_executor(self) -> None:
         self.parallel_config = _verify_and_get_parallel_config(
             self.parallel_config)
 
+        if (self.scheduler_config.chunked_prefill_enabled
+                and self.model_config.dtype == torch.half):
+            logger.warning("Chunked-prefill on the CPU backend does not"
+                           " support fp16 for now; casting to bf16.")
+            self.model_config.dtype = torch.bfloat16
+
         # Multiprocessing-based executor does not support multi-node setting.
         # Since it only works for single node, we can use the loopback address
         # 127.0.0.1 for communication.
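
For context (not part of the commit): the new guard is a warn-and-coerce pattern for an unsupported configuration. Below is a self-contained sketch of the same logic using a stub config object; StubModelConfig and coerce_unsupported_dtype are illustrative names, not vLLM APIs.

import logging
from dataclasses import dataclass

import torch

logger = logging.getLogger(__name__)


@dataclass
class StubModelConfig:
    # Stand-in for vllm.config.ModelConfig; only the dtype field matters here.
    dtype: torch.dtype


def coerce_unsupported_dtype(model_config: StubModelConfig,
                             chunked_prefill_enabled: bool) -> None:
    # Mirrors the guard added in cpu_executor.py: fp16 is not supported
    # together with chunked prefill on the CPU backend, so fall back to bf16.
    if chunked_prefill_enabled and model_config.dtype == torch.half:
        logger.warning("Chunked-prefill on the CPU backend does not"
                       " support fp16 for now; casting to bf16.")
        model_config.dtype = torch.bfloat16


cfg = StubModelConfig(dtype=torch.half)
coerce_unsupported_dtype(cfg, chunked_prefill_enabled=True)
assert cfg.dtype == torch.bfloat16  # the fp16 request was coerced to bf16
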
2 changes: 0 additions & 2 deletions vllm/worker/cpu_model_runner.py
@@ -429,8 +429,6 @@ def __init__(
         **kwargs,
     ):
         ModelRunnerBase.__init__(self, vllm_config)
-        # Currently, CPU worker doesn't support chunked prefill.
-        assert self.scheduler_config.chunked_prefill_enabled is False
         model_config = self.model_config
         cache_config = self.cache_config
 
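
With the assertion gone, chunked prefill can be enabled on the CPU backend end to end, and an fp16 request is downgraded to bf16 by the executor guard above instead of raising. A minimal usage sketch, assuming a CPU build of vLLM; the model name is only an example:

from vllm import LLM, SamplingParams

# dtype="half" together with chunked prefill previously failed the
# assertion in cpu_model_runner.py; it now warns and runs in bfloat16.
llm = LLM(model="facebook/opt-125m",    # example model
          dtype="half",                 # fp16 requested; cast to bf16 on CPU
          enable_chunked_prefill=True)

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
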
