From bbdeca23e6ab2e551e15d8dc7823914e0c4413a1 Mon Sep 17 00:00:00 2001
From: "jiang1.li"
Date: Mon, 18 Nov 2024 07:14:33 +0000
Subject: [PATCH] rebase

---
 vllm/executor/cpu_executor.py | 26 --------------------------
 vllm/platforms/cpu.py         |  4 +++-
 2 files changed, 3 insertions(+), 27 deletions(-)

diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 060763e795f11..1542a2ae367eb 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -2,14 +2,6 @@
 from functools import partial
 from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 
-<<<<<<< HEAD
-=======
-import torch
-
-import vllm.envs as envs
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig)
->>>>>>> 5980981e (fix test)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                   ResultHandler, WorkerMonitor)
@@ -62,24 +54,6 @@ def _init_executor(self) -> None:
         os.environ["LOCAL_WORLD_SIZE"] = str(
             self.parallel_config.tensor_parallel_size)
 
-<<<<<<< HEAD
-=======
-        self.model_config = _verify_and_get_model_config(self.model_config)
-        self.cache_config = _verify_and_get_cache_config(self.cache_config)
-        self.scheduler_config = _verify_and_get_scheduler_config(
-            self.scheduler_config)
-        self.parallel_config = _verify_and_get_parallel_config(
-            self.parallel_config)
-
-        if ((self.scheduler_config.chunked_prefill_enabled
-             or self.cache_config.enable_prefix_caching)
-                and self.model_config.dtype == torch.half):
-            logger.warning("chunked-prefill and prefix-cache on the CPU "
-                           "backend does not support fp16 for now,"
-                           " cast to bf16.")
-            self.model_config.dtype = torch.bfloat16
-
->>>>>>> 5980981e (fix test)
         # Multiprocessing-based executor does not support multi-node setting.
         # Since it only works for single node, we can use the loopback address
         # 127.0.0.1 for communication.
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 2af0dc2e73e32..cfd4343a1ce5a 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -61,7 +61,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 f" {kv_cache_space}, expect a positive integer value.")
 
         scheduler_config = vllm_config.scheduler_config
-        if (scheduler_config.chunked_prefill_enabled and model_config.dtype == torch.half):
+        if ((scheduler_config.chunked_prefill_enabled
+             or cache_config.enable_prefix_caching)
+                and model_config.dtype == torch.half):
             logger.warning("Chunked-prefill on the CPU backend only does not"
                            " support fp16 for now, cast to bf16.")
             model_config.dtype = torch.bfloat16
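
Note: the net effect of this rebase is that the fp16 guard lives only in
CpuPlatform.check_and_update_config. The standalone sketch below shows the
behavior of that surviving check; it is an illustration, not the real vLLM
API -- the _ModelConfig/_SchedulerConfig/_CacheConfig dataclasses are
simplified stand-ins for vLLM's config objects, and print() stands in for
logger.warning.

from dataclasses import dataclass

import torch


@dataclass
class _ModelConfig:
    # Stand-in for vLLM's ModelConfig; only the dtype field matters here.
    dtype: torch.dtype = torch.half


@dataclass
class _SchedulerConfig:
    # Stand-in for vLLM's SchedulerConfig.
    chunked_prefill_enabled: bool = False


@dataclass
class _CacheConfig:
    # Stand-in for vLLM's CacheConfig.
    enable_prefix_caching: bool = False


def check_and_update_config(model_config: _ModelConfig,
                            scheduler_config: _SchedulerConfig,
                            cache_config: _CacheConfig) -> None:
    # Chunked prefill and prefix caching do not support fp16 on the CPU
    # backend yet, so the config check upgrades the dtype to bf16 instead
    # of failing the run.
    if ((scheduler_config.chunked_prefill_enabled
         or cache_config.enable_prefix_caching)
            and model_config.dtype == torch.half):
        print("Chunked-prefill and prefix caching on the CPU backend do"
              " not support fp16 for now, casting to bf16.")
        model_config.dtype = torch.bfloat16


if __name__ == "__main__":
    # fp16 + chunked prefill is cast to bf16 by the check.
    cfg = _ModelConfig(dtype=torch.half)
    check_and_update_config(cfg,
                            _SchedulerConfig(chunked_prefill_enabled=True),
                            _CacheConfig())
    assert cfg.dtype == torch.bfloat16

    # fp16 alone (no chunked prefill, no prefix caching) is left untouched.
    cfg = _ModelConfig(dtype=torch.half)
    check_and_update_config(cfg, _SchedulerConfig(), _CacheConfig())
    assert cfg.dtype == torch.half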