From bbdeca23e6ab2e551e15d8dc7823914e0c4413a1 Mon Sep 17 00:00:00 2001
From: "jiang1.li"
Date: Mon, 18 Nov 2024 07:14:33 +0000
Subject: [PATCH] rebase

---
 vllm/executor/cpu_executor.py | 26 --------------------------
 vllm/platforms/cpu.py         |  4 +++-
 2 files changed, 3 insertions(+), 27 deletions(-)

diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 060763e795f11..1542a2ae367eb 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -2,14 +2,6 @@
 from functools import partial
 from typing import Any, Awaitable, List, Optional, Set, Tuple, Union
 
-<<<<<<< HEAD
-=======
-import torch
-
-import vllm.envs as envs
-from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig)
->>>>>>> 5980981e (fix test)
 from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
 from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
                                                   ResultHandler, WorkerMonitor)
@@ -62,24 +54,6 @@ def _init_executor(self) -> None:
         os.environ["LOCAL_WORLD_SIZE"] = str(
             self.parallel_config.tensor_parallel_size)
 
-<<<<<<< HEAD
-=======
-        self.model_config = _verify_and_get_model_config(self.model_config)
-        self.cache_config = _verify_and_get_cache_config(self.cache_config)
-        self.scheduler_config = _verify_and_get_scheduler_config(
-            self.scheduler_config)
-        self.parallel_config = _verify_and_get_parallel_config(
-            self.parallel_config)
-
-        if ((self.scheduler_config.chunked_prefill_enabled
-             or self.cache_config.enable_prefix_caching)
-                and self.model_config.dtype == torch.half):
-            logger.warning("chunked-prefill and prefix-cache on the CPU "
-                           "backend does not support fp16 for now,"
-                           " cast to bf16.")
-            self.model_config.dtype = torch.bfloat16
-
->>>>>>> 5980981e (fix test)
         # Multiprocessing-based executor does not support multi-node setting.
         # Since it only works for single node, we can use the loopback address
         # 127.0.0.1 for communication.
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 2af0dc2e73e32..cfd4343a1ce5a 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -61,7 +61,9 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 f" {kv_cache_space}, expect a positive integer value.")
 
         scheduler_config = vllm_config.scheduler_config
-        if (scheduler_config.chunked_prefill_enabled and model_config.dtype == torch.half):
+        if ((scheduler_config.chunked_prefill_enabled
+             or cache_config.enable_prefix_caching)
+                and model_config.dtype == torch.half):
             logger.warning("Chunked-prefill on the CPU backend only does not"
                            " support fp16 for now, cast to bf16.")
             model_config.dtype = torch.bfloat16
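
Note: the net effect of this rebase is that the fp16 guard lives only in
CpuPlatform.check_and_update_config. The standalone sketch below shows the
behavior of that surviving check; it is an illustration, not the real vLLM
API -- the _ModelConfig/_SchedulerConfig/_CacheConfig dataclasses are
simplified stand-ins for vLLM's config objects, and print() stands in for
logger.warning.

from dataclasses import dataclass

import torch


@dataclass
class _ModelConfig:
    # Stand-in for vLLM's ModelConfig; only the dtype field matters here.
    dtype: torch.dtype = torch.half


@dataclass
class _SchedulerConfig:
    # Stand-in for vLLM's SchedulerConfig.
    chunked_prefill_enabled: bool = False


@dataclass
class _CacheConfig:
    # Stand-in for vLLM's CacheConfig.
    enable_prefix_caching: bool = False


def check_and_update_config(model_config: _ModelConfig,
                            scheduler_config: _SchedulerConfig,
                            cache_config: _CacheConfig) -> None:
    # Chunked prefill and prefix caching do not support fp16 on the CPU
    # backend yet, so the config check upgrades the dtype to bf16 instead
    # of failing the run.
    if ((scheduler_config.chunked_prefill_enabled
         or cache_config.enable_prefix_caching)
            and model_config.dtype == torch.half):
        print("Chunked-prefill and prefix caching on the CPU backend do"
              " not support fp16 for now, casting to bf16.")
        model_config.dtype = torch.bfloat16


if __name__ == "__main__":
    # fp16 + chunked prefill is cast to bf16 by the check.
    cfg = _ModelConfig(dtype=torch.half)
    check_and_update_config(cfg,
                            _SchedulerConfig(chunked_prefill_enabled=True),
                            _CacheConfig())
    assert cfg.dtype == torch.bfloat16

    # fp16 alone (no chunked prefill, no prefix caching) is left untouched.
    cfg = _ModelConfig(dtype=torch.half)
    check_and_update_config(cfg, _SchedulerConfig(), _CacheConfig())
    assert cfg.dtype == torch.half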