From 0a031ed146ec835a3aabbca616c406a0bc15e471 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Wed, 23 Oct 2024 16:28:21 +0800 Subject: [PATCH] [Hardware][XPU] using current_platform.is_xpu (#9605) --- vllm/attention/selector.py | 6 +++--- vllm/config.py | 4 ++-- vllm/executor/ray_utils.py | 4 ++-- vllm/model_executor/custom_op.py | 4 ++-- vllm/utils.py | 29 +++-------------------------- vllm/worker/xpu_worker.py | 7 ++++--- 6 files changed, 16 insertions(+), 38 deletions(-) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 714c4f7fdb4e5..cd3c642b8c8a2 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -10,7 +10,7 @@ from vllm.attention.backends.abstract import AttentionBackend from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino, is_xpu +from vllm.utils import STR_BACKEND_ENV_VAR, is_hip, is_openvino logger = init_logger(__name__) @@ -136,7 +136,7 @@ def get_attn_backend( from vllm.attention.backends.openvino import OpenVINOAttentionBackend return OpenVINOAttentionBackend elif backend == _Backend.IPEX: - assert is_xpu(), RuntimeError( + assert current_platform.is_xpu(), RuntimeError( "IPEX attention backend is only used for the XPU device.") logger.info("Using IPEX attention backend.") from vllm.attention.backends.ipex_attn import IpexAttnBackend @@ -198,7 +198,7 @@ def which_attn_to_use( logger.info("Cannot use %s backend on OpenVINO.", selected_backend) return _Backend.OPENVINO - if is_xpu(): + if current_platform.is_xpu(): if selected_backend != _Backend.IPEX: logger.info("Cannot use %s backend on XPU.", selected_backend) return _Backend.IPEX diff --git a/vllm/config.py b/vllm/config.py index 12935e77c2aa7..c569789c650ab 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -17,7 +17,7 @@ get_hf_image_processor_config, get_hf_text_config) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - is_hip, is_openvino, is_xpu, print_warning_once) + is_hip, is_openvino, print_warning_once) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -1121,7 +1121,7 @@ def __init__(self, device: str = "auto") -> None: self.device_type = "tpu" elif current_platform.is_cpu(): self.device_type = "cpu" - elif is_xpu(): + elif current_platform.is_xpu(): self.device_type = "xpu" else: raise RuntimeError("Failed to infer device type") diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 7e46acefc5b0e..0af7b3386d895 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -10,7 +10,7 @@ from vllm.logger import init_logger from vllm.platforms import current_platform from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from vllm.utils import get_ip, is_hip, is_xpu +from vllm.utils import get_ip, is_hip from vllm.worker.worker_base import WorkerWrapperBase logger = init_logger(__name__) @@ -231,7 +231,7 @@ def initialize_ray_cluster( assert_ray_available() # Connect to a ray cluster. - if is_hip() or is_xpu(): + if is_hip() or current_platform.is_xpu(): ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index d7506d268e73b..71eed6eb68d78 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -7,7 +7,7 @@ from vllm.compilation.levels import CompilationLevel from vllm.logger import init_logger from vllm.platforms import current_platform -from vllm.utils import is_hip, is_xpu, print_warning_once +from vllm.utils import is_hip, print_warning_once logger = init_logger(__name__) @@ -78,7 +78,7 @@ def dispatch_forward(self): return self.forward_cpu elif current_platform.is_tpu(): return self.forward_tpu - elif is_xpu(): + elif current_platform.is_xpu(): return self.forward_xpu else: return self.forward_cuda diff --git a/vllm/utils.py b/vllm/utils.py index 797c1bcfd5342..0e9b241b6f9f6 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -327,29 +327,6 @@ def is_openvino() -> bool: return False -@lru_cache(maxsize=None) -def is_xpu() -> bool: - from importlib.metadata import PackageNotFoundError, version - try: - is_xpu_flag = "xpu" in version("vllm") - except PackageNotFoundError: - return False - # vllm is not build with xpu - if not is_xpu_flag: - return False - try: - import intel_extension_for_pytorch as ipex # noqa: F401 - _import_ipex = True - except ImportError as e: - logger.warning("Import Error for IPEX: %s", e.msg) - _import_ipex = False - # ipex dependency is not ready - if not _import_ipex: - logger.warning("not found ipex lib") - return False - return hasattr(torch, "xpu") and torch.xpu.is_available() - - @lru_cache(maxsize=None) def get_max_shared_memory_bytes(gpu: int = 0) -> int: """Returns the maximum shared memory per thread block in bytes.""" @@ -379,7 +356,7 @@ def seed_everything(seed: int) -> None: if current_platform.is_cuda_alike(): torch.cuda.manual_seed_all(seed) - if is_xpu(): + if current_platform.is_xpu(): torch.xpu.manual_seed_all(seed) @@ -774,7 +751,7 @@ def is_pin_memory_available() -> bool: print_warning_once("Using 'pin_memory=False' as WSL is detected. " "This may slow down the performance.") return False - elif is_xpu(): + elif current_platform.is_xpu(): print_warning_once("Pin memory is not supported on XPU.") return False elif current_platform.is_neuron(): @@ -795,7 +772,7 @@ def current_memory_usage(self) -> float: if current_platform.is_cuda_alike(): torch.cuda.reset_peak_memory_stats(self.device) mem = torch.cuda.max_memory_allocated(self.device) - elif is_xpu(): + elif current_platform.is_xpu(): torch.xpu.reset_peak_memory_stats(self.device) # type: ignore mem = torch.xpu.max_memory_allocated(self.device) # type: ignore return mem diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 9ad070d042a3d..917866f2d985b 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -17,7 +17,7 @@ from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor import set_random_seed -from vllm.utils import is_xpu +from vllm.platforms import current_platform from vllm.worker.cache_engine import CacheEngine from vllm.worker.worker import Worker from vllm.worker.worker_base import LoraNotSupportedWorkerBase @@ -53,7 +53,7 @@ def __init__( observability_config: Optional[ObservabilityConfig] = None, ) -> None: assert device_config.device_type == "xpu" - assert is_xpu() + assert current_platform.is_xpu() self.model_config = model_config self.parallel_config = parallel_config @@ -91,7 +91,8 @@ def __init__( self.gpu_cache: Optional[List[List[torch.Tensor]]] def init_device(self) -> None: - if self.device_config.device.type == "xpu" and is_xpu(): + if self.device_config.device.type == "xpu" and current_platform.is_xpu( + ): self.device = torch.device(f"xpu:{self.local_rank}") torch.xpu.set_device(self.device) torch.xpu.empty_cache()