From d1fa714cb1c9a708d7da0de27c99f7eee07fe663 Mon Sep 17 00:00:00 2001
From: Chenguang Li <757486878@qq.com>
Date: Fri, 13 Dec 2024 21:39:00 +0800
Subject: [PATCH] [Refactor] A simple device-related refactor (#11163)

Signed-off-by: noemotiovon <757486878@qq.com>
Co-authored-by: noemotiovon <757486878@qq.com>
---
 vllm/platforms/cpu.py       |  5 +++++
 vllm/platforms/hpu.py       |  9 +++++++++
 vllm/platforms/interface.py | 17 +++++++++++++++++
 vllm/platforms/neuron.py    |  9 +++++++++
 vllm/platforms/openvino.py  | 10 +++++-----
 vllm/platforms/xpu.py       |  5 +++++
 vllm/utils.py               | 27 +--------------------------
 7 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index e5142b985d1f2..aad8755d9fcd8 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -98,3 +98,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                     "vllm.worker.cpu_worker.CPUWorker"
             else:
                 parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on CPU.")
+        return False
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 7f22bee3eaa74..2b947d280f9f8 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -2,6 +2,8 @@
 
 import torch
 
+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum, _Backend
 
 if TYPE_CHECKING:
@@ -9,6 +11,8 @@
 else:
     VllmConfig = None
 
+logger = init_logger(__name__)
+
 
 class HpuPlatform(Platform):
     _enum = PlatformEnum.HPU
@@ -43,3 +47,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         parallel_config = vllm_config.parallel_config
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on HPU.")
+        return False
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index db06d2c18e681..4150b0cdf836a 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -1,6 +1,7 @@
 import enum
 import platform
 import random
+from platform import uname
 from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union
 
 import numpy as np
@@ -16,6 +17,11 @@
 logger = init_logger(__name__)
 
 
+def in_wsl() -> bool:
+    # Reference: https://github.com/microsoft/WSL/issues/4071
+    return "microsoft" in " ".join(uname()).lower()
+
+
 class _Backend(enum.Enum):
     FLASH_ATTN = enum.auto()
     FLASH_ATTN_VLLM_V1 = enum.auto()
@@ -221,6 +227,17 @@ def get_cpu_architecture(cls) -> CpuArchEnum:
 
         return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN
 
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        """Checks whether pin memory is available on the current platform."""
+        if in_wsl():
+            # Pinning memory in WSL is not supported.
+            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+            logger.warning("Using 'pin_memory=False' as WSL is detected. "
+                           "This may slow down the performance.")
+            return False
+        return True
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 1e5c4bddfa24f..86113523385f6 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -1,5 +1,7 @@
 from typing import TYPE_CHECKING, Optional
 
+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum
 
 if TYPE_CHECKING:
@@ -7,6 +9,8 @@
 else:
     VllmConfig = None
 
+logger = init_logger(__name__)
+
 
 class NeuronPlatform(Platform):
     _enum = PlatformEnum.NEURON
@@ -28,3 +32,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = \
                 "vllm.worker.neuron_worker.NeuronWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on Neuron.")
+        return False
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index e0f8e8b4b49fe..ccd94e8adb3b1 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -34,7 +34,7 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
         return _Backend.OPENVINO
 
     @classmethod
-    def get_device_name(self, device_id: int = 0) -> str:
+    def get_device_name(cls, device_id: int = 0) -> str:
         return "openvino"
 
     @classmethod
@@ -42,19 +42,19 @@ def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return False
 
     @classmethod
-    def inference_mode(self):
+    def inference_mode(cls):
         return torch.inference_mode(mode=True)
 
     @classmethod
-    def is_openvino_cpu(self) -> bool:
+    def is_openvino_cpu(cls) -> bool:
         return "CPU" in envs.VLLM_OPENVINO_DEVICE
 
     @classmethod
-    def is_openvino_gpu(self) -> bool:
+    def is_openvino_gpu(cls) -> bool:
         return "GPU" in envs.VLLM_OPENVINO_DEVICE
 
     @classmethod
-    def is_pin_memory_available(self) -> bool:
+    def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on OpenViNO.")
         return False
 
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 11dbd04d55671..c20190e789d7e 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -78,3 +78,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             parallel_config.distributed_executor_backend = "ray"
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on XPU.")
+        return False
diff --git a/vllm/utils.py b/vllm/utils.py
index 1882264c19775..fbc3ef7fa7f89 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -24,7 +24,6 @@
 from collections import UserDict, defaultdict
 from collections.abc import Iterable, Mapping
 from functools import lru_cache, partial, wraps
-from platform import uname
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
                     Dict, Generic, Hashable, List, Literal, Optional,
                     OrderedDict, Set, Tuple, Type, TypeVar, Union, overload)
@@ -344,12 +343,6 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
 
-@lru_cache(maxsize=None)
-def in_wsl() -> bool:
-    # Reference: https://github.com/microsoft/WSL/issues/4071
-    return "microsoft" in " ".join(uname()).lower()
-
-
 def make_async(
     func: Callable[P, T],
     executor: Optional[concurrent.futures.Executor] = None
@@ -729,25 +722,7 @@ def print_warning_once(msg: str) -> None:
 
 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:
-
-    if in_wsl():
-        # Pinning memory in WSL is not supported.
-        # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
-        print_warning_once("Using 'pin_memory=False' as WSL is detected. "
-                           "This may slow down the performance.")
-        return False
-    elif current_platform.is_xpu():
-        print_warning_once("Pin memory is not supported on XPU.")
-        return False
-    elif current_platform.is_neuron():
-        print_warning_once("Pin memory is not supported on Neuron.")
-        return False
-    elif current_platform.is_hpu():
-        print_warning_once("Pin memory is not supported on HPU.")
-        return False
-    elif current_platform.is_cpu() or current_platform.is_openvino():
-        return False
-    return True
+    return current_platform.is_pin_memory_available()
 
 
 class DeviceMemoryProfiler:
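
---

Note: the net effect of this patch is that the platform-specific branching that
used to live in vllm.utils.is_pin_memory_available is replaced by a single
dispatch through current_platform, with each Platform subclass owning its own
answer. Below is a minimal, self-contained sketch of that dispatch pattern;
the stripped-down Platform base class, the NeuronLikePlatform name, and the
module-level current_platform variable are illustrative stand-ins, not vLLM's
actual classes.

import logging

logger = logging.getLogger(__name__)


class Platform:
    """Stand-in for vllm.platforms.interface.Platform."""

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        # Base-class default: assume pinning works. (The real base
        # method additionally returns False when WSL is detected.)
        return True


class NeuronLikePlatform(Platform):
    """Illustrative override, mirroring the per-platform hooks added here."""

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        logger.warning("Pin memory is not supported on Neuron.")
        return False


# Stand-in for vllm.platforms.current_platform, which vLLM resolves once
# based on the detected hardware.
current_platform = NeuronLikePlatform()


def is_pin_memory_available() -> bool:
    # vllm.utils.is_pin_memory_available now simply delegates.
    return current_platform.is_pin_memory_available()


print(is_pin_memory_available())  # -> False, with a warning logged

The design payoff: supporting a new accelerator no longer means growing the
if/elif chain in vllm/utils.py; a new backend just overrides the classmethod
on its own Platform subclass.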