[Refactor] A simple device-related refactor (vllm-project#11163)
Signed-off-by: noemotiovon <[email protected]>
Co-authored-by: noemotiovon <[email protected]>
noemotiovon authored Dec 13, 2024
1 parent 969da7d commit d1fa714
Showing 7 changed files with 51 additions and 31 deletions.
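
At a glance, the refactor inverts a dependency: instead of vllm/utils.py branching on every device type, each Platform subclass now answers is_pin_memory_available() itself, with the shared WSL check living in the base class. The snippet below is a minimal, self-contained sketch of that pattern, not the commit's code verbatim: the real classes live in vllm/platforms/, and current_platform is selected by vLLM's platform detection rather than hard-coded as it is here.

    from functools import lru_cache
    from platform import uname


    def in_wsl() -> bool:
        # WSL surfaces "microsoft" in its kernel identification string.
        return "microsoft" in " ".join(uname()).lower()


    class Platform:
        @classmethod
        def is_pin_memory_available(cls) -> bool:
            # Base-class default: pinning works everywhere except under WSL.
            return not in_wsl()


    class NeuronPlatform(Platform):
        @classmethod
        def is_pin_memory_available(cls) -> bool:
            # Device-specific knowledge stays with the device class.
            return False


    # Hard-coded only for this sketch; vLLM resolves this by detection.
    current_platform = NeuronPlatform()


    @lru_cache(maxsize=None)
    def is_pin_memory_available() -> bool:
        # utils.py keeps its public entry point but just delegates.
        return current_platform.is_pin_memory_available()


    assert is_pin_memory_available() is False
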
5 changes: 5 additions & 0 deletions vllm/platforms/cpu.py
@@ -98,3 +98,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                     "vllm.worker.cpu_worker.CPUWorker"
             else:
                 parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on CPU.")
+        return False
9 changes: 9 additions & 0 deletions vllm/platforms/hpu.py
@@ -2,13 +2,17 @@

 import torch

+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum, _Backend

 if TYPE_CHECKING:
     from vllm.config import VllmConfig
 else:
     VllmConfig = None

+logger = init_logger(__name__)
+

 class HpuPlatform(Platform):
     _enum = PlatformEnum.HPU
@@ -43,3 +47,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         parallel_config = vllm_config.parallel_config
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on HPU.")
+        return False
17 changes: 17 additions & 0 deletions vllm/platforms/interface.py
@@ -1,6 +1,7 @@
 import enum
 import platform
 import random
+from platform import uname
 from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union

 import numpy as np
@@ -16,6 +17,11 @@
 logger = init_logger(__name__)


+def in_wsl() -> bool:
+    # Reference: https://github.com/microsoft/WSL/issues/4071
+    return "microsoft" in " ".join(uname()).lower()
+
+
 class _Backend(enum.Enum):
     FLASH_ATTN = enum.auto()
     FLASH_ATTN_VLLM_V1 = enum.auto()
@@ -221,6 +227,17 @@ def get_cpu_architecture(cls) -> CpuArchEnum:

         return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN

+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        """Checks whether pin memory is available on the current platform."""
+        if in_wsl():
+            # Pinning memory in WSL is not supported.
+            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+            logger.warning("Using 'pin_memory=False' as WSL is detected. "
+                           "This may slow down the performance.")
+            return False
+        return True
+

 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
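
With the default now in the base class, a backend only overrides the method when the default is wrong for its device. Purely as an illustration (FooPlatform is hypothetical and not part of this commit), an out-of-tree backend would opt out like so:

    from vllm.platforms.interface import Platform, PlatformEnum


    class FooPlatform(Platform):  # hypothetical backend, for illustration only
        _enum = PlatformEnum.UNSPECIFIED  # a real backend registers its own member

        @classmethod
        def is_pin_memory_available(cls) -> bool:
            # Override only because the base default (True outside WSL)
            # does not hold for this device.
            return False
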
9 changes: 9 additions & 0 deletions vllm/platforms/neuron.py
@@ -1,12 +1,16 @@
 from typing import TYPE_CHECKING, Optional

+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum

 if TYPE_CHECKING:
     from vllm.config import VllmConfig
 else:
     VllmConfig = None

+logger = init_logger(__name__)
+

 class NeuronPlatform(Platform):
     _enum = PlatformEnum.NEURON
@@ -28,3 +32,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = \
                 "vllm.worker.neuron_worker.NeuronWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on Neuron.")
+        return False
10 changes: 5 additions & 5 deletions vllm/platforms/openvino.py
@@ -34,27 +34,27 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
         return _Backend.OPENVINO

     @classmethod
-    def get_device_name(self, device_id: int = 0) -> str:
+    def get_device_name(cls, device_id: int = 0) -> str:
         return "openvino"

     @classmethod
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return False

     @classmethod
-    def inference_mode(self):
+    def inference_mode(cls):
         return torch.inference_mode(mode=True)

     @classmethod
-    def is_openvino_cpu(self) -> bool:
+    def is_openvino_cpu(cls) -> bool:
         return "CPU" in envs.VLLM_OPENVINO_DEVICE

     @classmethod
-    def is_openvino_gpu(self) -> bool:
+    def is_openvino_gpu(cls) -> bool:
         return "GPU" in envs.VLLM_OPENVINO_DEVICE

     @classmethod
-    def is_pin_memory_available(self) -> bool:
+    def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on OpenViNO.")
         return False

5 changes: 5 additions & 0 deletions vllm/platforms/xpu.py
@@ -78,3 +78,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             parallel_config.distributed_executor_backend = "ray"
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on XPU.")
+        return False
27 changes: 1 addition & 26 deletions vllm/utils.py
@@ -24,7 +24,6 @@
 from collections import UserDict, defaultdict
 from collections.abc import Iterable, Mapping
 from functools import lru_cache, partial, wraps
-from platform import uname
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
                     Dict, Generic, Hashable, List, Literal, Optional,
                     OrderedDict, Set, Tuple, Type, TypeVar, Union, overload)
@@ -344,12 +343,6 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)


-@lru_cache(maxsize=None)
-def in_wsl() -> bool:
-    # Reference: https://github.com/microsoft/WSL/issues/4071
-    return "microsoft" in " ".join(uname()).lower()
-
-
 def make_async(
     func: Callable[P, T],
     executor: Optional[concurrent.futures.Executor] = None
@@ -729,25 +722,7 @@ def print_warning_once(msg: str) -> None:

 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:
-
-    if in_wsl():
-        # Pinning memory in WSL is not supported.
-        # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
-        print_warning_once("Using 'pin_memory=False' as WSL is detected. "
-                           "This may slow down the performance.")
-        return False
-    elif current_platform.is_xpu():
-        print_warning_once("Pin memory is not supported on XPU.")
-        return False
-    elif current_platform.is_neuron():
-        print_warning_once("Pin memory is not supported on Neuron.")
-        return False
-    elif current_platform.is_hpu():
-        print_warning_once("Pin memory is not supported on HPU.")
-        return False
-    elif current_platform.is_cpu() or current_platform.is_openvino():
-        return False
-    return True
+    return current_platform.is_pin_memory_available()


 class DeviceMemoryProfiler:
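
Call sites do not change. Note that @lru_cache still wraps the delegating helper, so whichever platform warning fires is logged at most once per process. A usage sketch, assuming a Linux host with CUDA outside WSL:

    import torch

    from vllm.utils import is_pin_memory_available

    # Delegates to current_platform.is_pin_memory_available(); the result
    # is memoized, so the platform's warning (if any) appears only once.
    pin_memory = is_pin_memory_available()

    # Typical consumer: host-side staging buffers for host-to-device copies.
    buf = torch.empty(16 << 20, dtype=torch.uint8, pin_memory=pin_memory)
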
