From d1fa714cb1c9a708d7da0de27c99f7eee07fe663 Mon Sep 17 00:00:00 2001
From: Chenguang Li <757486878@qq.com>
Date: Fri, 13 Dec 2024 21:39:00 +0800
Subject: [PATCH] [Refactor] A simple device-related refactor (#11163)

Signed-off-by: noemotiovon <757486878@qq.com>
Co-authored-by: noemotiovon <757486878@qq.com>
---
 vllm/platforms/cpu.py       |  5 +++++
 vllm/platforms/hpu.py       |  9 +++++++++
 vllm/platforms/interface.py | 17 +++++++++++++++++
 vllm/platforms/neuron.py    |  9 +++++++++
 vllm/platforms/openvino.py  | 10 +++++-----
 vllm/platforms/xpu.py       |  5 +++++
 vllm/utils.py               | 27 +--------------------------
 7 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index e5142b985d1f2..aad8755d9fcd8 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -98,3 +98,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                     "vllm.worker.cpu_worker.CPUWorker"
             else:
                 parallel_config.worker_cls = "vllm.worker.cpu_worker.CPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on CPU.")
+        return False
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 7f22bee3eaa74..2b947d280f9f8 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -2,6 +2,8 @@
 
 import torch
 
+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum, _Backend
 
 if TYPE_CHECKING:
@@ -9,6 +11,8 @@
 else:
     VllmConfig = None
 
+logger = init_logger(__name__)
+
 
 class HpuPlatform(Platform):
     _enum = PlatformEnum.HPU
@@ -43,3 +47,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         parallel_config = vllm_config.parallel_config
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on HPU.")
+        return False
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index db06d2c18e681..4150b0cdf836a 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -1,6 +1,7 @@
 import enum
 import platform
 import random
+from platform import uname
 from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union
 
 import numpy as np
@@ -16,6 +17,11 @@
 logger = init_logger(__name__)
 
 
+def in_wsl() -> bool:
+    # Reference: https://github.com/microsoft/WSL/issues/4071
+    return "microsoft" in " ".join(uname()).lower()
+
+
 class _Backend(enum.Enum):
     FLASH_ATTN = enum.auto()
     FLASH_ATTN_VLLM_V1 = enum.auto()
@@ -221,6 +227,17 @@ def get_cpu_architecture(cls) -> CpuArchEnum:
 
         return CpuArchEnum.OTHER if machine else CpuArchEnum.UNKNOWN
 
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        """Checks whether pin memory is available on the current platform."""
+        if in_wsl():
+            # Pinning memory in WSL is not supported.
+            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+            logger.warning("Using 'pin_memory=False' as WSL is detected. "
+                           "This may slow down the performance.")
+            return False
+        return True
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 1e5c4bddfa24f..86113523385f6 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -1,5 +1,7 @@
 from typing import TYPE_CHECKING, Optional
 
+from vllm.logger import init_logger
+
 from .interface import Platform, PlatformEnum
 
 if TYPE_CHECKING:
@@ -7,6 +9,8 @@
 else:
     VllmConfig = None
 
+logger = init_logger(__name__)
+
 
 class NeuronPlatform(Platform):
     _enum = PlatformEnum.NEURON
@@ -28,3 +32,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = \
                 "vllm.worker.neuron_worker.NeuronWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls) -> bool:
+        logger.warning("Pin memory is not supported on Neuron.")
+        return False
diff --git a/vllm/platforms/openvino.py b/vllm/platforms/openvino.py
index e0f8e8b4b49fe..ccd94e8adb3b1 100644
--- a/vllm/platforms/openvino.py
+++ b/vllm/platforms/openvino.py
@@ -34,7 +34,7 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
         return _Backend.OPENVINO
 
     @classmethod
-    def get_device_name(self, device_id: int = 0) -> str:
+    def get_device_name(cls, device_id: int = 0) -> str:
         return "openvino"
 
     @classmethod
@@ -42,19 +42,19 @@ def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return False
 
     @classmethod
-    def inference_mode(self):
+    def inference_mode(cls):
         return torch.inference_mode(mode=True)
 
     @classmethod
-    def is_openvino_cpu(self) -> bool:
+    def is_openvino_cpu(cls) -> bool:
         return "CPU" in envs.VLLM_OPENVINO_DEVICE
 
     @classmethod
-    def is_openvino_gpu(self) -> bool:
+    def is_openvino_gpu(cls) -> bool:
         return "GPU" in envs.VLLM_OPENVINO_DEVICE
 
     @classmethod
-    def is_pin_memory_available(self) -> bool:
+    def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on OpenViNO.")
         return False
 
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 11dbd04d55671..c20190e789d7e 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -78,3 +78,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             parallel_config.distributed_executor_backend = "ray"
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.worker.xpu_worker.XPUWorker"
+
+    @classmethod
+    def is_pin_memory_available(cls):
+        logger.warning("Pin memory is not supported on XPU.")
+        return False
diff --git a/vllm/utils.py b/vllm/utils.py
index 1882264c19775..fbc3ef7fa7f89 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -24,7 +24,6 @@
 from collections import UserDict, defaultdict
 from collections.abc import Iterable, Mapping
 from functools import lru_cache, partial, wraps
-from platform import uname
 from typing import (TYPE_CHECKING, Any, AsyncGenerator, Awaitable, Callable,
                     Dict, Generic, Hashable, List, Literal, Optional,
                     OrderedDict, Set, Tuple, Type, TypeVar, Union, overload)
@@ -344,12 +343,6 @@ def random_uuid() -> str:
     return str(uuid.uuid4().hex)
 
 
-@lru_cache(maxsize=None)
-def in_wsl() -> bool:
-    # Reference: https://github.com/microsoft/WSL/issues/4071
-    return "microsoft" in " ".join(uname()).lower()
-
-
 def make_async(
     func: Callable[P, T],
     executor: Optional[concurrent.futures.Executor] = None
@@ -729,25 +722,7 @@ def print_warning_once(msg: str) -> None:
 
 @lru_cache(maxsize=None)
 def is_pin_memory_available() -> bool:
-
-    if in_wsl():
-        # Pinning memory in WSL is not supported.
-        # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
-        print_warning_once("Using 'pin_memory=False' as WSL is detected. "
-                           "This may slow down the performance.")
-        return False
-    elif current_platform.is_xpu():
-        print_warning_once("Pin memory is not supported on XPU.")
-        return False
-    elif current_platform.is_neuron():
-        print_warning_once("Pin memory is not supported on Neuron.")
-        return False
-    elif current_platform.is_hpu():
-        print_warning_once("Pin memory is not supported on HPU.")
-        return False
-    elif current_platform.is_cpu() or current_platform.is_openvino():
-        return False
-    return True
+    return current_platform.is_pin_memory_available()
 
 
 class DeviceMemoryProfiler:
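
---

Note: the net effect of this patch is that the platform-specific branching that
used to live in vllm.utils.is_pin_memory_available is replaced by a single
dispatch through current_platform, with each Platform subclass owning its own
answer. Below is a minimal, self-contained sketch of that dispatch pattern;
the stripped-down Platform base class, the NeuronLikePlatform name, and the
module-level current_platform variable are illustrative stand-ins, not vLLM's
actual classes.

import logging

logger = logging.getLogger(__name__)


class Platform:
    """Stand-in for vllm.platforms.interface.Platform."""

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        # Base-class default: assume pinning works. (The real base
        # method additionally returns False when WSL is detected.)
        return True


class NeuronLikePlatform(Platform):
    """Illustrative override, mirroring the per-platform hooks added here."""

    @classmethod
    def is_pin_memory_available(cls) -> bool:
        logger.warning("Pin memory is not supported on Neuron.")
        return False


# Stand-in for vllm.platforms.current_platform, which vLLM resolves once
# based on the detected hardware.
current_platform = NeuronLikePlatform()


def is_pin_memory_available() -> bool:
    # vllm.utils.is_pin_memory_available now simply delegates.
    return current_platform.is_pin_memory_available()


print(is_pin_memory_available())  # -> False, with a warning logged

The design payoff: supporting a new accelerator no longer means growing the
if/elif chain in vllm/utils.py; a new backend just overrides the classmethod
on its own Platform subclass.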