From 7edb2fd71247d60b6259f55a3c0c33bbbf4cf09c Mon Sep 17 00:00:00 2001
From: Matt Wong <156021403+mawong-amd@users.noreply.github.com>
Date: Wed, 4 Sep 2024 10:19:25 -0500
Subject: [PATCH] Update Dockerfile to 6.2, update ROCm components, remove Cython (#166)

* Miscellaneous changes, Dockerfile components update, remove Cython

* Restore Dockerfile and Cython for now
---
 csrc/custom/custom.cu                      | 1 -
 vllm/_custom_ops.py                        | 2 +-
 vllm/entrypoints/sync_openai/api_server.py | 8 +++++---
 vllm/platforms/rocm.py                     | 9 ++++-----
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/csrc/custom/custom.cu b/csrc/custom/custom.cu
index e4826b80de769..fae1b4fbfbe33 100644
--- a/csrc/custom/custom.cu
+++ b/csrc/custom/custom.cu
@@ -1,7 +1,6 @@
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-#include "core/registration.h"
 
 // declare templates for front (cpp) and back (cuda) sides of function:
 // template <typename T>
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 2e7118f23d8ab..10bae4042cfe7 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -646,7 +646,7 @@ def register_buffer(fa: int, t: torch.Tensor, handles: List[str],
     return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets)
 
 
-def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]:
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
     return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
 
 
diff --git a/vllm/entrypoints/sync_openai/api_server.py b/vllm/entrypoints/sync_openai/api_server.py
index 4c05742d6a78d..c22bdf1903f3b 100644
--- a/vllm/entrypoints/sync_openai/api_server.py
+++ b/vllm/entrypoints/sync_openai/api_server.py
@@ -174,9 +174,11 @@ async def _check_model(request: Union[CompletionRequest,
 
 async def _guided_decode_logits_processor(request, tokenizer):
     decoding_config = runner.engine_config.decoding_config
-    assert decoding_config is not None
-    guided_decoding_backend = (request.guided_decoding_backend
-                               or decoding_config.guided_decoding_backend)
+    if request.guided_decoding_backend:
+        guided_decoding_backend = request.guided_decoding_backend
+    else:
+        assert decoding_config is not None
+        guided_decoding_backend = decoding_config.guided_decoding_backend
     return await get_guided_decoding_logits_processor(guided_decoding_backend,
                                                       request, tokenizer)
 
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index d3e325d8a613d..e5f6404949950 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -34,7 +34,7 @@
 
 
 # the major benefit of using AMDSMI is that it will not initialize CUDA
-def with_nvml_context(fn):
+def with_amdsmi_context(fn):
 
     @wraps(fn)
     def wrapper(*args, **kwargs):
@@ -65,12 +65,11 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
         return torch.cuda.get_device_capability(device_id)
 
     @staticmethod
-    @with_nvml_context
+    @with_amdsmi_context
     def is_full_nvlink(physical_device_ids: List[int]) -> bool:
         """
-        query if the set of gpus are fully connected by xgmi (1 hop)
+        Query if the set of gpus are fully connected by xgmi (1 hop)
         """
-        # On ROCm, we instead query if GPUs are connected by 1 hop XGMI
         handles = [
             amdsmi_get_processor_handles()[i] for i in physical_device_ids
         ]
@@ -90,7 +89,7 @@ def is_full_nvlink(physical_device_ids: List[int]) -> bool:
         return True
 
     @staticmethod
-    @with_nvml_context
+    @with_amdsmi_context
     @lru_cache(maxsize=8)
     def get_device_name(device_id: int = 0) -> str:
         physical_device_id = device_id_to_physical_device_id(device_id)