From 7edb2fd71247d60b6259f55a3c0c33bbbf4cf09c Mon Sep 17 00:00:00 2001
From: Matt Wong <156021403+mawong-amd@users.noreply.github.com>
Date: Wed, 4 Sep 2024 10:19:25 -0500
Subject: [PATCH] Update Dockerfile to 6.2, update ROCm components, remove Cython (#166)

* Miscellaneous changes, Dockerfile components update, remove Cython

* Restore Dockerfile and Cython for now
---
 csrc/custom/custom.cu                      | 1 -
 vllm/_custom_ops.py                        | 2 +-
 vllm/entrypoints/sync_openai/api_server.py | 8 +++++---
 vllm/platforms/rocm.py                     | 9 ++++-----
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/csrc/custom/custom.cu b/csrc/custom/custom.cu
index e4826b80de769..fae1b4fbfbe33 100644
--- a/csrc/custom/custom.cu
+++ b/csrc/custom/custom.cu
@@ -1,7 +1,6 @@
 #include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
-#include "core/registration.h"
 
 // declare templates for front (cpp) and back (cuda) sides of function:
 // template <typename T>
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 2e7118f23d8ab..10bae4042cfe7 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -646,7 +646,7 @@ def register_buffer(fa: int, t: torch.Tensor, handles: List[str],
     return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets)
 
 
-def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]:
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
     return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
 
 
diff --git a/vllm/entrypoints/sync_openai/api_server.py b/vllm/entrypoints/sync_openai/api_server.py
index 4c05742d6a78d..c22bdf1903f3b 100644
--- a/vllm/entrypoints/sync_openai/api_server.py
+++ b/vllm/entrypoints/sync_openai/api_server.py
@@ -174,9 +174,11 @@ async def _check_model(request: Union[CompletionRequest,
 
 async def _guided_decode_logits_processor(request, tokenizer):
     decoding_config = runner.engine_config.decoding_config
-    assert decoding_config is not None
-    guided_decoding_backend = (request.guided_decoding_backend
-                               or decoding_config.guided_decoding_backend)
+    if request.guided_decoding_backend:
+        guided_decoding_backend = request.guided_decoding_backend
+    else:
+        assert decoding_config is not None
+        guided_decoding_backend = decoding_config.guided_decoding_backend
     return await get_guided_decoding_logits_processor(guided_decoding_backend,
                                                       request, tokenizer)
 
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index d3e325d8a613d..e5f6404949950 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -34,7 +34,7 @@
 
 
 # the major benefit of using AMDSMI is that it will not initialize CUDA
-def with_nvml_context(fn):
+def with_amdsmi_context(fn):
 
     @wraps(fn)
     def wrapper(*args, **kwargs):
@@ -65,12 +65,11 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
         return torch.cuda.get_device_capability(device_id)
 
     @staticmethod
-    @with_nvml_context
+    @with_amdsmi_context
     def is_full_nvlink(physical_device_ids: List[int]) -> bool:
         """
-        query if the set of gpus are fully connected by xgmi (1 hop)
+        Query if the set of gpus are fully connected by xgmi (1 hop)
         """
-        # On ROCm, we instead query if GPUs are connected by 1 hop XGMI
         handles = [
             amdsmi_get_processor_handles()[i] for i in physical_device_ids
         ]
@@ -90,7 +89,7 @@ def is_full_nvlink(physical_device_ids: List[int]) -> bool:
         return True
 
     @staticmethod
-    @with_nvml_context
+    @with_amdsmi_context
     @lru_cache(maxsize=8)
     def get_device_name(device_id: int = 0) -> str:
         physical_device_id = device_id_to_physical_device_id(device_id)