Miscellaneous cosmetic changes #166

Merged · 2 commits · Sep 4, 2024
csrc/custom/custom.cu (1 change: 0 additions & 1 deletion)
@@ -1,7 +1,6 @@
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda_runtime.h>
#include "core/registration.h"

// declare templates for front (cpp) and back (cuda) sides of function:
// template <typename T>
vllm/_custom_ops.py (2 changes: 1 addition & 1 deletion)
@@ -646,7 +646,7 @@ def register_buffer(fa: int, t: torch.Tensor, handles: List[str],
return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets)


-def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]:
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)


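The annotation fix above records that the custom all-reduce op returns a tensor (presumably the serialized IPC handle data) plus integer offsets, not a list of strings. A minimal caller-side sketch written against the corrected signature; the wrapper name and the unpacking below are illustrative, not part of the PR:

from typing import List, Tuple

import torch


def fetch_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
    # Hypothetical thin wrapper: forwards to the custom all-reduce op and
    # unpacks the (handle tensor, per-rank offsets) pair the corrected
    # annotation now advertises.
    handles, offsets = torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
    return handles, offsets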
vllm/entrypoints/sync_openai/api_server.py (8 changes: 5 additions & 3 deletions)
@@ -174,9 +174,11 @@ async def _check_model(request: Union[CompletionRequest,

async def _guided_decode_logits_processor(request, tokenizer):
decoding_config = runner.engine_config.decoding_config
-    assert decoding_config is not None
-    guided_decoding_backend = (request.guided_decoding_backend
-                               or decoding_config.guided_decoding_backend)
+    if request.guided_decoding_backend:
+        guided_decoding_backend = request.guided_decoding_backend
+    else:
+        assert decoding_config is not None
+        guided_decoding_backend = decoding_config.guided_decoding_backend
return await get_guided_decoding_logits_processor(guided_decoding_backend,
request, tokenizer)

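The refactor above narrows when the decoding_config assert can fire: it is now only reached when the request does not name a guided_decoding_backend of its own. A small standalone sketch of that control flow, using a hypothetical stand-in class rather than vLLM's real DecodingConfig:

from dataclasses import dataclass
from typing import Optional


@dataclass
class StubDecodingConfig:
    # Hypothetical stand-in for the engine-wide decoding config.
    guided_decoding_backend: str = "outlines"


def pick_guided_decoding_backend(
        request_backend: Optional[str],
        decoding_config: Optional[StubDecodingConfig]) -> str:
    # Prefer the backend named on the request; only the fallback path needs
    # the engine config, so the assert moves under the else branch.
    if request_backend:
        return request_backend
    assert decoding_config is not None
    return decoding_config.guided_decoding_backend

For example, pick_guided_decoding_backend(None, StubDecodingConfig()) falls back to the engine default, while pick_guided_decoding_backend("lm-format-enforcer", None) never touches the config at all.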
vllm/platforms/rocm.py (9 changes: 4 additions & 5 deletions)
@@ -34,7 +34,7 @@
# the major benefit of using AMDSMI is that it will not initialize CUDA


-def with_nvml_context(fn):
+def with_amdsmi_context(fn):

@wraps(fn)
def wrapper(*args, **kwargs):
@@ -65,12 +65,11 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
return torch.cuda.get_device_capability(device_id)

@staticmethod
-@with_nvml_context
+@with_amdsmi_context
def is_full_nvlink(physical_device_ids: List[int]) -> bool:
"""
-query if the set of gpus are fully connected by xgmi (1 hop)
+Query if the set of gpus are fully connected by xgmi (1 hop)
"""
-# On ROCm, we instead query if GPUs are connected by 1 hop XGMI
handles = [
amdsmi_get_processor_handles()[i] for i in physical_device_ids
]
@@ -90,7 +89,7 @@ def is_full_nvlink(physical_device_ids: List[int]) -> bool:
return True

@staticmethod
-@with_nvml_context
+@with_amdsmi_context
@lru_cache(maxsize=8)
def get_device_name(device_id: int = 0) -> str:
physical_device_id = device_id_to_physical_device_id(device_id)
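The rename from with_nvml_context to with_amdsmi_context matches what the decorator actually wraps on ROCm. Its body is collapsed in the diff above, so the following is only a sketch of the usual shape of such a decorator (initialize the AMD SMI library, run the wrapped call, shut the library down); the amdsmi calls are assumptions, not copied from the file:

from functools import wraps

import amdsmi  # AMD SMI Python bindings; assumed to be available on ROCm


def with_amdsmi_context(fn):
    # Sketch only: keep the AMD SMI library initialized for the duration of
    # the wrapped call, then tear it down again.
    @wraps(fn)
    def wrapper(*args, **kwargs):
        amdsmi.amdsmi_init()
        try:
            return fn(*args, **kwargs)
        finally:
            amdsmi.amdsmi_shut_down()

    return wrapper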