From c41825a0e6ea529751734b97a7ef68e237c17010 Mon Sep 17 00:00:00 2001
From: Matthew Wong
Date: Wed, 4 Sep 2024 08:48:44 -0500
Subject: [PATCH] Miscellaneous changes, Dockerfile components update, remove Cython

---
 Dockerfile.rocm                             | 92 +++++++++++++---------
 csrc/custom/custom.cu                       |  1 -
 setup_cython.py                             | 37 ---------
 vllm/_custom_ops.py                         |  2 +-
 vllm/entrypoints/sync_openai/api_server.py  |  8 +-
 vllm/platforms/rocm.py                      |  9 +--
 6 files changed, 65 insertions(+), 84 deletions(-)
 delete mode 100644 setup_cython.py

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 499f55896c35c..b8a505c375205 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,17 +1,18 @@
 # default base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_staging"
 
 ARG COMMON_WORKDIR=/app
 
 # The following ARGs should be "0" or "1". If "1", the respective component will be built and installed on top of the base image
-ARG BUILD_HIPBLASLT="1"
-ARG BUILD_RCCL="1"
-ARG BUILD_FA="1"
+ARG BUILD_HIPBLASLT="0"
+ARG BUILD_RCCL="0"
+ARG BUILD_FA="0"
 ARG BUILD_TRITON="1"
 # This ARG should also be "0" or "1". If "1", the vLLM development directory is obtained via git clone.
 # If "0", it is copied in from the local working directory.
 ARG REMOTE_VLLM="0"
 
+
 # -----------------------
 # vLLM base image
 FROM $BASE_IMAGE AS base
@@ -21,13 +22,35 @@ USER root
 ARG BASE_IMAGE
 ARG COMMON_WORKDIR
 # Used as ARCHes for all components
-ARG ARG_PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH}
+ARG VLLM_ROCM_ARCH="gfx90a;gfx942"
+ENV PYTORCH_ROCM_ARCH=${VLLM_ROCM_ARCH}
 
 # Install some basic utilities
 RUN apt-get update -q -y && apt-get install -q -y python3 python3-pip
 RUN apt-get update -q -y && apt-get install -q -y \
-    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
+    ccache sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+RUN python3 -m pip install --upgrade pip
+# Remove sccache so it doesn't interfere with ccache
+# TODO: implement sccache support across components
+RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+# Install torch == 2.5.0 on ROCm
+RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.1"*) \
+        python3 -m pip uninstall -y torch torchvision \
+        && python3 -m pip install --no-cache-dir --pre \
+            torch==2.5.0.dev20240826 \
+            torchvision==0.20.0.dev20240826 \
+            --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
+        *"rocm-6.2"*) \
+        python3 -m pip uninstall -y torch torchvision \
+        && python3 -m pip install --no-cache-dir --pre \
+            torch==2.5.0.dev20240826 \
+            torchvision==0.20.0.dev20240826 \
+            --index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
+        *) ;; esac
 
 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
 ENV PATH=$PATH:/opt/rocm/bin:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/bin:
@@ -36,10 +59,11 @@ ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/opt/conda/envs/py_3.9/lib/python3.9/
 
 WORKDIR ${COMMON_WORKDIR}
 
+
 # -----------------------
 # hipBLASLt build stages
 FROM base AS build_hipblaslt
-ARG HIPBLASLT_BRANCH="6f65c6e"
+ARG HIPBLASLT_BRANCH="e784b4c"
 RUN git clone https://github.com/ROCm/hipBLASLt \
     && cd hipBLASLt \
     && git checkout ${HIPBLASLT_BRANCH} \
@@ -52,10 +76,11 @@ COPY --from=build_hipblaslt ${COMMON_WORKDIR}/hipBLASLt/build/release/*.deb /
 FROM scratch AS export_hipblaslt_0
 FROM export_hipblaslt_${BUILD_HIPBLASLT} AS export_hipblaslt
 
+
 # -----------------------
 # RCCL build stages
 FROM base AS build_rccl
-ARG RCCL_BRANCH="73221b4"
+ARG RCCL_BRANCH="833435b"
 RUN git clone https://github.com/ROCm/rccl \
     && cd rccl \
     && git checkout ${RCCL_BRANCH} \
@@ -66,14 +91,15 @@ COPY --from=build_rccl ${COMMON_WORKDIR}/rccl/build/release/*.deb /
 FROM scratch AS export_rccl_0
 FROM export_rccl_${BUILD_RCCL} AS export_rccl
 
+
 # -----------------------
 # flash attn build stages
 FROM base AS build_flash_attn
-ARG FA_BRANCH="ae7928c"
+ARG FA_BRANCH="23a2b1c"
 ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
 RUN git clone ${FA_REPO} \
     && cd flash-attention \
-    && git checkout ${FA_BRANCH} \
+    && git checkout "${FA_BRANCH}" \
     && git submodule update --init \
     && GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_flash_attn_1
@@ -82,12 +108,15 @@ COPY --from=build_flash_attn ${COMMON_WORKDIR}/flash-attention/dist/*.whl /
 FROM scratch AS export_flash_attn_0
 FROM export_flash_attn_${BUILD_FA} AS export_flash_attn
 
+
 # -----------------------
 # Triton build stages
 FROM base AS build_triton
-ARG TRITON_BRANCH="6ddb79b"
-ARG TRITON_REPO="https://github.com/OpenAI/triton.git"
-RUN git clone ${TRITON_REPO} \
+ARG TRITON_BRANCH="15325e6"
+ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install pybind11 \
+    && git clone ${TRITON_REPO} \
     && cd triton \
     && git checkout ${TRITON_BRANCH} \
     && cd python \
@@ -98,13 +127,15 @@ COPY --from=build_triton ${COMMON_WORKDIR}/triton/python/dist/*.whl /
 FROM scratch AS export_triton_0
 FROM export_triton_${BUILD_TRITON} AS export_triton
 
+
 # AMD-SMI build stages
 FROM base AS build_amdsmi
 RUN cd /opt/rocm/share/amd_smi \
-    && pip wheel . --wheel-dir=dist
+    && python3 -m pip wheel . --wheel-dir=dist
 FROM scratch AS export_amdsmi
 COPY --from=build_amdsmi /opt/rocm/share/amd_smi/dist/*.whl /
 
+
 # -----------------------
 # vLLM (and gradlib) fetch stages
 FROM base AS fetch_vllm_0
@@ -117,6 +148,7 @@ ONBUILD RUN git clone ${VLLM_REPO} \
     && git checkout ${VLLM_BRANCH}
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 
+
 # -----------------------
 # vLLM (and gradlib) build stages
 FROM fetch_vllm AS build_vllm
@@ -130,7 +162,8 @@ if ls /install/*.deb; then \
     && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \
 fi
 # Build vLLM
-RUN cd vllm \
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    cd vllm \
     && python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist
 # Build gradlib
 RUN cd vllm/gradlib \
@@ -154,20 +187,9 @@ ARG COMMON_WORKDIR
 ARG BUILD_FA
 RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 
-# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
-# Manually remove it so that later steps of numpy upgrade can continue
-RUN case "$(which python3)" in \
-        *"/opt/conda/envs/py_3.9"*) \
-        rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
-        *) ;; esac
-
-RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \
-if ls /install/*.deb; then \
-    apt-get purge -y hipblaslt \
-    && dpkg -i /install/*.deb \
-    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
-    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \
-fi
+# Package upgrades for useful functionality and to avoid dependency issues
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
 
 RUN --mount=type=bind,from=export_rccl,src=/,target=/install \
 if ls /install/*.deb; then \
@@ -200,16 +222,14 @@ RUN --mount=type=bind,from=export_amdsmi,src=/,target=/install \
 RUN python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
 
 # Install vLLM (and gradlib)
-# Make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     cd /install \
     && pip install -U -r requirements-rocm.txt \
    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.0"*) \
-        patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
         *"rocm-6.1"*) \
-        cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6;; \
+        cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
+        # Prevent interference if torch bundles its own HIP runtime
+        && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
         *) ;; esac \
     && pip uninstall -y vllm gradlib \
     && pip install *.whl
@@ -220,7 +240,6 @@ COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests
 COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 COPY --from=export_vllm /.buildkite ${COMMON_WORKDIR}/vllm/.buildkite
 
-
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 
 ENV TOKENIZERS_PARALLELISM=false
@@ -228,4 +247,3 @@ ENV TOKENIZERS_PARALLELISM=false
 ENV HIP_FORCE_DEV_KERNARG=1
 
 CMD ["/bin/bash"]
-
diff --git a/csrc/custom/custom.cu b/csrc/custom/custom.cu
index e4826b80de769..fae1b4fbfbe33 100644
--- a/csrc/custom/custom.cu
+++ b/csrc/custom/custom.cu
@@ -1,7 +1,6 @@
 #include
 #include
 #include
-#include "core/registration.h"
 
 // declare templates for front (cpp) and back (cuda) sides of function:
 // template
diff --git a/setup_cython.py b/setup_cython.py
deleted file mode 100644
index dca79af61a9f6..0000000000000
--- a/setup_cython.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import Cython.Compiler.Options
-from Cython.Build import cythonize
-from setuptools import setup
-
-Cython.Compiler.Options.annotate = True
-
-infiles = []
-
-infiles += [
-    "vllm/engine/llm_engine.py",
-    "vllm/transformers_utils/detokenizer.py",
-    "vllm/engine/output_processor/single_step.py",
-    "vllm/outputs.py",
-    "vllm/engine/output_processor/stop_checker.py",
-]
-
-infiles += [
-    "vllm/core/scheduler.py",
-    "vllm/sequence.py",
-    "vllm/core/block_manager_v1.py",
-]
-
-infiles += [
-    "vllm/model_executor/layers/sampler.py",
-    "vllm/sampling_params.py",
-    "vllm/utils.py",
-]
-
-setup(ext_modules=cythonize(infiles,
-                            annotate=False,
-                            force=True,
-                            compiler_directives={
-                                'language_level': "3",
-                                'infer_types': True
-                            }))
-
-# example usage: python3 setup_cython.py build_ext --inplace
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 2e7118f23d8ab..10bae4042cfe7 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -646,7 +646,7 @@ def register_buffer(fa: int, t: torch.Tensor, handles: List[str],
     return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets)
 
 
-def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]:
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
     return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
 
 
diff --git a/vllm/entrypoints/sync_openai/api_server.py b/vllm/entrypoints/sync_openai/api_server.py
index 4c05742d6a78d..c22bdf1903f3b 100644
--- a/vllm/entrypoints/sync_openai/api_server.py
+++ b/vllm/entrypoints/sync_openai/api_server.py
@@ -174,9 +174,11 @@ async def _check_model(request: Union[CompletionRequest,
 
 async def _guided_decode_logits_processor(request, tokenizer):
     decoding_config = runner.engine_config.decoding_config
-    assert decoding_config is not None
-    guided_decoding_backend = (request.guided_decoding_backend
-                               or decoding_config.guided_decoding_backend)
+    if request.guided_decoding_backend:
+        guided_decoding_backend = request.guided_decoding_backend
+    else:
+        assert decoding_config is not None
+        guided_decoding_backend = decoding_config.guided_decoding_backend
     return await get_guided_decoding_logits_processor(guided_decoding_backend,
                                                       request, tokenizer)
 
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index d3e325d8a613d..e5f6404949950 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -34,7 +34,7 @@
 
 
 # the major benefit of using AMDSMI is that it will not initialize CUDA
-def with_nvml_context(fn):
+def with_amdsmi_context(fn):
 
     @wraps(fn)
     def wrapper(*args, **kwargs):
@@ -65,12 +65,11 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
         return torch.cuda.get_device_capability(device_id)
 
     @staticmethod
-    @with_nvml_context
+    @with_amdsmi_context
     def is_full_nvlink(physical_device_ids: List[int]) -> bool:
         """
-        query if the set of gpus are fully connected by xgmi (1 hop)
+        Query whether the set of GPUs is fully connected by XGMI (1 hop)
         """
-        # On ROCm, we instead query if GPUs are connected by 1 hop XGMI
         handles = [
             amdsmi_get_processor_handles()[i] for i in physical_device_ids
         ]
@@ -90,7 +89,7 @@ def is_full_nvlink(physical_device_ids: List[int]) -> bool:
         return True
 
     @staticmethod
-    @with_nvml_context
+    @with_amdsmi_context
     @lru_cache(maxsize=8)
     def get_device_name(device_id: int = 0) -> str:
         physical_device_id = device_id_to_physical_device_id(device_id)
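
Usage note: with this change the component flags (BUILD_HIPBLASLT, BUILD_RCCL, BUILD_FA) default to "0", so only Triton is rebuilt on top of the base image by default and the other components come from the base image. A build that opts back into the source-built components might look like the sketch below; the ARG names and the architecture list come from the Dockerfile above, while the image tag is an illustrative assumption, not part of the patch. The RUN --mount cache syntax added here requires BuildKit.

    # illustrative sketch, not part of the patch: rebuild hipBLASLt, RCCL and
    # flash-attention from source instead of taking them from the base image
    DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm \
        --build-arg BUILD_HIPBLASLT="1" \
        --build-arg BUILD_RCCL="1" \
        --build-arg BUILD_FA="1" \
        --build-arg VLLM_ROCM_ARCH="gfx90a;gfx942" \
        -t vllm-rocm .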