From c41825a0e6ea529751734b97a7ef68e237c17010 Mon Sep 17 00:00:00 2001
From: Matthew Wong
Date: Wed, 4 Sep 2024 08:48:44 -0500
Subject: [PATCH] Miscellaneous changes, Dockerfile components update, remove Cython

---
 Dockerfile.rocm                             | 92 +++++++++++++---------
 csrc/custom/custom.cu                       |  1 -
 setup_cython.py                             | 37 ---------
 vllm/_custom_ops.py                         |  2 +-
 vllm/entrypoints/sync_openai/api_server.py  |  8 +-
 vllm/platforms/rocm.py                      |  9 +--
 6 files changed, 65 insertions(+), 84 deletions(-)
 delete mode 100644 setup_cython.py

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 499f55896c35c..b8a505c375205 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,17 +1,18 @@
 # default base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_staging"
 
 ARG COMMON_WORKDIR=/app
 
 # The following ARGs should be "0" or "1". If "1", the respective component will be built and installed on top of the base image
-ARG BUILD_HIPBLASLT="1"
-ARG BUILD_RCCL="1"
-ARG BUILD_FA="1"
+ARG BUILD_HIPBLASLT="0"
+ARG BUILD_RCCL="0"
+ARG BUILD_FA="0"
 ARG BUILD_TRITON="1"
 # This ARG should also be "0" or "1". If "1", the vLLM development directory is obtained via git clone.
 # If "0", it is copied in from the local working directory.
 ARG REMOTE_VLLM="0"
 
+
 # -----------------------
 # vLLM base image
 FROM $BASE_IMAGE AS base
@@ -21,13 +22,35 @@ USER root
 ARG BASE_IMAGE
 ARG COMMON_WORKDIR
 # Used as ARCHes for all components
-ARG ARG_PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH}
+ARG VLLM_ROCM_ARCH="gfx90a;gfx942"
+ENV PYTORCH_ROCM_ARCH=${VLLM_ROCM_ARCH}
 
 # Install some basic utilities
 RUN apt-get update -q -y && apt-get install -q -y python3 python3-pip
 RUN apt-get update -q -y && apt-get install -q -y \
-    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
+    ccache sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
+
+ENV CCACHE_DIR=/root/.cache/ccache
+
+RUN python3 -m pip install --upgrade pip
+# Remove sccache so it doesn't interfere with ccache
+# TODO: implement sccache support across components
+RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
+# Install torch == 2.5.0 on ROCm
+RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
+        *"rocm-6.1"*) \
+        python3 -m pip uninstall -y torch torchvision \
+        && python3 -m pip install --no-cache-dir --pre \
+            torch==2.5.0.dev20240826 \
+            torchvision==0.20.0.dev20240826 \
+            --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \
+        *"rocm-6.2"*) \
+        python3 -m pip uninstall -y torch torchvision \
+        && python3 -m pip install --no-cache-dir --pre \
+            torch==2.5.0.dev20240826 \
+            torchvision==0.20.0.dev20240826 \
+            --index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
+        *) ;; esac
 
 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
 ENV PATH=$PATH:/opt/rocm/bin:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/bin:
@@ -36,10 +59,11 @@ ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/opt/conda/envs/py_3.9/lib/python3.9/
 
 WORKDIR ${COMMON_WORKDIR}
 
+
 # -----------------------
 # hipBLASLt build stages
 FROM base AS build_hipblaslt
-ARG HIPBLASLT_BRANCH="6f65c6e"
+ARG HIPBLASLT_BRANCH="e784b4c"
 RUN git clone https://github.com/ROCm/hipBLASLt \
     && cd hipBLASLt \
     && git checkout ${HIPBLASLT_BRANCH} \
@@ -52,10 +76,11 @@ COPY --from=build_hipblaslt ${COMMON_WORKDIR}/hipBLASLt/build/release/*.deb /
 FROM scratch AS export_hipblaslt_0
 FROM export_hipblaslt_${BUILD_HIPBLASLT} AS export_hipblaslt
 
+
 # -----------------------
 # RCCL build stages
 FROM base AS build_rccl
-ARG RCCL_BRANCH="73221b4"
+ARG RCCL_BRANCH="833435b"
 RUN git clone https://github.com/ROCm/rccl \
     && cd rccl \
     && git checkout ${RCCL_BRANCH} \
@@ -66,14 +91,15 @@ COPY --from=build_rccl ${COMMON_WORKDIR}/rccl/build/release/*.deb /
 FROM scratch AS export_rccl_0
 FROM export_rccl_${BUILD_RCCL} AS export_rccl
 
+
 # -----------------------
 # flash attn build stages
 FROM base AS build_flash_attn
-ARG FA_BRANCH="ae7928c"
+ARG FA_BRANCH="23a2b1c"
 ARG FA_REPO="https://github.com/ROCm/flash-attention.git"
 RUN git clone ${FA_REPO} \
     && cd flash-attention \
-    && git checkout ${FA_BRANCH} \
+    && git checkout "${FA_BRANCH}" \
     && git submodule update --init \
     && GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
 FROM scratch AS export_flash_attn_1
@@ -82,12 +108,15 @@ COPY --from=build_flash_attn ${COMMON_WORKDIR}/flash-attention/dist/*.whl /
 FROM scratch AS export_flash_attn_0
 FROM export_flash_attn_${BUILD_FA} AS export_flash_attn
 
+
 # -----------------------
 # Triton build stages
 FROM base AS build_triton
-ARG TRITON_BRANCH="6ddb79b"
-ARG TRITON_REPO="https://github.com/OpenAI/triton.git"
-RUN git clone ${TRITON_REPO} \
+ARG TRITON_BRANCH="15325e6"
+ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install pybind11 \
+    && git clone ${TRITON_REPO} \
     && cd triton \
     && git checkout ${TRITON_BRANCH} \
     && cd python \
@@ -98,13 +127,15 @@ COPY --from=build_triton ${COMMON_WORKDIR}/triton/python/dist/*.whl /
 FROM scratch AS export_triton_0
 FROM export_triton_${BUILD_TRITON} AS export_triton
 
+
 # AMD-SMI build stages
 FROM base AS build_amdsmi
 RUN cd /opt/rocm/share/amd_smi \
-    && pip wheel . --wheel-dir=dist
+    && python3 -m pip wheel . --wheel-dir=dist
 FROM scratch AS export_amdsmi
 COPY --from=build_amdsmi /opt/rocm/share/amd_smi/dist/*.whl /
 
+
 # -----------------------
 # vLLM (and gradlib) fetch stages
 FROM base AS fetch_vllm_0
@@ -117,6 +148,7 @@ ONBUILD RUN git clone ${VLLM_REPO} \
     && git checkout ${VLLM_BRANCH}
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 
+
 # -----------------------
 # vLLM (and gradlib) build stages
 FROM fetch_vllm AS build_vllm
@@ -130,7 +162,8 @@ if ls /install/*.deb; then \
     && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \
 fi
 # Build vLLM
-RUN cd vllm \
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    cd vllm \
     && python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist
 # Build gradlib
 RUN cd vllm/gradlib \
@@ -154,20 +187,9 @@ ARG COMMON_WORKDIR
 ARG BUILD_FA
 RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 
-# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
-# Manually remove it so that later steps of numpy upgrade can continue
-RUN case "$(which python3)" in \
-        *"/opt/conda/envs/py_3.9"*) \
-        rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
-        *) ;; esac
-
-RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \
-if ls /install/*.deb; then \
-    apt-get purge -y hipblaslt \
-    && dpkg -i /install/*.deb \
-    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
-    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \
-fi
+# Package upgrades for useful functionality and to avoid dependency issues
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
 
 RUN --mount=type=bind,from=export_rccl,src=/,target=/install \
 if ls /install/*.deb; then \
@@ -200,16 +222,14 @@ RUN --mount=type=bind,from=export_amdsmi,src=/,target=/install \
 RUN python3 -m pip install --upgrade numba scipy huggingface-hub[cli]
 
 # Install vLLM (and gradlib)
-# Make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
 RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
     cd /install \
     && pip install -U -r requirements-rocm.txt \
    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.0"*) \
-        patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
         *"rocm-6.1"*) \
-        cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6;; \
+        cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
+        # Prevent interference if torch bundles its own HIP runtime
+        && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
         *) ;; esac \
     && pip uninstall -y vllm gradlib \
     && pip install *.whl
@@ -220,7 +240,6 @@ COPY --from=export_vllm /tests ${COMMON_WORKDIR}/vllm/tests
 COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
 COPY --from=export_vllm /.buildkite ${COMMON_WORKDIR}/vllm/.buildkite
 
-
 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
 
 ENV TOKENIZERS_PARALLELISM=false
@@ -228,4 +247,3 @@ ENV TOKENIZERS_PARALLELISM=false
 ENV HIP_FORCE_DEV_KERNARG=1
 
 CMD ["/bin/bash"]
-
diff --git a/csrc/custom/custom.cu b/csrc/custom/custom.cu
index e4826b80de769..fae1b4fbfbe33 100644
--- a/csrc/custom/custom.cu
+++ b/csrc/custom/custom.cu
@@ -1,7 +1,6 @@
 #include
 #include
 #include
-#include "core/registration.h"
 
 // declare templates for front (cpp) and back (cuda) sides of function:
 // template
diff --git a/setup_cython.py b/setup_cython.py
deleted file mode 100644
index dca79af61a9f6..0000000000000
--- a/setup_cython.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import Cython.Compiler.Options
-from Cython.Build import cythonize
-from setuptools import setup
-
-Cython.Compiler.Options.annotate = True
-
-infiles = []
-
-infiles += [
-    "vllm/engine/llm_engine.py",
-    "vllm/transformers_utils/detokenizer.py",
-    "vllm/engine/output_processor/single_step.py",
-    "vllm/outputs.py",
-    "vllm/engine/output_processor/stop_checker.py",
-]
-
-infiles += [
-    "vllm/core/scheduler.py",
-    "vllm/sequence.py",
-    "vllm/core/block_manager_v1.py",
-]
-
-infiles += [
-    "vllm/model_executor/layers/sampler.py",
-    "vllm/sampling_params.py",
-    "vllm/utils.py",
-]
-
-setup(ext_modules=cythonize(infiles,
-                            annotate=False,
-                            force=True,
-                            compiler_directives={
-                                'language_level': "3",
-                                'infer_types': True
-                            }))
-
-# example usage: python3 setup_cython.py build_ext --inplace
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 2e7118f23d8ab..10bae4042cfe7 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -646,7 +646,7 @@ def register_buffer(fa: int, t: torch.Tensor, handles: List[str],
     return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets)
 
 
-def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]:
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[torch.Tensor, List[int]]:
     return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
 
 
diff --git a/vllm/entrypoints/sync_openai/api_server.py b/vllm/entrypoints/sync_openai/api_server.py
index 4c05742d6a78d..c22bdf1903f3b 100644
--- a/vllm/entrypoints/sync_openai/api_server.py
+++ b/vllm/entrypoints/sync_openai/api_server.py
@@ -174,9 +174,11 @@ async def _check_model(request: Union[CompletionRequest,
 
 async def _guided_decode_logits_processor(request, tokenizer):
     decoding_config = runner.engine_config.decoding_config
-    assert decoding_config is not None
-    guided_decoding_backend = (request.guided_decoding_backend
-                               or decoding_config.guided_decoding_backend)
+    if request.guided_decoding_backend:
+        guided_decoding_backend = request.guided_decoding_backend
+    else:
+        assert decoding_config is not None
+        guided_decoding_backend = decoding_config.guided_decoding_backend
     return await get_guided_decoding_logits_processor(guided_decoding_backend,
                                                       request, tokenizer)
 
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index d3e325d8a613d..e5f6404949950 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -34,7 +34,7 @@
 
 
 # the major benefit of using AMDSMI is that it will not initialize CUDA
-def with_nvml_context(fn):
+def with_amdsmi_context(fn):
 
     @wraps(fn)
     def wrapper(*args, **kwargs):
@@ -65,12 +65,11 @@ def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
         return torch.cuda.get_device_capability(device_id)
 
     @staticmethod
-    @with_nvml_context
+    @with_amdsmi_context
     def is_full_nvlink(physical_device_ids: List[int]) -> bool:
         """
-        query if the set of gpus are fully connected by xgmi (1 hop)
+        Query whether the set of GPUs is fully connected by XGMI (1 hop)
         """
-        # On ROCm, we instead query if GPUs are connected by 1 hop XGMI
         handles = [
             amdsmi_get_processor_handles()[i] for i in physical_device_ids
         ]
@@ -90,7 +89,7 @@ def is_full_nvlink(physical_device_ids: List[int]) -> bool:
         return True
 
     @staticmethod
-    @with_nvml_context
+    @with_amdsmi_context
     @lru_cache(maxsize=8)
     def get_device_name(device_id: int = 0) -> str:
         physical_device_id = device_id_to_physical_device_id(device_id)
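
Usage note: with this change the component flags (BUILD_HIPBLASLT, BUILD_RCCL, BUILD_FA) default to "0", so only Triton is rebuilt on top of the base image by default and the other components come from the base image. A build that opts back into the source-built components might look like the sketch below; the ARG names and the architecture list come from the Dockerfile above, while the image tag is an illustrative assumption, not part of the patch. The RUN --mount cache syntax added here requires BuildKit.

    # illustrative sketch, not part of the patch: rebuild hipBLASLt, RCCL and
    # flash-attention from source instead of taking them from the base image
    DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm \
        --build-arg BUILD_HIPBLASLT="1" \
        --build-arg BUILD_RCCL="1" \
        --build-arg BUILD_FA="1" \
        --build-arg VLLM_ROCM_ARCH="gfx90a;gfx942" \
        -t vllm-rocm .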