diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index befb0499f2e68..85dfda8dbb532 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,11 +1,6 @@
 # Default ROCm 6.1 base image
 ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
 
-# Tested and supported base rocm/pytorch images
-ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \
-    ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
-    ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
-
 # Default ROCm ARCHes to build vLLM for.
 ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
 
@@ -54,18 +49,6 @@ RUN pip install --upgrade pip
 RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)"
 # Install torch == 2.5.0 on ROCm
 RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-5.7"*) \
-            pip uninstall -y torch torchaudio torchvision \
-            && pip install --no-cache-dir --pre \
-                torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
-                torchvision==0.20.0.dev20240710 \
-                --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \
-        *"rocm-6.0"*) \
-            pip uninstall -y torch torchaudio torchvision \
-            && pip install --no-cache-dir --pre \
-                torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
-                torchvision==0.20.0.dev20240710 \
-                --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \
         *"rocm-6.1"*) \
             pip uninstall -y torch torchaudio torchvision \
             && pip install --no-cache-dir --pre \
@@ -104,11 +87,6 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
     && cd flash-attention \
     && git checkout "${FA_BRANCH}" \
     && git submodule update --init \
-    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-5.7"*) \
-            export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \
-            && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \
-        *) ;; esac \
    && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
    # Create an empty directory otherwise as later build stages expect one
    else mkdir -p /install; \
@@ -161,12 +139,9 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
     --mount=type=cache,target=/root/.cache/pip \
     pip install -U -r requirements-rocm.txt \
     && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.0"*) \
-            patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \
         *"rocm-6.1"*) \
             # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
-            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \
-            && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \
+            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
             # Prevent interference if torch bundles its own HIP runtime
             && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
         *) ;; esac \
diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
index cc41d47296f8d..1f9e4fabc4fc9 100644
--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -3,7 +3,7 @@
 Installation with ROCm
 ======================
 
-vLLM supports AMD GPUs with ROCm 5.7 and 6.0.
+vLLM supports AMD GPUs with ROCm 6.1.
 
 Requirements
 ------------
@@ -11,7 +11,7 @@ Requirements
 * OS: Linux
 * Python: 3.8 -- 3.11
 * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
-* ROCm 6.0 and ROCm 5.7
+* ROCm 6.1
 
 Installation options:
 
@@ -27,10 +27,10 @@
 You can build and install vLLM from source.
 
 First, build a docker image from `Dockerfile.rocm `_ and launch a docker container from the image.
-`Dockerfile.rocm `_ uses ROCm 6.0 by default, but also supports ROCm 5.7.
+`Dockerfile.rocm `_ uses ROCm 6.1 by default, but also supports ROCm 5.7 and 6.0 in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:
 
-* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
+* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image.
 * `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) `_, this should be set to 0 before flash-attention supports this target.
 * `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
 * `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo `_. The default is `ae7928c`
@@ -39,24 +39,17 @@ It provides flexibility to customize the build of docker image using the following arguments:
 
 Their values can be passed in when running ``docker build`` with ``--build-arg`` options.
 
-To build vllm on ROCm 6.0 for MI200 and MI300 series, you can use the default:
+To build vllm on ROCm 6.1 for MI200 and MI300 series, you can use the default:
 
 .. code-block:: console
 
-    $ docker build -f Dockerfile.rocm -t vllm-rocm .
+    $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
 
-To build vllm on ROCm 6.0 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
+To build vllm on ROCm 6.1 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
 
 .. code-block:: console
 
-    $ docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
-
-To build docker image for vllm on ROCm 5.7, you can specify ``BASE_IMAGE`` as below:
-
-.. code-block:: console
-
-    $ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
-    -f Dockerfile.rocm -t vllm-rocm .
+    $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
 
 To run the above docker image ``vllm-rocm``, use the below command:
@@ -85,25 +78,12 @@ Option 2: Build from source
 
 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
 
 - `ROCm `_
-- `Pytorch `_
+- `PyTorch `_
 - `hipBLAS `_
 
-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
-
-Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started `_
-
-For rocm6.0:
-
-.. code-block:: console
-
-    $ pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.0
-
-
-For rocm5.7:
-
-.. code-block:: console
+For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch-nightly`.
 
-    $ pip install torch --index-url https://download.pytorch.org/whl/rocm5.7
+Alternatively, you can install PyTorch using PyTorch wheels. You can check the PyTorch installation guide in PyTorch `Getting Started `_.
 
 1. Install `Triton flash attention for ROCm `_
@@ -115,8 +95,6 @@ Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton `_
 Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/flash-attention `_
 
 .. note::
-    - If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
-    - If you fail to install `ROCm/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
     - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
     - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
@@ -131,7 +109,6 @@ Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/flash-attention `_
 
 .. tip::
-    - You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation.
     - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
-    - To use CK flash-attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention.
-    - The ROCm version of pytorch, ideally, should match the ROCm driver version.
+    - To use CK flash-attention or PyTorch naive attention, please use this flag ``export VLLM_USE_TRITON_FLASH_ATTN=0`` to turn off triton flash attention.
+    - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
 
diff --git a/rocm_patch/rocm_bf16.patch b/rocm_patch/rocm_bf16.patch
deleted file mode 100644
index a0f07da2a3e2b..0000000000000
--- a/rocm_patch/rocm_bf16.patch
+++ /dev/null
@@ -1,15 +0,0 @@
---- amd_hip_bf16.h      2024-02-06 18:28:58.268699142 +0000
-+++ amd_hip_bf16.h.new  2024-02-06 18:28:31.988647133 +0000
-@@ -90,10 +90,10 @@
- #include "math_fwd.h" // ocml device functions
- 
- #if defined(__HIPCC_RTC__)
--#define __HOST_DEVICE__ __device__
-+#define __HOST_DEVICE__ __device__ static
- #else
- #include 
--#define __HOST_DEVICE__ __host__ __device__
-+#define __HOST_DEVICE__ __host__ __device__ static inline
- #endif
- 
- // Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on
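
Both the torch install and the runtime patching in ``Dockerfile.rocm`` key off the ROCm version detected under ``/opt``. As a quick reference, here is a minimal standalone sketch of that detection using the same ``grep`` pattern as the Dockerfile; the ``echo`` messages and the ``head -n1`` guard are illustrative additions, not part of the Dockerfile itself:

.. code-block:: shell

    #!/bin/sh
    # Detect the installed ROCm major.minor from /opt, as Dockerfile.rocm does.
    ROCM_VERSION="$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]' | head -n1)"
    case "${ROCM_VERSION}" in
        *"rocm-6.1"*)
            echo "ROCm 6.1 detected: supported by this Dockerfile";;
        *)
            echo "ROCm '${ROCM_VERSION:-none}' detected: use an older vLLM branch";;
    esac
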
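With the ROCm 5.7/6.0 wheel commands removed from the docs, there is no longer a pip example for a bare-metal setup. A command mirroring what the Dockerfile does on ROCm 6.1 would look like the following; note the version pins are the ones visible in the removed 5.7/6.0 branches of the Dockerfile and are an assumption for 6.1, so they will likely need updating:

.. code-block:: console

    $ pip uninstall -y torch torchaudio torchvision
    $ pip install --no-cache-dir --pre \
        torch==2.5.0.dev20240710 torchaudio==2.4.0.dev20240710 \
        torchvision==0.20.0.dev20240710 \
        --index-url https://download.pytorch.org/whl/nightly/rocm6.1
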
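Since the updated Dockerfile downloads the patched ``libamdhip64.so.6`` directly into ``/opt/rocm/lib`` and deletes any copy bundled inside the torch package, a post-build sanity check along these lines can confirm the swap took effect (a sketch only; exact output varies by image):

.. code-block:: console

    $ # The patched HIP runtime should be the only libamdhip64 in play.
    $ ls -l /opt/rocm/lib/libamdhip64.so.6
    $ ls "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib | grep libamdhip64 \
        || echo "no torch-bundled HIP runtime (expected)"
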
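Finally, the updated tip about ``VLLM_USE_TRITON_FLASH_ATTN=0`` can be exercised as below; the model name is a placeholder, and any vLLM entry point works the same way:

.. code-block:: console

    $ # Turn off Triton flash attention to fall back to CK flash-attention
    $ # (or PyTorch naive attention if flash-attention was not built, e.g. BUILD_FA=0).
    $ export VLLM_USE_TRITON_FLASH_ATTN=0
    $ python3 -m vllm.entrypoints.api_server --model facebook/opt-125m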