From 05af6da8d927f70d15ab1ed25b01df3c967ad961 Mon Sep 17 00:00:00 2001 From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com> Date: Mon, 4 Mar 2024 21:14:53 -0500 Subject: [PATCH] [ROCm] enable cupy in order to enable cudagraph mode for AMD GPUs (#3123) Co-authored-by: lcskrishna --- Dockerfile.rocm | 30 +++++++++++++++++++++++++----- vllm/worker/worker.py | 4 +--- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 54ae06be6e101..a45265d79a6ac 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -23,6 +23,9 @@ RUN echo "FA_BRANCH is $FA_BRANCH" # In that case, we need to use the python reference attention implementation in vllm ARG BUILD_FA="1" +# whether to build cupy on rocm +ARG BUILD_CUPY="1" + # Install some basic utilities RUN apt-get update && apt-get install python3 python3-pip -y @@ -70,16 +73,33 @@ RUN if [ "$BUILD_FA" = "1" ]; then \ && cd ..; \ fi -COPY ./ /app/vllm - -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install xformers==0.0.23 --no-deps - # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. # Manually removed it so that later steps of numpy upgrade can continue RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi +# build cupy +RUN if [ "$BUILD_CUPY" = "1" ]; then \ + mkdir -p libs \ + && cd libs \ + && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \ + && cd cupy \ + && pip install mpi4py-mpich \ + && pip install scipy==1.9.3 \ + && pip install cython==0.29.* \ + && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \ + && export CUPY_INSTALL_USE_HIP=1 \ + && export ROCM_HOME=/opt/rocm \ + && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \ + && pip install . \ + && cd ..; \ + fi + +COPY ./ /app/vllm + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install xformers==0.0.23 --no-deps + RUN cd /app \ && cd vllm \ && pip install -U -r requirements-rocm.txt \ diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 9df518d155ec2..157e8c45836b1 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -19,7 +19,6 @@ from vllm.worker.cache_engine import CacheEngine from vllm.worker.model_runner import ModelRunner from vllm.lora.request import LoRARequest -from vllm.utils import is_hip class Worker: @@ -267,8 +266,7 @@ def init_distributed_environment( "cupy.distributed is already initialized but the cupy world " "size does not match parallel_config.world_size " f"({cupy_world_size} vs. {parallel_config.world_size}).") - elif (parallel_config.world_size > 1 and cupy_port is not None - and not is_hip()): + elif (parallel_config.world_size > 1 and cupy_port is not None): # NOTE(woosuk): We don't initialize CuPy process group when world size # is 1. # TODO(woosuk): Support multi-node connection.