From 5cefe6e09571499e329aba429a2e1e6790647c92 Mon Sep 17 00:00:00 2001
From: Ajinkya Tejankar
Date: Thu, 25 Jul 2024 09:44:37 -0700
Subject: [PATCH] Update PyTorch, CUDA, vLLM, and Bitsandbytes (#553)

---
 Dockerfile                                   | 19 ++++++++++++++-----
 server/Makefile-vllm                         |  4 ++--
 server/lorax_server/utils/paged_attention.py | 11 +++++++----
 server/poetry.lock                           | 16 ++++++++++++----
 server/pyproject.toml                        |  2 +-
 server/requirements.txt                      |  2 +-
 6 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 1f6698e35..02d6061d9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -35,12 +35,12 @@ RUN cargo build --release
 
 # Python builder
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as pytorch-install
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 as pytorch-install
 
-ARG PYTORCH_VERSION=2.3.0
+ARG PYTORCH_VERSION=2.4.0
 ARG PYTHON_VERSION=3.10
 # Keep in sync with `server/pyproject.toml
-ARG CUDA_VERSION=12.1
+ARG CUDA_VERSION=12.4
 ARG MAMBA_VERSION=24.3.0-0
 ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=pytorch
@@ -116,7 +116,6 @@ WORKDIR /usr/src
 COPY server/Makefile-awq Makefile
 RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
 
-
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
 WORKDIR /usr/src
@@ -127,6 +126,13 @@ RUN python setup.py build
 # Build vllm CUDA kernels
 FROM kernel-builder as vllm-builder
 WORKDIR /usr/src
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+RUN DEBIAN_FRONTEND=noninteractive apt purge -y --auto-remove cmake
+RUN wget 'https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.tar.gz'
+RUN tar xzvf 'cmake-3.30.0-linux-x86_64.tar.gz'
+RUN ln -s "$(pwd)/cmake-3.30.0-linux-x86_64/bin/cmake" /usr/local/bin/cmake
 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 COPY server/Makefile-vllm Makefile
 # Build specific version of vllm
@@ -155,7 +161,7 @@ COPY server/Makefile-eetq Makefile
 RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
 
 # LoRAX base image
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base
+FROM nvidia/cuda:12.4.0-base-ubuntu22.04 as base
 
 # Conda env
 ENV PATH=/opt/conda/bin:$PATH \
@@ -166,6 +172,9 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
     HF_HUB_ENABLE_HF_TRANSFER=1 \
     PORT=80
 
+# vLLM needs this in order to work without error
+ENV LD_PRELOAD=/usr/local/cuda/compat/libcuda.so
+
 WORKDIR /usr/src
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index 62fa413f4..3e322379b 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,10 +1,10 @@
 vllm-cuda:
 	# Clone vllm
 	pip install -U ninja packaging --no-cache-dir
-	git clone https://github.com/Narsil/vllm.git vllm
+	git clone https://github.com/vllm-project/vllm.git vllm
 
 build-vllm-cuda: vllm-cuda
-	cd vllm && git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
+	cd vllm && git fetch && git checkout 5448f67
 	cd vllm && python setup.py build
 
 install-vllm-cuda: build-vllm-cuda
diff --git a/server/lorax_server/utils/paged_attention.py b/server/lorax_server/utils/paged_attention.py
index 7f2ca6cea..e6d3620e6 100644
--- a/server/lorax_server/utils/paged_attention.py
+++ b/server/lorax_server/utils/paged_attention.py
@@ -8,7 +8,8 @@
     import intel_extension_for_pytorch as ipex
 else:
     try:
-        from vllm._C import cache_ops, ops
+        import torch
+        import vllm._custom_ops as ops
     except Exception as e:
         raise ImportError(
             f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
@@ -33,7 +34,7 @@ def reshape_and_cache(
     if SYSTEM == "xpu":
         ipex.llm.modules.PagedAttention.reshape_and_cache(key, value, key_cache, value_cache, slots)
     else:
-        cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0)
+        torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0)
 
 
 def attention(
@@ -68,6 +69,8 @@ def attention(
     block_size = value_cache.shape[3]
     num_seqs, num_heads, head_size = query.shape
     max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+    num_kv_heads = 1 + kv_head_mapping.max().item()
+
     if SYSTEM == "xpu":
         query = query.contiguous()
         return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
@@ -96,7 +99,7 @@ def attention(
             query,
             key_cache,
             value_cache,
-            kv_head_mapping,
+            num_kv_heads,
             softmax_scale,
             block_tables,
             input_lengths,
@@ -129,7 +132,7 @@ def attention(
             query,
             key_cache,
             value_cache,
-            kv_head_mapping,
+            num_kv_heads,
             softmax_scale,
             block_tables,
             input_lengths,
diff --git a/server/poetry.lock b/server/poetry.lock
index 68649649a..859699923 100644
--- a/server/poetry.lock
+++ b/server/poetry.lock
@@ -192,15 +192,23 @@ files = [
 
 [[package]]
 name = "bitsandbytes"
-version = "0.41.3.post2"
+version = "0.43.2"
 description = "k-bit optimizers and matrix multiplication routines."
 optional = true
 python-versions = "*"
 files = [
-    {file = "bitsandbytes-0.41.3.post2-py3-none-any.whl", hash = "sha256:ceb301a3d4e6bf52bdad8d09f3064ac194bdfdeae535994c0315bd2ef7639cca"},
-    {file = "bitsandbytes-0.41.3.post2.tar.gz", hash = "sha256:7d25a51fb3b74b58e569473f8b70a5239124c0593dc053479c41cf2cd6730502"},
+    {file = "bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:80fbc0f41dded735f51328042d3d45ea640101d87c8abba8ea5bfafa61e2a786"},
+    {file = "bitsandbytes-0.43.2-py3-none-win_amd64.whl", hash = "sha256:89e1c506fd2323574615d668ca7eacad4f83db4847044bf2db87580a71852ff1"},
 ]
 
+[package.dependencies]
+numpy = "*"
+torch = "*"
+
+[package.extras]
+benchmark = ["matplotlib", "pandas"]
+test = ["scipy"]
+
 [[package]]
 name = "boto3"
 version = "1.34.4"
@@ -3577,4 +3585,4 @@ torch = ["torch"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "e52881fa075d917070103ac7f96cae7c648502cb82b4b48d54ec81d47e1b0ef9"
+content-hash = "06b1954d759d902f96836ae358647275ee4e21365369f4d3d46c82c1bcf00c2e"
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 1cfee5a7e..ba7073b5b 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -16,7 +16,7 @@ grpcio-reflection = "^1.51.1"
 grpc-interceptor = "^0.15.0"
 typer = "^0.6.1"
 accelerate = { version = "^0.24.1", optional = true }
-bitsandbytes = { version = "^0.41.1", optional = true }
+bitsandbytes = { version = "^0.43.1", optional = true }
 scipy = { version = "^1.0.0", optional = true }
 safetensors = "0.4.2"
 loguru = "^0.6.0"
diff --git a/server/requirements.txt b/server/requirements.txt
index bb1bd58d8..096165272 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -1,5 +1,5 @@
 backoff==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
-bitsandbytes==0.41.3.post2 ; python_version >= "3.9" and python_version < "4.0"
+bitsandbytes==0.43.1 ; python_version >= "3.9" and python_version < "4.0"
 boto3==1.34.4 ; python_version >= "3.9" and python_version < "4.0"
 botocore==1.34.4 ; python_version >= "3.9" and python_version < "4.0"
 certifi==2023.11.17 ; python_version >= "3.9" and python_version < "4.0"
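
Note on the paged_attention.py hunks above: at the pinned vLLM commit the CUDA kernels are reached through vllm._custom_ops and registered torch ops rather than the old vllm._C module, and the paged-attention kernels receive a KV-head count instead of the per-head mapping tensor. The sketch below restates that call pattern outside the diff; it is a minimal illustration that assumes a CUDA build of vLLM is installed, and the example mapping tensor is made up for demonstration only.

    # Minimal sketch of the new import/argument style used in paged_attention.py above.
    # Assumes a CUDA-enabled vLLM build; the mapping tensor here is illustrative only.
    import torch

    try:
        import vllm._custom_ops as ops  # replaces: from vllm._C import cache_ops, ops
    except Exception as e:
        raise ImportError(f"Could not import vllm paged attention. Complete error: {e}")

    # The kernels now take the number of KV heads rather than a per-query-head mapping
    # tensor; the count is recovered from the old mapping exactly as the diff does.
    kv_head_mapping = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])
    num_kv_heads = 1 + kv_head_mapping.max().item()  # -> 4

    # Cache writes go through the registered torch op instead of vllm._C.cache_ops, e.g.:
    # torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
    #                                          slots, "auto", 1.0)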