
Commit

Update PyTorch, CUDA, vLLM, and Bitsandbytes (#553)
ajtejankar authored Jul 25, 2024
1 parent 07addea commit 5cefe6e
Showing 6 changed files with 37 additions and 17 deletions.
19 changes: 14 additions & 5 deletions Dockerfile
@@ -35,12 +35,12 @@ RUN cargo build --release

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as pytorch-install
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 as pytorch-install

-ARG PYTORCH_VERSION=2.3.0
+ARG PYTORCH_VERSION=2.4.0
ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml
-ARG CUDA_VERSION=12.1
+ARG CUDA_VERSION=12.4
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
@@ -116,7 +116,6 @@ WORKDIR /usr/src
COPY server/Makefile-awq Makefile
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq


# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder
WORKDIR /usr/src
@@ -127,6 +126,13 @@ RUN python setup.py build
# Build vllm CUDA kernels
FROM kernel-builder as vllm-builder
WORKDIR /usr/src
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+RUN DEBIAN_FRONTEND=noninteractive apt purge -y --auto-remove cmake
+RUN wget 'https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.tar.gz'
+RUN tar xzvf 'cmake-3.30.0-linux-x86_64.tar.gz'
+RUN ln -s "$(pwd)/cmake-3.30.0-linux-x86_64/bin/cmake" /usr/local/bin/cmake
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
COPY server/Makefile-vllm Makefile
# Build specific version of vllm
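Note on the cmake lines added above: the vllm-builder stage strips Ubuntu 22.04's packaged cmake and drops in CMake 3.30.0 before compiling the vLLM kernels, presumably because the pinned vLLM commit's CMake-driven build wants a newer cmake than the distro ships. A minimal, hypothetical pre-flight check along these lines (not part of this change) would surface a stale cmake on PATH before the kernel build fails mid-configure:

    import re
    import subprocess

    def check_cmake(min_version=(3, 30)):
        # Ask the cmake binary on PATH for its version string.
        out = subprocess.run(["cmake", "--version"], capture_output=True, text=True, check=True).stdout
        found = tuple(int(x) for x in re.search(r"cmake version (\d+)\.(\d+)\.(\d+)", out).groups())
        # 3.30 matches what the Dockerfile installs; the true floor depends on the pinned vLLM commit.
        if found < min_version:
            raise RuntimeError(f"cmake {found} is older than {min_version}; the vLLM kernel build may fail")
        return found

    print(check_cmake())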
@@ -155,7 +161,7 @@ COPY server/Makefile-eetq Makefile
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq

# LoRAX base image
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base
+FROM nvidia/cuda:12.4.0-base-ubuntu22.04 as base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
@@ -166,6 +172,9 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
HF_HUB_ENABLE_HF_TRANSFER=1 \
PORT=80

+# vLLM needs this in order to work without error
+ENV LD_PRELOAD=/usr/local/cuda/compat/libcuda.so

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
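Taken together, the Dockerfile changes move both the build and runtime stages from CUDA 12.1 to 12.4, bump PyTorch to 2.4.0, and preload the CUDA compat libcuda, which the inline comment notes vLLM needs to run without error. A quick sanity check to run inside the finished image (a sketch, not part of the commit) could look like:

    import importlib
    import torch

    # Versions baked in by the Dockerfile above.
    print("torch:", torch.__version__)          # expect 2.4.x
    print("cuda runtime:", torch.version.cuda)  # expect 12.4
    print("gpu visible:", torch.cuda.is_available())

    # The server's paged attention path now imports vLLM's consolidated custom-ops module.
    ops = importlib.import_module("vllm._custom_ops")
    print("vllm custom ops:", ops.__name__)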
4 changes: 2 additions & 2 deletions server/Makefile-vllm
@@ -1,10 +1,10 @@
vllm-cuda:
# Clone vllm
pip install -U ninja packaging --no-cache-dir
-git clone https://github.com/Narsil/vllm.git vllm
+git clone https://github.com/vllm-project/vllm.git vllm

build-vllm-cuda: vllm-cuda
-cd vllm && git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
+cd vllm && git fetch && git checkout 5448f67
cd vllm && python setup.py build

install-vllm-cuda: build-vllm-cuda
11 changes: 7 additions & 4 deletions server/lorax_server/utils/paged_attention.py
@@ -8,7 +8,8 @@
import intel_extension_for_pytorch as ipex
else:
try:
-from vllm._C import cache_ops, ops
+import torch
+import vllm._custom_ops as ops
except Exception as e:
raise ImportError(
f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
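The server now imports vllm._custom_ops, the single Python-facing module recent vLLM builds expose, instead of pulling cache_ops and ops straight out of the vllm._C extension. If code ever needed to tolerate both old and new vLLM builds, a hedged fallback import in the same spirit (a sketch, not what this change does) would be:

    try:
        # Newer vLLM: one consolidated wrapper around the compiled kernels.
        import vllm._custom_ops as ops
    except ImportError:
        try:
            # Older vLLM: kernels exported directly from the C extension.
            from vllm._C import ops
        except Exception as e:
            raise ImportError(
                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
            )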
@@ -33,7 +34,7 @@ def reshape_and_cache(
if SYSTEM == "xpu":
ipex.llm.modules.PagedAttention.reshape_and_cache(key, value, key_cache, value_cache, slots)
else:
-cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0)
+torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0)


def attention(
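The cache write now goes through torch.ops._C_cache_ops, the torch.library registration recent vLLM uses for its cache kernels, rather than the old cache_ops module; the arguments, including the kv cache dtype ("fp8" or "auto") and a scale of 1.0, are unchanged. Conceptually the kernel scatters each incoming token's key/value into the slot assigned by the block manager. For the value cache, whose layout [num_blocks, num_kv_heads, head_size, block_size] is implied by block_size = value_cache.shape[3] below, the indexing is roughly this pure-PyTorch sketch (the key cache uses a different tiled layout, and fp8 scaling is ignored):

    import torch

    def reshape_and_cache_reference(value, value_cache, slots):
        # value:       [num_tokens, num_kv_heads, head_size]
        # value_cache: [num_blocks, num_kv_heads, head_size, block_size]
        # slots:       [num_tokens], flat slot index assigned to each token
        block_size = value_cache.shape[3]
        block_idx = torch.div(slots, block_size, rounding_mode="floor")  # which block each token lands in
        block_off = slots % block_size                                   # offset of the token inside its block
        value_cache[block_idx, :, :, block_off] = value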
@@ -68,6 +69,8 @@ def attention(
block_size = value_cache.shape[3]
num_seqs, num_heads, head_size = query.shape
max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+num_kv_heads = 1 + kv_head_mapping.max().item()

if SYSTEM == "xpu":
query = query.contiguous()
return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
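The newer vLLM paged attention op takes the number of KV heads as an integer instead of the per-query-head mapping tensor passed before, so the count is recovered from the existing kv_head_mapping, which maps each query head to a KV head index in 0..num_kv_heads-1. A small worked example of that recovery, for grouped-query attention with 32 query heads sharing 8 KV heads:

    import torch

    num_query_heads, num_kv_heads = 32, 8
    # Every group of 4 consecutive query heads reads the same KV head.
    kv_head_mapping = torch.arange(num_kv_heads).repeat_interleave(num_query_heads // num_kv_heads)
    assert kv_head_mapping.shape == (num_query_heads,)
    assert 1 + kv_head_mapping.max().item() == num_kv_heads  # matches the expression added above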
@@ -96,7 +99,7 @@
query,
key_cache,
value_cache,
-kv_head_mapping,
+num_kv_heads,
softmax_scale,
block_tables,
input_lengths,
@@ -129,7 +132,7 @@
query,
key_cache,
value_cache,
-kv_head_mapping,
+num_kv_heads,
softmax_scale,
block_tables,
input_lengths,
16 changes: 12 additions & 4 deletions server/poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion server/pyproject.toml
@@ -16,7 +16,7 @@ grpcio-reflection = "^1.51.1"
grpc-interceptor = "^0.15.0"
typer = "^0.6.1"
accelerate = { version = "^0.24.1", optional = true }
-bitsandbytes = { version = "^0.41.1", optional = true }
+bitsandbytes = { version = "^0.43.1", optional = true }
scipy = { version = "^1.0.0", optional = true }
safetensors = "0.4.2"
loguru = "^0.6.0"
2 changes: 1 addition & 1 deletion server/requirements.txt
@@ -1,5 +1,5 @@
backoff==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
-bitsandbytes==0.41.3.post2 ; python_version >= "3.9" and python_version < "4.0"
+bitsandbytes==0.43.1 ; python_version >= "3.9" and python_version < "4.0"
boto3==1.34.4 ; python_version >= "3.9" and python_version < "4.0"
botocore==1.34.4 ; python_version >= "3.9" and python_version < "4.0"
certifi==2023.11.17 ; python_version >= "3.9" and python_version < "4.0"
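bitsandbytes is bumped from 0.41.x to 0.43.1 in both pyproject.toml and requirements.txt, presumably to stay compatible with the PyTorch 2.4 / CUDA 12.4 stack above. A quick import check inside the image (a sketch, not part of the commit):

    import bitsandbytes as bnb
    import torch

    print("bitsandbytes:", bnb.__version__)   # expect 0.43.1, per requirements.txt above
    print("torch cuda:", torch.version.cuda)  # expect 12.4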
