From 5cefe6e09571499e329aba429a2e1e6790647c92 Mon Sep 17 00:00:00 2001
From: Ajinkya Tejankar
Date: Thu, 25 Jul 2024 09:44:37 -0700
Subject: [PATCH] Update PyTorch, CUDA, vLLM, and Bitsandbytes (#553)

---
 Dockerfile                                   | 19 ++++++++++++++-----
 server/Makefile-vllm                         |  4 ++--
 server/lorax_server/utils/paged_attention.py | 11 +++++++----
 server/poetry.lock                           | 16 ++++++++++++----
 server/pyproject.toml                        |  2 +-
 server/requirements.txt                      |  2 +-
 6 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 1f6698e35..02d6061d9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -35,12 +35,12 @@ RUN cargo build --release
 
 # Python builder
 # Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as pytorch-install
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 as pytorch-install
 
-ARG PYTORCH_VERSION=2.3.0
+ARG PYTORCH_VERSION=2.4.0
 ARG PYTHON_VERSION=3.10
 # Keep in sync with `server/pyproject.toml
-ARG CUDA_VERSION=12.1
+ARG CUDA_VERSION=12.4
 ARG MAMBA_VERSION=24.3.0-0
 ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=pytorch
@@ -116,7 +116,6 @@ WORKDIR /usr/src
 COPY server/Makefile-awq Makefile
 RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq
 
-
 # Build Transformers CUDA kernels
 FROM kernel-builder as custom-kernels-builder
 WORKDIR /usr/src
@@ -127,6 +126,13 @@ RUN python setup.py build
 # Build vllm CUDA kernels
 FROM kernel-builder as vllm-builder
 WORKDIR /usr/src
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+RUN DEBIAN_FRONTEND=noninteractive apt purge -y --auto-remove cmake
+RUN wget 'https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.tar.gz'
+RUN tar xzvf 'cmake-3.30.0-linux-x86_64.tar.gz'
+RUN ln -s "$(pwd)/cmake-3.30.0-linux-x86_64/bin/cmake" /usr/local/bin/cmake
 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 COPY server/Makefile-vllm Makefile
 # Build specific version of vllm
@@ -155,7 +161,7 @@ COPY server/Makefile-eetq Makefile
 RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq
 
 # LoRAX base image
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base
+FROM nvidia/cuda:12.4.0-base-ubuntu22.04 as base
 
 # Conda env
 ENV PATH=/opt/conda/bin:$PATH \
@@ -166,6 +172,9 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
     HF_HUB_ENABLE_HF_TRANSFER=1 \
     PORT=80
 
+# vLLM needs this in order to work without error
+ENV LD_PRELOAD=/usr/local/cuda/compat/libcuda.so
+
 WORKDIR /usr/src
 
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
diff --git a/server/Makefile-vllm b/server/Makefile-vllm
index 62fa413f4..3e322379b 100644
--- a/server/Makefile-vllm
+++ b/server/Makefile-vllm
@@ -1,10 +1,10 @@
 vllm-cuda:
 	# Clone vllm
 	pip install -U ninja packaging --no-cache-dir
-	git clone https://github.com/Narsil/vllm.git vllm
+	git clone https://github.com/vllm-project/vllm.git vllm
 
 build-vllm-cuda: vllm-cuda
-	cd vllm && git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
+	cd vllm && git fetch && git checkout 5448f67
 	cd vllm && python setup.py build
 
 install-vllm-cuda: build-vllm-cuda
diff --git a/server/lorax_server/utils/paged_attention.py b/server/lorax_server/utils/paged_attention.py
index 7f2ca6cea..e6d3620e6 100644
--- a/server/lorax_server/utils/paged_attention.py
+++ b/server/lorax_server/utils/paged_attention.py
@@ -8,7 +8,8 @@
     import intel_extension_for_pytorch as ipex
 else:
     try:
-        from vllm._C import cache_ops, ops
+        import torch
+        import vllm._custom_ops as ops
     except Exception as e:
         raise ImportError(
             f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
@@ -33,7 +34,7 @@ def reshape_and_cache(
     if SYSTEM == "xpu":
         ipex.llm.modules.PagedAttention.reshape_and_cache(key, value, key_cache, value_cache, slots)
     else:
-        cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0)
+        torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0)
 
 
 def attention(
@@ -68,6 +69,8 @@ def attention(
     block_size = value_cache.shape[3]
     num_seqs, num_heads, head_size = query.shape
     max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+    num_kv_heads = 1 + kv_head_mapping.max().item()
+
     if SYSTEM == "xpu":
         query = query.contiguous()
         return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
@@ -96,7 +99,7 @@ def attention(
             query,
             key_cache,
             value_cache,
-            kv_head_mapping,
+            num_kv_heads,
             softmax_scale,
             block_tables,
             input_lengths,
@@ -129,7 +132,7 @@ def attention(
             query,
             key_cache,
             value_cache,
-            kv_head_mapping,
+            num_kv_heads,
             softmax_scale,
             block_tables,
             input_lengths,
diff --git a/server/poetry.lock b/server/poetry.lock
index 68649649a..859699923 100644
--- a/server/poetry.lock
+++ b/server/poetry.lock
@@ -192,15 +192,23 @@ files = [
 
 [[package]]
 name = "bitsandbytes"
-version = "0.41.3.post2"
+version = "0.43.2"
 description = "k-bit optimizers and matrix multiplication routines."
 optional = true
 python-versions = "*"
 files = [
-    {file = "bitsandbytes-0.41.3.post2-py3-none-any.whl", hash = "sha256:ceb301a3d4e6bf52bdad8d09f3064ac194bdfdeae535994c0315bd2ef7639cca"},
-    {file = "bitsandbytes-0.41.3.post2.tar.gz", hash = "sha256:7d25a51fb3b74b58e569473f8b70a5239124c0593dc053479c41cf2cd6730502"},
+    {file = "bitsandbytes-0.43.2-py3-none-manylinux_2_24_x86_64.whl", hash = "sha256:80fbc0f41dded735f51328042d3d45ea640101d87c8abba8ea5bfafa61e2a786"},
+    {file = "bitsandbytes-0.43.2-py3-none-win_amd64.whl", hash = "sha256:89e1c506fd2323574615d668ca7eacad4f83db4847044bf2db87580a71852ff1"},
 ]
 
+[package.dependencies]
+numpy = "*"
+torch = "*"
+
+[package.extras]
+benchmark = ["matplotlib", "pandas"]
+test = ["scipy"]
+
 [[package]]
 name = "boto3"
 version = "1.34.4"
@@ -3577,4 +3585,4 @@ torch = ["torch"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "e52881fa075d917070103ac7f96cae7c648502cb82b4b48d54ec81d47e1b0ef9"
+content-hash = "06b1954d759d902f96836ae358647275ee4e21365369f4d3d46c82c1bcf00c2e"
diff --git a/server/pyproject.toml b/server/pyproject.toml
index 1cfee5a7e..ba7073b5b 100644
--- a/server/pyproject.toml
+++ b/server/pyproject.toml
@@ -16,7 +16,7 @@ grpcio-reflection = "^1.51.1"
 grpc-interceptor = "^0.15.0"
 typer = "^0.6.1"
 accelerate = { version = "^0.24.1", optional = true }
-bitsandbytes = { version = "^0.41.1", optional = true }
+bitsandbytes = { version = "^0.43.1", optional = true }
 scipy = { version = "^1.0.0", optional = true }
 safetensors = "0.4.2"
 loguru = "^0.6.0"
diff --git a/server/requirements.txt b/server/requirements.txt
index bb1bd58d8..096165272 100644
--- a/server/requirements.txt
+++ b/server/requirements.txt
@@ -1,5 +1,5 @@
 backoff==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
-bitsandbytes==0.41.3.post2 ; python_version >= "3.9" and python_version < "4.0"
+bitsandbytes==0.43.1 ; python_version >= "3.9" and python_version < "4.0"
 boto3==1.34.4 ; python_version >= "3.9" and python_version < "4.0"
 botocore==1.34.4 ; python_version >= "3.9" and python_version < "4.0"
 certifi==2023.11.17 ; python_version >= "3.9" and python_version < "4.0"
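
Note on the paged_attention.py hunks above: at the pinned vLLM commit the CUDA kernels are reached through vllm._custom_ops and registered torch ops rather than the old vllm._C module, and the paged-attention kernels receive a KV-head count instead of the per-head mapping tensor. The sketch below restates that call pattern outside the diff; it is a minimal illustration that assumes a CUDA build of vLLM is installed, and the example mapping tensor is made up for demonstration only.

    # Minimal sketch of the new import/argument style used in paged_attention.py above.
    # Assumes a CUDA-enabled vLLM build; the mapping tensor here is illustrative only.
    import torch

    try:
        import vllm._custom_ops as ops  # replaces: from vllm._C import cache_ops, ops
    except Exception as e:
        raise ImportError(f"Could not import vllm paged attention. Complete error: {e}")

    # The kernels now take the number of KV heads rather than a per-query-head mapping
    # tensor; the count is recovered from the old mapping exactly as the diff does.
    kv_head_mapping = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3])
    num_kv_heads = 1 + kv_head_mapping.max().item()  # -> 4

    # Cache writes go through the registered torch op instead of vllm._C.cache_ops, e.g.:
    # torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
    #                                          slots, "auto", 1.0)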