
Commit

Update PyTorch, CUDA, vLLM, and Bitsandbytes (#553)
ajtejankar authored Jul 25, 2024
1 parent 07addea commit 5cefe6e
Showing 6 changed files with 37 additions and 17 deletions.
19 changes: 14 additions & 5 deletions Dockerfile
@@ -35,12 +35,12 @@ RUN cargo build --release

# Python builder
# Adapted from: https://github.com/pytorch/pytorch/blob/master/Dockerfile
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 as pytorch-install
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04 as pytorch-install

-ARG PYTORCH_VERSION=2.3.0
+ARG PYTORCH_VERSION=2.4.0
ARG PYTHON_VERSION=3.10
# Keep in sync with `server/pyproject.toml
-ARG CUDA_VERSION=12.1
+ARG CUDA_VERSION=12.4
ARG MAMBA_VERSION=24.3.0-0
ARG CUDA_CHANNEL=nvidia
ARG INSTALL_CHANNEL=pytorch
@@ -116,7 +116,6 @@ WORKDIR /usr/src
COPY server/Makefile-awq Makefile
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-awq


# Build Transformers CUDA kernels
FROM kernel-builder as custom-kernels-builder
WORKDIR /usr/src
@@ -127,6 +126,13 @@ RUN python setup.py build
# Build vllm CUDA kernels
FROM kernel-builder as vllm-builder
WORKDIR /usr/src
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    wget \
+    && rm -rf /var/lib/apt/lists/*
+RUN DEBIAN_FRONTEND=noninteractive apt purge -y --auto-remove cmake
+RUN wget 'https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.tar.gz'
+RUN tar xzvf 'cmake-3.30.0-linux-x86_64.tar.gz'
+RUN ln -s "$(pwd)/cmake-3.30.0-linux-x86_64/bin/cmake" /usr/local/bin/cmake
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
COPY server/Makefile-vllm Makefile
# Build specific version of vllm
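Note on the cmake lines added above: the vllm-builder stage strips Ubuntu 22.04's packaged cmake and drops in CMake 3.30.0 before compiling the vLLM kernels, presumably because the pinned vLLM commit's CMake-driven build wants a newer cmake than the distro ships. A minimal, hypothetical pre-flight check along these lines (not part of this change) would surface a stale cmake on PATH before the kernel build fails mid-configure:

    import re
    import subprocess

    def check_cmake(min_version=(3, 30)):
        # Ask the cmake binary on PATH for its version string.
        out = subprocess.run(["cmake", "--version"], capture_output=True, text=True, check=True).stdout
        found = tuple(int(x) for x in re.search(r"cmake version (\d+)\.(\d+)\.(\d+)", out).groups())
        # 3.30 matches what the Dockerfile installs; the true floor depends on the pinned vLLM commit.
        if found < min_version:
            raise RuntimeError(f"cmake {found} is older than {min_version}; the vLLM kernel build may fail")
        return found

    print(check_cmake())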
@@ -155,7 +161,7 @@ COPY server/Makefile-eetq Makefile
RUN TORCH_CUDA_ARCH_LIST="8.0;8.6+PTX" make build-eetq

# LoRAX base image
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 as base
+FROM nvidia/cuda:12.4.0-base-ubuntu22.04 as base

# Conda env
ENV PATH=/opt/conda/bin:$PATH \
@@ -166,6 +172,9 @@ ENV HUGGINGFACE_HUB_CACHE=/data \
HF_HUB_ENABLE_HF_TRANSFER=1 \
PORT=80

+# vLLM needs this in order to work without error
+ENV LD_PRELOAD=/usr/local/cuda/compat/libcuda.so

WORKDIR /usr/src

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
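Taken together, the Dockerfile changes move both the build and runtime stages from CUDA 12.1 to 12.4, bump PyTorch to 2.4.0, and preload the CUDA compat libcuda, which the inline comment notes vLLM needs to run without error. A quick sanity check to run inside the finished image (a sketch, not part of the commit) could look like:

    import importlib
    import torch

    # Versions baked in by the Dockerfile above.
    print("torch:", torch.__version__)          # expect 2.4.x
    print("cuda runtime:", torch.version.cuda)  # expect 12.4
    print("gpu visible:", torch.cuda.is_available())

    # The server's paged attention path now imports vLLM's consolidated custom-ops module.
    ops = importlib.import_module("vllm._custom_ops")
    print("vllm custom ops:", ops.__name__)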
4 changes: 2 additions & 2 deletions server/Makefile-vllm
@@ -1,10 +1,10 @@
vllm-cuda:
# Clone vllm
pip install -U ninja packaging --no-cache-dir
-git clone https://github.com/Narsil/vllm.git vllm
+git clone https://github.com/vllm-project/vllm.git vllm

build-vllm-cuda: vllm-cuda
-cd vllm && git fetch && git checkout b5dfc61db88a81069e45b44f7cc99bd9e62a60fa
+cd vllm && git fetch && git checkout 5448f67
cd vllm && python setup.py build

install-vllm-cuda: build-vllm-cuda
11 changes: 7 additions & 4 deletions server/lorax_server/utils/paged_attention.py
@@ -8,7 +8,8 @@
import intel_extension_for_pytorch as ipex
else:
try:
-from vllm._C import cache_ops, ops
+import torch
+import vllm._custom_ops as ops
except Exception as e:
raise ImportError(
f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
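The server now imports vllm._custom_ops, the single Python-facing module recent vLLM builds expose, instead of pulling cache_ops and ops straight out of the vllm._C extension. If code ever needed to tolerate both old and new vLLM builds, a hedged fallback import in the same spirit (a sketch, not what this change does) would be:

    try:
        # Newer vLLM: one consolidated wrapper around the compiled kernels.
        import vllm._custom_ops as ops
    except ImportError:
        try:
            # Older vLLM: kernels exported directly from the C extension.
            from vllm._C import ops
        except Exception as e:
            raise ImportError(
                f"Could not import vllm paged attention. Make sure your installation is correct. Complete error: {e}"
            )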
@@ -33,7 +34,7 @@ def reshape_and_cache(
if SYSTEM == "xpu":
ipex.llm.modules.PagedAttention.reshape_and_cache(key, value, key_cache, value_cache, slots)
else:
-cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0)
+torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slots, "fp8" if fp8_supported else "auto", 1.0)


def attention(
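The cache write now goes through torch.ops._C_cache_ops, the torch.library registration recent vLLM uses for its cache kernels, rather than the old cache_ops module; the arguments, including the kv cache dtype ("fp8" or "auto") and a scale of 1.0, are unchanged. Conceptually the kernel scatters each incoming token's key/value into the slot assigned by the block manager. For the value cache, whose layout [num_blocks, num_kv_heads, head_size, block_size] is implied by block_size = value_cache.shape[3] below, the indexing is roughly this pure-PyTorch sketch (the key cache uses a different tiled layout, and fp8 scaling is ignored):

    import torch

    def reshape_and_cache_reference(value, value_cache, slots):
        # value:       [num_tokens, num_kv_heads, head_size]
        # value_cache: [num_blocks, num_kv_heads, head_size, block_size]
        # slots:       [num_tokens], flat slot index assigned to each token
        block_size = value_cache.shape[3]
        block_idx = torch.div(slots, block_size, rounding_mode="floor")  # which block each token lands in
        block_off = slots % block_size                                   # offset of the token inside its block
        value_cache[block_idx, :, :, block_off] = value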
@@ -68,6 +69,8 @@ def attention(
block_size = value_cache.shape[3]
num_seqs, num_heads, head_size = query.shape
max_num_partitions = (max_s + _PARTITION_SIZE - 1) // _PARTITION_SIZE
+num_kv_heads = 1 + kv_head_mapping.max().item()

if SYSTEM == "xpu":
query = query.contiguous()
return ipex.llm.modules.PagedAttention.single_query_cached_kv_attention(
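The newer vLLM paged attention op takes the number of KV heads as an integer instead of the per-query-head mapping tensor passed before, so the count is recovered from the existing kv_head_mapping, which maps each query head to a KV head index in 0..num_kv_heads-1. A small worked example of that recovery, for grouped-query attention with 32 query heads sharing 8 KV heads:

    import torch

    num_query_heads, num_kv_heads = 32, 8
    # Every group of 4 consecutive query heads reads the same KV head.
    kv_head_mapping = torch.arange(num_kv_heads).repeat_interleave(num_query_heads // num_kv_heads)
    assert kv_head_mapping.shape == (num_query_heads,)
    assert 1 + kv_head_mapping.max().item() == num_kv_heads  # matches the expression added above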
@@ -96,7 +99,7 @@
query,
key_cache,
value_cache,
-kv_head_mapping,
+num_kv_heads,
softmax_scale,
block_tables,
input_lengths,
@@ -129,7 +132,7 @@
query,
key_cache,
value_cache,
-kv_head_mapping,
+num_kv_heads,
softmax_scale,
block_tables,
input_lengths,
16 changes: 12 additions & 4 deletions server/poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion server/pyproject.toml
@@ -16,7 +16,7 @@ grpcio-reflection = "^1.51.1"
grpc-interceptor = "^0.15.0"
typer = "^0.6.1"
accelerate = { version = "^0.24.1", optional = true }
-bitsandbytes = { version = "^0.41.1", optional = true }
+bitsandbytes = { version = "^0.43.1", optional = true }
scipy = { version = "^1.0.0", optional = true }
safetensors = "0.4.2"
loguru = "^0.6.0"
2 changes: 1 addition & 1 deletion server/requirements.txt
@@ -1,5 +1,5 @@
backoff==2.2.1 ; python_version >= "3.9" and python_version < "4.0"
-bitsandbytes==0.41.3.post2 ; python_version >= "3.9" and python_version < "4.0"
+bitsandbytes==0.43.1 ; python_version >= "3.9" and python_version < "4.0"
boto3==1.34.4 ; python_version >= "3.9" and python_version < "4.0"
botocore==1.34.4 ; python_version >= "3.9" and python_version < "4.0"
certifi==2023.11.17 ; python_version >= "3.9" and python_version < "4.0"
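bitsandbytes is bumped from 0.41.x to 0.43.1 in both pyproject.toml and requirements.txt, presumably to stay compatible with the PyTorch 2.4 / CUDA 12.4 stack above. A quick import check inside the image (a sketch, not part of the commit):

    import bitsandbytes as bnb
    import torch

    print("bitsandbytes:", bnb.__version__)   # expect 0.43.1, per requirements.txt above
    print("torch cuda:", torch.version.cuda)  # expect 12.4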
