Update vllm build and compile scripts #8
jasonacox committed Jun 8, 2024
1 parent 06dea33 commit 5dbb701
Showing 12 changed files with 152 additions and 156 deletions.
103 changes: 43 additions & 60 deletions vllm/README.md
@@ -63,81 +63,64 @@ cd vllm
# git checkout 220a476
```

2. Create Dockerfile ([link](./Dockerfile.source))

```dockerfile
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
RUN apt-get update -y \
&& apt-get install -y python3-pip
WORKDIR /app
COPY . .
RUN python3 -m pip install -e .
EXPOSE 8000
COPY entrypoint.sh /usr/local/bin/
CMD [ "entrypoint.sh" ]
```

3. Create entrypoint.sh ([link](./entrypoint.sh))

```bash
# Start the vLLM OpenAI API compatible server
python3 -m vllm.entrypoints.openai.api_server \
--tensor-parallel-size ${NUM_GPU} \
--worker-use-ray \
--host 0.0.0.0 \
--port "${PORT}" \
--model "${MODEL}" \
--served-model-name "${MODEL}" ${EXTRA_ARGS}
```

4. Edit setup.py (see [patch](./setup.py.patch))
2. Edit Dockerfile and CMakeLists.txt (a quick way to check your GPU's compute capability is sketched after the patch below):

```patch
--- _setup.py 2024-01-27 18:44:45.509406538 +0000
+++ setup.py 2024-01-28 00:02:23.581639719 +0000
@@ -18,7 +18,7 @@
MAIN_CUDA_VERSION = "12.1"
--- _Dockerfile 2024-06-07 22:09:30.069782339 -0700
+++ Dockerfile 2024-06-07 22:10:02.357875428 -0700
@@ -35,7 +35,7 @@
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ARG torch_cuda_arch_list='6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################

# Supported NVIDIA GPU architectures.
-NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+NVIDIA_SUPPORTED_ARCHS = {"6.0", "6.1", "6.2", "7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
--- _CMakeList.txt 2024-06-07 22:08:27.657601121 -0700
+++ CMakeLists.txt 2024-06-07 22:09:01.541699767 -0700
@@ -16,7 +16,7 @@
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")

@@ -184,9 +184,9 @@
device_count = torch.cuda.device_count()
for i in range(device_count):
major, minor = torch.cuda.get_device_capability(i)
- if major < 7:
+ if major < 6:
raise RuntimeError(
- "GPUs with compute capability below 7.0 are not supported.")
+ "GPUs with compute capability below 6.0 are not supported.")
compute_capabilities.add(f"{major}.{minor}")
# Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0")

ext_modules = []
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
```
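
Before applying this change on a new machine, it can help to confirm the compute capability of the installed GPUs, since the patch above only extends support down to Pascal (6.0/6.1). A minimal check, assuming a driver recent enough that `nvidia-smi` exposes the `compute_cap` query field (with a PyTorch fallback):

```bash
# Report each GPU's name and CUDA compute capability (e.g. "6.1" for a GTX 1080 Ti).
nvidia-smi --query-gpu=name,compute_cap --format=csv

# Fallback if this nvidia-smi is too old to know about compute_cap.
python3 -c "import torch; print([torch.cuda.get_device_capability(i) for i in range(torch.cuda.device_count())])"
```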

5. Create build.sh ([link](./build.sh))
3. Create build.sh ([link](./build.sh))

```bash
# Create Container
nvidia-docker build -t vllm .
nvidia-docker build . -f Dockerfile --target vllm-openai --tag vllm
```

6. Create run.sh ([link](./run.sh))
4. Create run.sh ([link](./run.sh))

```bash
# Run Container
nvidia-docker run -d -p 8000:8000 --gpus=all --shm-size=10.24gb \
-e MODEL=mistralai/Mistral-7B-Instruct-v0.1 \
-e PORT=8000 \
-e HF_HOME=/app/models \
-e NUM_GPU=4 \
-e EXTRA_ARGS="--dtype float --max-model-len 20000" \
-v $PWD/models:/app/models \
--name vllm \
vllm
nvidia-docker run -d --gpus all --shm-size=10.24gb -p 8000:8000 \
-v $PWD/models:/root/.cache/huggingface \
--env "HF_TOKEN={Your_Hugingface_Token}" \
--restart unless-stopped \
--name vllm \
vllm \
--host 0.0.0.0 \
--model=mistralai/Mistral-7B-Instruct-v0.1 \
--served-model-name vllm \
--dtype=float \
--max-model-len 20000

# Additional arguments to pass to the API server on startup:
# --gpu-memory-utilization 0.95
# --dtype auto|half|float
# --quantization awq
# --disable-log-requests
# --tensor-parallel-size NUM_GPU
# --enforce-eager

# Print Running Logs - ^C to Stop Viewing Logs
docker logs vllm -f
```
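
Once the container is up, the OpenAI-compatible endpoint can be exercised directly. A minimal smoke test, assuming the server is reachable on localhost:8000 and serving the model under the name `vllm` as configured above:

```bash
# List the models the server advertises.
curl http://localhost:8000/v1/models

# Send a short chat completion to the served model.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "vllm", "messages": [{"role": "user", "content": "Say hello."}], "max_tokens": 32}'
```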
2 changes: 1 addition & 1 deletion vllm/build.sh
@@ -6,4 +6,4 @@

echo "Build vllm docker image..."

docker build -t vllm .
DOCKER_BUILDKIT=1 docker build . -f Dockerfile --target vllm-openai --tag vllm
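
The vLLM Dockerfile exposes `torch_cuda_arch_list` as a build argument (see the patch above), so the build can be narrowed to a single architecture to shorten compile time. A sketch, assuming a Pascal-only (6.1) host:

```bash
# Optional: compile kernels for a single CUDA architecture only.
DOCKER_BUILDKIT=1 docker build . -f Dockerfile --target vllm-openai --tag vllm \
  --build-arg torch_cuda_arch_list='6.1'
```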
10 changes: 3 additions & 7 deletions vllm/compile.sh
@@ -12,25 +12,21 @@ cd vllm

# Copy helpful files
echo "Copying helpful files..."
mv Dockerfile Dockerfile.orig # save original
cp ../Dockerfile.source Dockerfile
cp ../entrypoint.sh entrypoint.sh
cp ../run-pascal.sh run.sh
cp ../build.sh build.sh
cp ../setup.py.patch setup.py.patch
cp ../vllm.patch vllm.patch

# Patch the source code
echo "Patching source code..."
cp setup.py setup.py.orig # save original
patch setup.py setup.py.patch
patch < vllm.patch

# Build the docker image
echo "Building docker image..."
./build.sh

# Make models directory
echo "Creating models directory..."
mkdir models
mkdir -p models
echo "Models will be stored in ${PWD}/models."

# Done
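
To confirm the patch still applies cleanly against a newer checkout before building, a dry run with GNU patch is a low-risk check (assuming vllm.patch has been copied into the source tree as above):

```bash
# Check that the patch applies without actually modifying any files.
patch --dry-run < vllm.patch
```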
File renamed without changes.
File renamed without changes.
9 changes: 9 additions & 0 deletions vllm/optional_build/build.sh
@@ -0,0 +1,9 @@
#!/bin/bash
# Build vllm docker image
#
# Date: 27-Jan-2024
# https://github.com/jasonacox/TinyLLM

echo "Build vllm docker image..."

DOCKER_BUILDKIT=1 docker build . -f Dockerfile --target vllm-openai --tag vllm
File renamed without changes.
30 changes: 24 additions & 6 deletions vllm/run-pascal.sh → vllm/optional_build/run.sh
@@ -1,14 +1,34 @@
#!/bin/bash
# Run vllm docker image
#
# Usage: run.sh <model> <container_name>
#
# Author: Jason A. Cox,
# Date: 27-Jan-2024
# https://github.com/jasonacox/TinyLLM

LLM=mistralai/Mistral-7B-Instruct-v0.1
CONTAINER=vllm-mistral-1
#LLM=mistralai/Mistral-7B-Instruct-v0.2
#CONTAINER=vllm-mistral-2
# Set Defaults
LLM_DEFAULT=mistralai/Mistral-7B-Instruct-v0.1
CONTAINER_DEFAULT=vllm-mistral-1x

# Check if user asked for help
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
echo "Usage: $0 <model> <container_name>"
echo "Example: $0 mistralai/Mistral-7B-Instruct-v0.1 vllm-mistral-1"
exit 0
fi

# If user provided arguments, use them
if [[ ! -z "$1" && ! -z "$2" ]]; then
LLM=$1
CONTAINER=$2
fi

# Set variables to default if not set
if [[ -z "${LLM}" ]]; then
LLM=$LLM_DEFAULT
CONTAINER=$CONTAINER_DEFAULT
fi

echo "Stopping and removing any previous $CONTAINER instance..."
docker stop $CONTAINER
@@ -23,8 +43,6 @@ docker run -d \
-e MODEL=$LLM \
-e PORT=8000 \
-e GPU_MEMORY_UTILIZATION=0.95 \
-e DTYPE=float \
-e MAX_MODEL_LEN=20000 \
-e NUM_GPU=1 \
-e SERVED_MODEL_NAME=tinyllm \
-e HF_HOME=/app/models \
44 changes: 26 additions & 18 deletions vllm/run-awq.sh
@@ -18,24 +18,32 @@ docker rm $CONTAINER

echo "Starting new $CONTAINER instance..."

docker run -d \
-p 8000:8000 \
--shm-size=10.24gb \
--gpus all \
-e MODEL=$LLM \
-e PORT=8000 \
-e GPU_MEMORY_UTILIZATION=0.95 \
-e QUANTIZATION=awq \
-e DTYPE=$QT \
-e NUM_GPU=1 \
-e SERVED_MODEL_NAME=tinyllm \
-e HF_HOME=/app/models \
-v $PWD/models:/app/models \
--restart unless-stopped \
--name $CONTAINER \
vllm

# Additional options: -e EXTRA_ARGS="" -e MAX_MODEL_LEN=xxxxx
docker run -d --gpus all \
-v $PWD/models:/root/.cache/huggingface \
-p 8000:8000 \
--env "HF_TOKEN={Your_Hugingface_Token}" \
--restart unless-stopped \
--name $CONTAINER \
vllm/vllm-openai:latest \
--host 0.0.0.0 \
--model=$LLM \
--gpu-memory-utilization 0.95 \
--enforce-eager \
--served-model-name $LLM \
--disable-log-requests \
--dtype=auto \
--quantization awq

# Additional arguments to pass to the API server on startup:
# --gpu-memory-utilization
# --max-model-len
# --dtype
# --quantization
# --enforce-eager
# --disable-log-requests


-q ${QUANTIZATION} --dtype ${DTYPE}"
echo "Printing logs (^C to quit)..."
63 changes: 23 additions & 40 deletions vllm/run.sh
@@ -1,57 +1,40 @@
#!/bin/bash
# Run vllm docker image
#
# Usage: run.sh <model> <container_name>
#
# Author: Jason A. Cox,
# Date: 27-Jan-2024
# https://github.com/jasonacox/TinyLLM

# Set Defaults
LLM_DEFAULT=mistralai/Mistral-7B-Instruct-v0.1
CONTAINER_DEFAULT=vllm-mistral-1x

# Check if user asked for help
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
echo "Usage: $0 <model> <container_name>"
echo "Example: $0 mistralai/Mistral-7B-Instruct-v0.1 vllm-mistral-1"
exit 0
fi

# If user provided arguments, use them
if [[ ! -z "$1" && ! -z "$2" ]]; then
LLM=$1
CONTAINER=$2
fi

# Set variables to default if not set
if [[ -z "${LLM}" ]]; then
LLM=$LLM_DEFAULT
CONTAINER=$CONTAINER_DEFAULT
fi
LLM=mistralai/Mistral-7B-Instruct-v0.1
CONTAINER=vllm

echo "Stopping and removing any previous $CONTAINER instance..."
docker stop $CONTAINER
docker rm $CONTAINER

echo "Starting new $CONTAINER instance..."

docker run -d \
-p 8000:8000 \
--shm-size=10.24gb \
--gpus all \
-e MODEL=$LLM \
-e PORT=8000 \
-e GPU_MEMORY_UTILIZATION=0.95 \
-e NUM_GPU=1 \
-e SERVED_MODEL_NAME=tinyllm \
-e HF_HOME=/app/models \
-v $PWD/models:/app/models \
--restart unless-stopped \
--name $CONTAINER \
vllm

# Additional options: -e EXTRA_ARGS="" -e MAX_MODEL_LEN=xxxxx -e QUANTIZATION=awq -e DTYPE=auto
echo "Starting vLLM $CONTAINER..."
docker run -d --gpus all \
-v $PWD/models:/root/.cache/huggingface \
-p 8000:8000 \
--env "HF_TOKEN={Your_Hugingface_Token}" \
--restart unless-stopped \
--name $CONTAINER \
vllm \
--host 0.0.0.0 \
--model=$LLM \
--gpu-memory-utilization 0.95 \
--enforce-eager \
--served-model-name $LLM \
--dtype=float

# Additional arguments to pass to the API server on startup:
# --gpu-memory-utilization
# --max-model-len
# --dtype auto|half
# --quantization
# --disable-log-requests

echo "Printing logs (^C to quit)..."

24 changes: 0 additions & 24 deletions vllm/setup.py.patch

This file was deleted.

