Update vllm build and compile scripts #8
jasonacox committed Jun 8, 2024
1 parent 06dea33 commit 5dbb701
Showing 12 changed files with 152 additions and 156 deletions.
103 changes: 43 additions & 60 deletions vllm/README.md
@@ -63,81 +63,64 @@ cd vllm
# git checkout 220a476
```

2. Create Dockerfile ([link](./Dockerfile.source))

```dockerfile
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04
RUN apt-get update -y \
&& apt-get install -y python3-pip
WORKDIR /app
COPY . .
RUN python3 -m pip install -e .
EXPOSE 8000
COPY entrypoint.sh /usr/local/bin/
CMD [ "entrypoint.sh" ]
```

3. Create entrypoint.sh ([link](./entrypoint.sh))

```bash
# Start the vLLM OpenAI API compatible server
python3 -m vllm.entrypoints.openai.api_server \
--tensor-parallel-size ${NUM_GPU} \
--worker-use-ray \
--host 0.0.0.0 \
--port "${PORT}" \
--model "${MODEL}" \
--served-model-name "${MODEL}" ${EXTRA_ARGS}
```

4. Edit setup.py (see [patch](./setup.py.patch))
2. Edit Dockerfile and CMakeLists.txt (a quick way to check your GPU's compute capability is sketched after the patch below):

```patch
--- _setup.py 2024-01-27 18:44:45.509406538 +0000
+++ setup.py 2024-01-28 00:02:23.581639719 +0000
@@ -18,7 +18,7 @@
MAIN_CUDA_VERSION = "12.1"
--- _Dockerfile 2024-06-07 22:09:30.069782339 -0700
+++ Dockerfile 2024-06-07 22:10:02.357875428 -0700
@@ -35,7 +35,7 @@
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ARG torch_cuda_arch_list='6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################

# Supported NVIDIA GPU architectures.
-NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
+NVIDIA_SUPPORTED_ARCHS = {"6.0", "6.1", "6.2", "7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
--- _CMakeList.txt 2024-06-07 22:08:27.657601121 -0700
+++ CMakeLists.txt 2024-06-07 22:09:01.541699767 -0700
@@ -16,7 +16,7 @@
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")

@@ -184,9 +184,9 @@
device_count = torch.cuda.device_count()
for i in range(device_count):
major, minor = torch.cuda.get_device_capability(i)
- if major < 7:
+ if major < 6:
raise RuntimeError(
- "GPUs with compute capability below 7.0 are not supported.")
+ "GPUs with compute capability below 6.0 are not supported.")
compute_capabilities.add(f"{major}.{minor}")
# Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0")

ext_modules = []
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
```
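
Before applying this change on a new machine, it can help to confirm the compute capability of the installed GPUs, since the patch above only extends support down to Pascal (6.0/6.1). A minimal check, assuming a driver recent enough that `nvidia-smi` exposes the `compute_cap` query field (with a PyTorch fallback):

```bash
# Report each GPU's name and CUDA compute capability (e.g. "6.1" for a GTX 1080 Ti).
nvidia-smi --query-gpu=name,compute_cap --format=csv

# Fallback if this nvidia-smi is too old to know about compute_cap.
python3 -c "import torch; print([torch.cuda.get_device_capability(i) for i in range(torch.cuda.device_count())])"
```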

5. Create build.sh ([link](./build.sh))
3. Create build.sh ([link](./build.sh))

```bash
# Create Container
nvidia-docker build -t vllm .
nvidia-docker build . -f Dockerfile --target vllm-openai --tag vllm
```

6. Create run.sh ([link](./run.sh))
4. Create run.sh ([link](./run.sh))

```bash
# Run Container
nvidia-docker run -d -p 8000:8000 --gpus=all --shm-size=10.24gb \
-e MODEL=mistralai/Mistral-7B-Instruct-v0.1 \
-e PORT=8000 \
-e HF_HOME=/app/models \
-e NUM_GPU=4 \
-e EXTRA_ARGS="--dtype float --max-model-len 20000" \
-v $PWD/models:/app/models \
--name vllm \
vllm
nvidia-docker run -d --gpus all --shm-size=10.24gb -p 8000:8000 \
-v $PWD/models:/root/.cache/huggingface \
--env "HF_TOKEN={Your_Hugingface_Token}" \
--restart unless-stopped \
--name vllm \
vllm \
--host 0.0.0.0 \
--model=mistralai/Mistral-7B-Instruct-v0.1 \
--served-model-name vllm \
--dtype=float \
--max-model-len 20000

# Additional arguments to pass to the API server on startup:
# --gpu-memory-utilization 0.95
# --dtype auto|half|float
# --quantization awq
# --disable-log-requests
# --tensor-parallel-size NUM_GPU
# --enforce-eager

# Print Running Logs - ^C to Stop Viewing Logs
docker logs vllm -f
```
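
Once the container is up, the OpenAI-compatible endpoint can be exercised directly. A minimal smoke test, assuming the server is reachable on localhost:8000 and serving the model under the name `vllm` as configured above:

```bash
# List the models the server advertises.
curl http://localhost:8000/v1/models

# Send a short chat completion to the served model.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "vllm", "messages": [{"role": "user", "content": "Say hello."}], "max_tokens": 32}'
```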
2 changes: 1 addition & 1 deletion vllm/build.sh
@@ -6,4 +6,4 @@

echo "Build vllm docker image..."

docker build -t vllm .
DOCKER_BUILDKIT=1 docker build . -f Dockerfile --target vllm-openai --tag vllm
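
The vLLM Dockerfile exposes `torch_cuda_arch_list` as a build argument (see the patch above), so the build can be narrowed to a single architecture to shorten compile time. A sketch, assuming a Pascal-only (6.1) host:

```bash
# Optional: compile kernels for a single CUDA architecture only.
DOCKER_BUILDKIT=1 docker build . -f Dockerfile --target vllm-openai --tag vllm \
  --build-arg torch_cuda_arch_list='6.1'
```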
10 changes: 3 additions & 7 deletions vllm/compile.sh
@@ -12,25 +12,21 @@ cd vllm

# Copy helpful files
echo "Copying helpful files..."
mv Dockerfile Dockerfile.orig # save original
cp ../Dockerfile.source Dockerfile
cp ../entrypoint.sh entrypoint.sh
cp ../run-pascal.sh run.sh
cp ../build.sh build.sh
cp ../setup.py.patch setup.py.patch
cp ../vllm.patch vllm.patch

# Patch the source code
echo "Patching source code..."
cp setup.py setup.py.orig # save original
patch setup.py setup.py.patch
patch < vllm.patch

# Build the docker image
echo "Building docker image..."
./build.sh

# Make models directory
echo "Creating models directory..."
mkdir models
mkdir -p models
echo "Models will be stored in ${PWD}/models."

# Done
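
To confirm the patch still applies cleanly against a newer checkout before building, a dry run with GNU patch is a low-risk check (assuming vllm.patch has been copied into the source tree as above):

```bash
# Check that the patch applies without actually modifying any files.
patch --dry-run < vllm.patch
```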
File renamed without changes.
File renamed without changes.
9 changes: 9 additions & 0 deletions vllm/optional_build/build.sh
@@ -0,0 +1,9 @@
#!/bin/bash
# Build vllm docker image
#
# Date: 27-Jan-2024
# https://github.com/jasonacox/TinyLLM

echo "Build vllm docker image..."

DOCKER_BUILDKIT=1 docker build . -f Dockerfile --target vllm-openai --tag vllm
File renamed without changes.
30 changes: 24 additions & 6 deletions vllm/run-pascal.sh → vllm/optional_build/run.sh
@@ -1,14 +1,34 @@
#!/bin/bash
# Run vllm docker image
#
# Usage: run.sh <model> <container_name>
#
# Author: Jason A. Cox,
# Date: 27-Jan-2024
# https://github.com/jasonacox/TinyLLM

LLM=mistralai/Mistral-7B-Instruct-v0.1
CONTAINER=vllm-mistral-1
#LLM=mistralai/Mistral-7B-Instruct-v0.2
#CONTAINER=vllm-mistral-2
# Set Defaults
LLM_DEFAULT=mistralai/Mistral-7B-Instruct-v0.1
CONTAINER_DEFAULT=vllm-mistral-1x

# Check if user asked for help
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
echo "Usage: $0 <model> <container_name>"
echo "Example: $0 mistralai/Mistral-7B-Instruct-v0.1 vllm-mistral-1"
exit 0
fi

# If user provided arguments, use them
if [[ ! -z "$1" && ! -z "$2" ]]; then
LLM=$1
CONTAINER=$2
fi

# Set variables to default if not set
if [[ -z "${LLM}" ]]; then
LLM=$LLM_DEFAULT
CONTAINER=$CONTAINER_DEFAULT
fi

echo "Stopping and removing any previous $CONTAINER instance..."
docker stop $CONTAINER
@@ -23,8 +43,6 @@ docker run -d \
-e MODEL=$LLM \
-e PORT=8000 \
-e GPU_MEMORY_UTILIZATION=0.95 \
-e DTYPE=float \
-e MAX_MODEL_LEN=20000 \
-e NUM_GPU=1 \
-e SERVED_MODEL_NAME=tinyllm \
-e HF_HOME=/app/models \
44 changes: 26 additions & 18 deletions vllm/run-awq.sh
@@ -18,24 +18,32 @@ docker rm $CONTAINER

echo "Starting new $CONTAINER instance..."

docker run -d \
-p 8000:8000 \
--shm-size=10.24gb \
--gpus all \
-e MODEL=$LLM \
-e PORT=8000 \
-e GPU_MEMORY_UTILIZATION=0.95 \
-e QUANTIZATION=awq \
-e DTYPE=$QT \
-e NUM_GPU=1 \
-e SERVED_MODEL_NAME=tinyllm \
-e HF_HOME=/app/models \
-v $PWD/models:/app/models \
--restart unless-stopped \
--name $CONTAINER \
vllm

# Additional options: -e EXTRA_ARGS="" -e MAX_MODEL_LEN=xxxxx
docker run -d --gpus all \
-v $PWD/models:/root/.cache/huggingface \
-p 8000:8000 \
--env "HF_TOKEN={Your_Hugingface_Token}" \
--restart unless-stopped \
--name $CONTAINER \
vllm/vllm-openai:latest \
--host 0.0.0.0 \
--model=$LLM \
--gpu-memory-utilization 0.95 \
--enforce-eager \
--served-model-name $LLM \
--disable-log-requests \
--dtype=auto \
--quantization awq

# Additional arguments to pass to the API server on startup:
# --gpu-memory-utilization
# --max-model-len
# --dtype
# --quantization
# --enforce-eager
# --disable-log-requests


-q ${QUANTIZATION} --dtype ${DTYPE}"
echo "Printing logs (^C to quit)..."
63 changes: 23 additions & 40 deletions vllm/run.sh
@@ -1,57 +1,40 @@
#!/bin/bash
# Run vllm docker image
#
# Usage: run.sh <model> <container_name>
#
# Author: Jason A. Cox,
# Date: 27-Jan-2024
# https://github.com/jasonacox/TinyLLM

# Set Defaults
LLM_DEFAULT=mistralai/Mistral-7B-Instruct-v0.1
CONTAINER_DEFAULT=vllm-mistral-1x

# Check if user asked for help
if [[ "$1" == "-h" || "$1" == "--help" ]]; then
echo "Usage: $0 <model> <container_name>"
echo "Example: $0 mistralai/Mistral-7B-Instruct-v0.1 vllm-mistral-1"
exit 0
fi

# If user provided arguments, use them
if [[ ! -z "$1" && ! -z "$2" ]]; then
LLM=$1
CONTAINER=$2
fi

# Set variables to default if not set
if [[ -z "${LLM}" ]]; then
LLM=$LLM_DEFAULT
CONTAINER=$CONTAINER_DEFAULT
fi
LLM=mistralai/Mistral-7B-Instruct-v0.1
CONTAINER=vllm

echo "Stopping and removing any previous $CONTAINER instance..."
docker stop $CONTAINER
docker rm $CONTAINER

echo "Starting new $CONTAINER instance..."

docker run -d \
-p 8000:8000 \
--shm-size=10.24gb \
--gpus all \
-e MODEL=$LLM \
-e PORT=8000 \
-e GPU_MEMORY_UTILIZATION=0.95 \
-e NUM_GPU=1 \
-e SERVED_MODEL_NAME=tinyllm \
-e HF_HOME=/app/models \
-v $PWD/models:/app/models \
--restart unless-stopped \
--name $CONTAINER \
vllm

# Additional options: -e EXTRA_ARGS="" -e MAX_MODEL_LEN=xxxxx -e QUANTIZATION=awq -e DTYPE=auto
echo "Starting vLLM $CONTAINER..."
docker run -d --gpus all \
-v $PWD/models:/root/.cache/huggingface \
-p 8000:8000 \
--env "HF_TOKEN={Your_Hugingface_Token}" \
--restart unless-stopped \
--name $CONTAINER \
vllm \
--host 0.0.0.0 \
--model=$LLM \
--gpu-memory-utilization 0.95 \
--enforce-eager \
--served-model-name $LLM \
--dtype=float

# Additional arguments to pass to the API server on startup:
# --gpu-memory-utilization
# --max-model-len
# --dtype auto|half
# --quantization
# --disable-log-requests

echo "Printing logs (^C to quit)..."

24 changes: 0 additions & 24 deletions vllm/setup.py.patch

This file was deleted.

