Skip to content

Commit

Permalink
[CI/CD] add neuron docker and ci test scripts (vllm-project#3571)
Browse files Browse the repository at this point in the history
  • Loading branch information
liangfu authored Apr 18, 2024
1 parent 87fa80c commit cd2f63f
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 4 deletions.
37 changes: 37 additions & 0 deletions .buildkite/run-neuron-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
--model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0

while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
5 changes: 5 additions & 0 deletions .buildkite/test-template.j2
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ steps:
queue: amd
command: bash .buildkite/run-amd-test.sh

- label: "Neuron Test"
agents:
queue: neuron
command: bash .buildkite/run-neuron-test.sh

- label: "CPU Test"
command: bash .buildkite/run-cpu-test.sh

Expand Down
36 changes: 36 additions & 0 deletions Dockerfile.neuron
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt

RUN cd /app/vllm \
&& python3 -m pip install -U -r requirements-neuron.txt

ENV VLLM_BUILD_WITH_NEURON 1
RUN cd /app/vllm \
&& pip install -e . \
&& cd ..

CMD ["/bin/bash"]
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,8 @@ def _is_neuron() -> bool:
subprocess.run(["neuron-ls"], capture_output=True, check=True)
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
torch_neuronx_installed = False
return torch_neuronx_installed
return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
False)


def _is_cpu() -> bool:
Expand Down
4 changes: 2 additions & 2 deletions vllm/engine/async_llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,8 @@ def from_engine_args(
engine_config = engine_args.create_engine_config()

if engine_config.device_config.device_type == "neuron":
raise NotImplementedError("Neuron is not supported for "
"async engine yet.")
from vllm.executor.neuron_executor import NeuronExecutorAsync
executor_class = NeuronExecutorAsync
elif engine_config.parallel_config.worker_use_ray:
initialize_ray_cluster(engine_config.parallel_config)
from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
Expand Down
22 changes: 21 additions & 1 deletion vllm/executor/neuron_executor.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from typing import Dict, List, Set, Tuple

from vllm.executor.executor_base import ExecutorBase
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.utils import make_async

logger = init_logger(__name__)

Expand Down Expand Up @@ -73,3 +74,22 @@ def check_health(self) -> None:
# NeuronExecutor will always be healthy as long as
# it's running.
return


class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):

async def execute_model_async(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
blocks_to_copy: Dict[int, List[int]],
) -> SamplerOutput:
output = await make_async(self.driver_worker.execute_model)(
seq_group_metadata_list=seq_group_metadata_list, )
return output

async def check_health_async(self) -> None:
# NeuronExecutor will always be healthy as long as
# it's running.
return

0 comments on commit cd2f63f

Please sign in to comment.