From 7ad8ffdc08c6d3e3fc41ac711f0065d20d6de997 Mon Sep 17 00:00:00 2001
From: AlpinDale
Date: Fri, 17 May 2024 23:13:21 +0000
Subject: [PATCH 1/8] feat: add async neuron executor

---
 aphrodite/engine/async_aphrodite.py   |  4 ++--
 aphrodite/executor/neuron_executor.py | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/aphrodite/engine/async_aphrodite.py b/aphrodite/engine/async_aphrodite.py
index 5dcbfe217..62fbe631c 100644
--- a/aphrodite/engine/async_aphrodite.py
+++ b/aphrodite/engine/async_aphrodite.py
@@ -340,8 +340,8 @@ def from_engine_args(cls,
         engine_config = engine_args.create_engine_config()
 
         if engine_config.device_config.device_type == "neuron":
-            raise NotImplementedError("Neuron is not supported for "
-                                      "async engine yet.")
+            from aphrodite.executor.neuron_executor import NeuronExecutor
+            executor_class = NeuronExecutor
         elif engine_config.device_config.device_type == "cpu":
             from aphrodite.executor.cpu_executor import CPUExecutor
             executor_class = CPUExecutor
diff --git a/aphrodite/executor/neuron_executor.py b/aphrodite/executor/neuron_executor.py
index 94de2e285..1fe7ba586 100644
--- a/aphrodite/executor/neuron_executor.py
+++ b/aphrodite/executor/neuron_executor.py
@@ -3,6 +3,7 @@
 from aphrodite.lora.request import LoRARequest
 from aphrodite.executor.executor_base import ExecutorBase
 from aphrodite.common.sequence import SamplerOutput, SequenceGroupMetadata
+from aphrodite.common.utils import make_async
 
 
 class NeuronExecutor(ExecutorBase):
@@ -57,6 +58,23 @@ def execute_model(self,
             seq_group_metadata_list=seq_group_metadata_list)
         return output
 
+    async def execute_model_async(
+        self, seq_group_metadata_list: List[SequenceGroupMetadata],
+        blocks_to_swap_in: Dict[int, int],
+        blocks_to_swap_out: Dict[int, int],
+        blocks_to_copy: Dict[int, List[int]],
+        num_lookahead_slots: int) -> List[SamplerOutput]:
+        assert (blocks_to_swap_in == {} and blocks_to_swap_out == {}
+                and blocks_to_copy == {}), (
+            "Cache operations are not supported for Neuron backend.")
+        assert num_lookahead_slots == 0, (
+            "lookahead not supported for Neuron backend.")
+
+        output = await make_async(
+            self.driver_worker.execute_model
+        )(seq_group_metadata_list=seq_group_metadata_list)
+        return output
+
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.driver_worker.add_lora(lora_request)
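The async path added above delegates to the make_async helper from aphrodite.common.utils, which turns the blocking driver_worker.execute_model call into an awaitable. A minimal sketch of that pattern, assuming the usual run_in_executor approach (the in-tree helper may differ in detail):

import asyncio
from functools import partial
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")


def make_async(func: Callable[..., T]) -> Callable[..., Awaitable[T]]:
    # Run the blocking call on the event loop's default thread pool so
    # the loop stays responsive while the Neuron driver executes the model.
    def _async_wrapper(*args, **kwargs) -> Awaitable[T]:
        loop = asyncio.get_event_loop()
        return loop.run_in_executor(None, partial(func, *args, **kwargs))

    return _async_wrapper

Because the Neuron backend neither swaps nor copies KV-cache blocks, the assertions in execute_model_async reject any request that would require those operations instead of silently ignoring it.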
From 573feadae56de49529ae91a9b7f418e3037adc7b Mon Sep 17 00:00:00 2001
From: AlpinDale
Date: Fri, 17 May 2024 23:14:26 +0000
Subject: [PATCH 2/8] fix: imports for neuron

---
 aphrodite/transformers_utils/config.py    | 5 ++++-
 aphrodite/transformers_utils/tokenizer.py | 5 +++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/aphrodite/transformers_utils/config.py b/aphrodite/transformers_utils/config.py
index f950b7de0..30f04bc44 100644
--- a/aphrodite/transformers_utils/config.py
+++ b/aphrodite/transformers_utils/config.py
@@ -8,7 +8,10 @@
 from aphrodite.transformers_utils.configs import (BaiChuanConfig, DbrxConfig,
                                                   ChatGLMConfig, MPTConfig,
                                                   QWenConfig, RWConfig)
-from aphrodite.quantization.gguf_utils import GGUFReader
+from aphrodite.common.utils import is_neuron
+
+if not is_neuron():
+    from aphrodite.quantization.gguf_utils import GGUFReader
 
 _CONFIG_REGISTRY = {
     "baichuan": BaiChuanConfig,
diff --git a/aphrodite/transformers_utils/tokenizer.py b/aphrodite/transformers_utils/tokenizer.py
index 05e0edbe0..5bda4b0ca 100644
--- a/aphrodite/transformers_utils/tokenizer.py
+++ b/aphrodite/transformers_utils/tokenizer.py
@@ -8,10 +8,11 @@ from loguru import logger
 
 from aphrodite.lora.request import LoRARequest
-from aphrodite.common.utils import make_async
-from aphrodite.quantization.gguf_utils import GGUFReader
+from aphrodite.common.utils import make_async, is_neuron
 from aphrodite.transformers_utils.tokenizers import BaichuanTokenizer
 
+if not is_neuron():
+    from aphrodite.quantization.gguf_utils import GGUFReader
 
 def convert_gguf_to_tokenizer(checkpoint):
     if os.path.isfile(checkpoint):
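The guarded imports above hinge on is_neuron() from aphrodite.common.utils: the GGUF reader pulls in native dependencies that are unavailable on Neuron images, so it is only imported elsewhere. A plausible sketch of such a probe, assuming detection keys off the transformers-neuronx package (the in-tree helper may use a different signal):

from functools import lru_cache


@lru_cache(maxsize=None)
def is_neuron() -> bool:
    # Treat the environment as a Neuron one when transformers-neuronx,
    # the AWS runtime integration, is importable; cache the result since
    # the answer cannot change within a process.
    try:
        import transformers_neuronx  # noqa: F401
    except ImportError:
        return False
    return True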
From 624f592d7ef0d823ed4c3e75ad8c594613ced5ed Mon Sep 17 00:00:00 2001
From: AlpinDale
Date: Sat, 18 May 2024 04:50:53 +0000
Subject: [PATCH 3/8] remove triton from common requirements

---
 requirements-common.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 6132cab1d..1215b61b7 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -13,7 +13,6 @@ fastapi
 colorlog
 einops # for phi
 prometheus_client # for prometheus metrics
-triton >= 2.2.0
 lark == 1.1.8 # for grammars
 scipy # for quip
 rich

From 7dac145f946722be01156b38ac4838ea764c5c71 Mon Sep 17 00:00:00 2001
From: theobjectivedad
Date: Tue, 18 Jun 2024 10:21:57 -0500
Subject: [PATCH 4/8] Add user to Docker image + cleanup

---
 docker/Dockerfile | 81 +++++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 34 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index adcdeb1d1..cde89a2b0 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,52 +1,65 @@
 FROM nvidia/cuda:12.1.1-devel-ubuntu22.04
 
-ENV HOME=/app/aphrodite-engine
+# Adjust UID/GID when mounted volumes need to match the host UID/GID
+ARG UID=1000
+ARG GID=1000
 
-WORKDIR $HOME
+ARG APHRODITE_BRANCH=main
+
+# Setting MAX_JOBS allows build servers to limit ninja build jobs. For reference
+# see https://github.com/PygmalionAI/aphrodite-engine/wiki/1.-Installation#build-from-source
+ARG MAX_JOBS
+
+# Setting TORCH_CUDA_ARCH_LIST specifies the CUDA architectures to compile for.
+# A list of possible values can be found at:
+# https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list
+ARG TORCH_CUDA_ARCH_LIST="6.1 7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+
+ARG FLASH_ATTN_VERSION="2.5.8"
 
 # Upgrade OS Packages + Prepare Python Environment
 RUN set -eux; \
-    export DEBIAN_FRONTEND=noninteractive \
-    && apt-get update \
-    && apt-get upgrade -y \
-    && apt-get install -y bzip2 g++ git make python3-pip tzdata \
-    && rm -fr /var/lib/apt/lists/*
-
-# Alias python3 to python
-RUN ln -s /usr/bin/python3 /usr/bin/python
+    export DEBIAN_FRONTEND=noninteractive; \
+    apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y \
+        bzip2 \
+        g++ \
+        git \
+        make \
+        python3-pip \
+        tzdata && \
+    python3 -m pip install --no-cache-dir --upgrade pip && \
+    ln -s /usr/bin/python3 /usr/bin/python && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
-RUN python3 -m pip install --no-cache-dir --upgrade pip
+# Add image service account
+ENV USER=aphrodite-engine
+ENV APP_HOME=/app
+RUN groupadd -g ${GID} ${USER} && \
+    useradd -u ${UID} -g ${GID} -d /home/${USER} -m -s /bin/bash ${USER}
 
-RUN git clone https://github.com/PygmalionAI/aphrodite-engine.git /tmp/aphrodite-engine \
-    && mv /tmp/aphrodite-engine/* . \
-    && rm -fr /tmp/aphrodite-engine \
-    && chmod +x docker/entrypoint.sh
+# Install aphrodite-engine, creates APP_HOME
+RUN git clone -b ${APHRODITE_BRANCH} https://github.com/PygmalionAI/aphrodite-engine.git ${APP_HOME} && \
+    chmod +x ${APP_HOME}/docker/entrypoint.sh
 
-# Allow build servers to limit ninja build jobs. For reference
-# see https://github.com/PygmalionAI/aphrodite-engine/wiki/1.-Installation#build-from-source
-ARG MAX_JOBS
+# Install aphrodite-engine dependencies
 ENV MAX_JOBS=${MAX_JOBS}
-
-# Export the CUDA_HOME variable correctly
 ENV CUDA_HOME=/usr/local/cuda
-
 ENV HF_HOME=/tmp
 ENV NUMBA_CACHE_DIR=$HF_HOME/numba_cache
-ENV TORCH_CUDA_ARCH_LIST="6.1 7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-RUN python3 -m pip install --no-cache-dir -e .
-
-# Workaround to properly install flash-attn. For reference
-# see: https://github.com/Dao-AILab/flash-attention/issues/453
-RUN python3 -m pip install 'flash-attn>=2.5.8' --no-build-isolation
+ENV TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
+RUN python3 -m pip install --no-cache-dir -e ${APP_HOME}
 
-# Entrypoint exec form doesn't do variable substitution automatically ($HOME)
-ENTRYPOINT ["/app/aphrodite-engine/docker/entrypoint.sh"]
+# Workaround to properly install flash-attn. This needs to be executed after installing aphrodite-engine
+# build dependencies. For reference see: https://github.com/Dao-AILab/flash-attention/issues/453
+RUN python3 -m pip install "flash-attn>=${FLASH_ATTN_VERSION}" --no-build-isolation --no-cache-dir
 
 EXPOSE 7860
 
-# Service UID needs write access to $HOME to create temporary folders, see #458
-RUN chown 1000:1000 ${HOME}
-
-USER 1000:0
-
 VOLUME ["/tmp"]
+
+USER ${USER}
+WORKDIR ${APP_HOME}
+ENTRYPOINT ["/bin/sh", "-c", "${APP_HOME}/docker/entrypoint.sh"]
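With the build arguments introduced in this patch, a build that maps the container service account onto the invoking host user could look as follows (the image tag and the MAX_JOBS and TORCH_CUDA_ARCH_LIST values are illustrative):

docker build -f docker/Dockerfile \
    --build-arg UID="$(id -u)" \
    --build-arg GID="$(id -g)" \
    --build-arg MAX_JOBS=8 \
    --build-arg TORCH_CUDA_ARCH_LIST="8.6+PTX" \
    -t aphrodite-engine:dev .

Matching UID/GID this way lets the unprivileged aphrodite-engine user write to bind-mounted volumes directly, replacing workarounds like the removed chown 1000:1000 step.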
From cdc0f9b9ebe4976f7ea8cb43dd49764b40900e5f Mon Sep 17 00:00:00 2001
From: AlpinDale <52078762+AlpinDale@users.noreply.github.com>
Date: Mon, 1 Jul 2024 21:15:50 +0000
Subject: [PATCH 5/8] Update neuron_executor.py

---
 aphrodite/executor/neuron_executor.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/aphrodite/executor/neuron_executor.py b/aphrodite/executor/neuron_executor.py
index ee06c58bc..97dd333db 100644
--- a/aphrodite/executor/neuron_executor.py
+++ b/aphrodite/executor/neuron_executor.py
@@ -59,23 +59,6 @@ def execute_model(self,
             seq_group_metadata_list=seq_group_metadata_list)
         return output
 
-    async def execute_model_async(
-        self, seq_group_metadata_list: List[SequenceGroupMetadata],
-        blocks_to_swap_in: Dict[int, int],
-        blocks_to_swap_out: Dict[int, int],
-        blocks_to_copy: Dict[int, List[int]],
-        num_lookahead_slots: int) -> List[SamplerOutput]:
-        assert (blocks_to_swap_in == {} and blocks_to_swap_out == {}
-                and blocks_to_copy == {}), (
-            "Cache operations are not supported for Neuron backend.")
-        assert num_lookahead_slots == 0, (
-            "lookahead not supported for Neuron backend.")
-
-        output = await make_async(
-            self.driver_worker.execute_model
-        )(seq_group_metadata_list=seq_group_metadata_list)
-        return output
-
     def add_lora(self, lora_request: LoRARequest) -> bool:
         return self.driver_worker.add_lora(lora_request)

From a44ced0d5706c501aa7a18106b61c94c20ccd552 Mon Sep 17 00:00:00 2001
From: AlpinDale <52078762+AlpinDale@users.noreply.github.com>
Date: Mon, 1 Jul 2024 21:16:08 +0000
Subject: [PATCH 6/8] Update tokenizer.py

---
 aphrodite/transformers_utils/tokenizer.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/aphrodite/transformers_utils/tokenizer.py b/aphrodite/transformers_utils/tokenizer.py
index a486dc394..c1a1020d1 100644
--- a/aphrodite/transformers_utils/tokenizer.py
+++ b/aphrodite/transformers_utils/tokenizer.py
@@ -10,9 +10,6 @@
 from aphrodite.lora.request import LoRARequest
 from aphrodite.transformers_utils.tokenizers import BaichuanTokenizer
 
-if not is_neuron():
-    from aphrodite.quantization.gguf_utils import GGUFReader
-
 def get_cached_tokenizer(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:

From 5852f0222bfc6c0214d8db2f69d27972178dab22 Mon Sep 17 00:00:00 2001
From: AlpinDale <52078762+AlpinDale@users.noreply.github.com>
Date: Mon, 1 Jul 2024 21:16:26 +0000
Subject: [PATCH 7/8] Update tokenizer.py

---
 aphrodite/transformers_utils/tokenizer.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aphrodite/transformers_utils/tokenizer.py b/aphrodite/transformers_utils/tokenizer.py
index c1a1020d1..caffbf168 100644
--- a/aphrodite/transformers_utils/tokenizer.py
+++ b/aphrodite/transformers_utils/tokenizer.py
@@ -10,6 +10,7 @@
 from aphrodite.lora.request import LoRARequest
 from aphrodite.transformers_utils.tokenizers import BaichuanTokenizer
 
+
 def get_cached_tokenizer(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:

From fa9cce32d9603318874029f863c64b58b3d21ec6 Mon Sep 17 00:00:00 2001
From: AlpinDale <52078762+AlpinDale@users.noreply.github.com>
Date: Mon, 1 Jul 2024 21:18:16 +0000
Subject: [PATCH 8/8] Update docker/Dockerfile

---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 41b9c6e9c..8f5eb7cf1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -13,7 +13,7 @@ ARG MAX_JOBS
 
 # Setting TORCH_CUDA_ARCH_LIST specifies the CUDA architectures to compile for.
 # A list of possible values can be found at:
 # https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#gpu-feature-list
-ARG TORCH_CUDA_ARCH_LIST="6.1 7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+ARG TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
 
 ARG FLASH_ATTN_VERSION="2.5.8"
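The final patch only widens the default TORCH_CUDA_ARCH_LIST by adding compute capability 6.0 (Pascal). When the target GPU is known, overriding the list with that single architecture shortens compilation and shrinks the image; on hosts with a reasonably recent driver (older nvidia-smi builds lack the compute_cap query field), the value can be read directly:

# Print the compute capability of each visible GPU, e.g. "8.6"
nvidia-smi --query-gpu=compute_cap --format=csv,noheader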