From 2f24d9d06cf1d506ef46e6f801f7f24f370848ee Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Fri, 2 Jun 2023 17:09:37 -0700
Subject: [PATCH] Slim down docker, fix OSS cluster config (#106)

Closes https://github.com/anyscale/aviary/issues/92
Closes https://github.com/ray-project/aviary/issues/5

---------

Signed-off-by: Antoni Baum
---
NOTE(review): this copy of the patch had been collapsed onto three lines. The
line breaks below were restored and cross-checked against every hunk header
and the diffstat. The leading indentation of the YAML and setup.py lines could
not be recovered and is a best guess -- compare with the upstream commit, or
apply with `git apply --ignore-whitespace`. The From: and Signed-off-by: lines
carry no e-mail address in this copy.

NOTE(review): in deploy/ray/Dockerfile the trailing
`RUN pip cache purge && conda clean -a && rm -rf ~/.cache` is its own layer,
so it cannot reduce the size of the layers written by the earlier
`pip install` steps. Possible follow-up: pass `--no-cache-dir` to each
`pip install`, or clean up inside the same RUN that created the files.

 deploy/_internal/backend/cluster-env.yaml |  7 -------
 deploy/ray/Dockerfile                     | 20 ++++++++------------
 deploy/ray/aviary-cluster.yaml            | 20 ++++++++++++++++++++
 setup.py                                  |  7 -------
 4 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/deploy/_internal/backend/cluster-env.yaml b/deploy/_internal/backend/cluster-env.yaml
index 0d11f643..3a05c7b6 100644
--- a/deploy/_internal/backend/cluster-env.yaml
+++ b/deploy/_internal/backend/cluster-env.yaml
@@ -15,7 +15,6 @@ post_build_cmds:
     pip install \
       "async_timeout" \
       "markdown-it-py[plugins]" \
-      "git+https://github.com/huggingface/diffusers.git" \
       "accelerate" \
       "transformers>=4.25.1" \
       "datasets" \
@@ -29,14 +28,8 @@ post_build_cmds:
       "bitsandbytes" \
       "git+https://github.com/Yard1/DeepSpeed.git@aviary" \
       "numpy<1.24" \
-      "pytorch-lightning" \
       "ninja" \
       "protobuf<3.21.0" \
       "git+https://github.com/huggingface/optimum.git" \
       "torchmetrics" \
-      "git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836" \
-      "lm_eval==0.3.0" \
-      "tiktoken==0.1.2" \
-      "pybind11==2.6.2" \
-      "einops==0.3.0" \
       "safetensors"
diff --git a/deploy/ray/Dockerfile b/deploy/ray/Dockerfile
index 218ae64c..3b692fb2 100644
--- a/deploy/ray/Dockerfile
+++ b/deploy/ray/Dockerfile
@@ -1,22 +1,18 @@
-FROM rayproject/ray-ml:nightly-gpu
+FROM rayproject/ray:nightly-cu118
 
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
-
-RUN sudo apt-get update
-RUN sudo apt-get install -y libaio-dev git-lfs awscli
+RUN sudo apt-get update && sudo apt-get install -y libaio-dev git-lfs awscli && sudo rm -rf /var/lib/apt/lists/*
 
-RUN pip install --upgrade pip
-RUN pip uninstall -y ray torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric tensorflow
 RUN conda install python=3.10
-RUN pip install "ray[default,serve] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl"
+RUN pip install --upgrade pip && pip install "ray[default,serve] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl"
+RUN pip install -i https://download.pytorch.org/whl/cu118 torch torchvision torchaudio
 
 COPY "./dist" "/home/ray/dist"
 RUN cd /home/ray/dist && pip install "$(ls *.whl | head -n1)[backend]"
-
-COPY "./deploy/ray/backend.yaml" "/home/ray/abcd"
-
 # The build context should be the root of the repo
 # So this gives the model definitions
 COPY "./models" "/home/ray/models"
 
-RUN echo "Testing aviary install" && python -c "import aviary.backend"
\ No newline at end of file
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+RUN echo "Testing aviary install" && python -c "import aviary.backend"
+
+RUN pip cache purge && conda clean -a && rm -rf ~/.cache
\ No newline at end of file
diff --git a/deploy/ray/aviary-cluster.yaml b/deploy/ray/aviary-cluster.yaml
index ec9de3ee..e45e5977 100644
--- a/deploy/ray/aviary-cluster.yaml
+++ b/deploy/ray/aviary-cluster.yaml
@@ -10,6 +10,26 @@ docker:
     image: "anyscale/aviary:latest"
     container_name: "aviary"
 
+# All the 'conda activate' are necessary to ensure we are in the
+# python 3.10 conda env.
+setup_commands:
+    - echo "conda activate" >> ~/.bashrc
+
+head_setup_commands:
+    - conda activate && pip install 'boto3>=1.4.8'
+
+worker_setup_commands: []
+
+head_start_ray_commands:
+    - conda activate && ray stop
+    - conda activate && ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
+
+worker_start_ray_commands:
+    - conda activate && ray stop
+    # We need to make sure RAY_HEAD_IP env var is accessible
+    # after conda activate.
+    - export RAY_HEAD_IP && echo "export RAY_HEAD_IP=$RAY_HEAD_IP" >> ~/.bashrc && conda activate && ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+
 available_node_types:
     head_node_type:
         node_config:
diff --git a/setup.py b/setup.py
index 077fab87..fbb03181 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,6 @@
             "torch>=2.0.0",
             "torchaudio>=2.0.0",
             "torchvision>=0.15.2",
-            "diffusers @ git+https://github.com/huggingface/diffusers.git",
             "accelerate",
             "transformers>=4.25.1",
             "datasets",
@@ -35,16 +34,10 @@
             "bitsandbytes",
             "deepspeed @ git+https://github.com/Yard1/DeepSpeed.git@aviary",
             "numpy<1.24",
-            "pytorch-lightning",
             "ninja",
             "protobuf<3.21.0",
             "optimum @ git+https://github.com/huggingface/optimum.git",
             "torchmetrics",
-            "lm_dataformat @ git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836",
-            "lm_eval==0.3.0",
-            "tiktoken==0.1.2",
-            "pybind11==2.6.2",
-            "einops==0.3.0",
             "safetensors",
             "pydantic==1.10.7",
             "markdown-it-py[plugins]",