Slim down docker, fix OSS cluster config (#106)

Closes https://github.com/anyscale/aviary/issues/92
Closes #5

---------

Signed-off-by: Antoni Baum <[email protected]>
ray-project · Jun 3, 2023 · 2f24d9d
1 parent 5f1fa11
commit 2f24d9d
Showing 4 changed files with 28 additions and 26 deletions.
diff --git a/deploy/_internal/backend/cluster-env.yaml b/deploy/_internal/backend/cluster-env.yaml
@@ -15,7 +15,6 @@ post_build_cmds:
   pip install \
     "async_timeout" \
     "markdown-it-py[plugins]" \
-    "git+https://github.com/huggingface/diffusers.git" \
     "accelerate" \
     "transformers>=4.25.1" \
     "datasets" \
@@ -29,14 +28,8 @@ post_build_cmds:
     "bitsandbytes" \
     "git+https://github.com/Yard1/DeepSpeed.git@aviary" \
     "numpy<1.24" \
-    "pytorch-lightning" \
     "ninja" \
     "protobuf<3.21.0" \
     "git+https://github.com/huggingface/optimum.git" \
     "torchmetrics" \
-    "git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836" \
-    "lm_eval==0.3.0" \
-    "tiktoken==0.1.2" \
-    "pybind11==2.6.2" \
-    "einops==0.3.0" \
     "safetensors"
diff --git a/deploy/ray/Dockerfile b/deploy/ray/Dockerfile
@@ -1,22 +1,18 @@
-FROM rayproject/ray-ml:nightly-gpu
+FROM rayproject/ray:nightly-cu118
 
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
-
-RUN sudo apt-get update
-RUN sudo apt-get install -y libaio-dev git-lfs awscli
+RUN sudo apt-get update && sudo apt-get install -y libaio-dev git-lfs awscli && sudo rm -rf /var/lib/apt/lists/*
 
-RUN pip install --upgrade pip
-RUN pip uninstall -y ray torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric tensorflow
 RUN conda install python=3.10
-RUN pip install "ray[default,serve] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl"
+RUN pip install --upgrade pip && pip install "ray[default,serve] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl"
+RUN pip install -i https://download.pytorch.org/whl/cu118 torch torchvision torchaudio
 COPY "./dist" "/home/ray/dist"
 RUN cd /home/ray/dist && pip install "$(ls *.whl | head -n1)[backend]"
 
-
-COPY "./deploy/ray/backend.yaml" "/home/ray/abcd"
-
 # The build context should be the root of the repo
 # So this gives the model definitions
 COPY "./models" "/home/ray/models"
 
-RUN echo "Testing aviary install" && python -c "import aviary.backend"
+ENV HF_HUB_ENABLE_HF_TRANSFER=1
+RUN echo "Testing aviary install" && python -c "import aviary.backend"
+
+RUN pip cache purge && conda clean -a && rm -rf ~/.cache
diff --git a/deploy/ray/aviary-cluster.yaml b/deploy/ray/aviary-cluster.yaml
@@ -10,6 +10,26 @@ docker:
     image: "anyscale/aviary:latest"
     container_name: "aviary"
 
+# All of the 'conda activate' calls are necessary to ensure we are in the
+# Python 3.10 conda env.
+setup_commands:
+    - echo "conda activate" >> ~/.bashrc
+
+head_setup_commands:
+    - conda activate && pip install 'boto3>=1.4.8'
+
+worker_setup_commands: []
+
+head_start_ray_commands:
+    - conda activate && ray stop
+    - conda activate && ulimit -n 65536; ray start --head --port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --dashboard-host=0.0.0.0
+
+worker_start_ray_commands:
+    - conda activate && ray stop
+    # We need to make sure RAY_HEAD_IP env var is accessible
+    # after conda activate.
+    - export RAY_HEAD_IP && echo "export RAY_HEAD_IP=$RAY_HEAD_IP" >> ~/.bashrc && conda activate && ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+
 available_node_types:
   head_node_type:
     node_config:

diff --git a/setup.py b/setup.py
@@ -21,7 +21,6 @@
             "torch>=2.0.0",
             "torchaudio>=2.0.0",
             "torchvision>=0.15.2",
-            "diffusers @ git+https://github.com/huggingface/diffusers.git",
             "accelerate",
             "transformers>=4.25.1",
             "datasets",
@@ -35,16 +34,10 @@
             "bitsandbytes",
             "deepspeed @ git+https://github.com/Yard1/DeepSpeed.git@aviary",
             "numpy<1.24",
-            "pytorch-lightning",
             "ninja",
             "protobuf<3.21.0",
             "optimum @ git+https://github.com/huggingface/optimum.git",
             "torchmetrics",
-            "lm_dataformat @ git+https://github.com/EleutherAI/lm_dataformat.git@4eec05349977071bf67fc072290b95e31c8dd836",
-            "lm_eval==0.3.0",
-            "tiktoken==0.1.2",
-            "pybind11==2.6.2",
-            "einops==0.3.0",
             "safetensors",
             "pydantic==1.10.7",
             "markdown-it-py[plugins]",