diff --git a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile index 49ab8c03..ff229aa7 100644 --- a/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile +++ b/2.ami_and_containers/containers/pytorch/0.nvcr-pytorch-aws.dockerfile @@ -3,7 +3,8 @@ #################################################################################################### # This is a sample Dockerfile, with optional stanzas. Please read through this Dockerfile, -# understand what it does, then create your own Dockerfile. +# understand what it does, then create your own Dockerfile. Software versions are provided for +# illustration only. # # Sample build instructions: # @@ -19,13 +20,13 @@ # # Load image to local docker registry -> on head node, or new compute/build node. # docker load < /fsx/nvidia-pt-od__latest.tar #################################################################################################### -FROM nvcr.io/nvidia/pytorch:23.12-py3 +FROM nvcr.io/nvidia/pytorch:24.03-py3 ENV DEBIAN_FRONTEND=noninteractive # The three must-be-built packages. # Efa-installer>=1.29.1 required for nccl>=2.19.0 to avoid libfabric NCCL error. -ENV EFA_INSTALLER_VERSION=1.30.0 -ENV AWS_OFI_NCCL_VERSION=1.8.1-aws +ENV EFA_INSTALLER_VERSION=1.32.0 +ENV AWS_OFI_NCCL_VERSION=1.9.1-aws ENV NCCL_TESTS_VERSION=master ## Uncomment below when this Dockerfile builds a container image with efa-installer<1.29.1 and @@ -88,10 +89,13 @@ ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH # [CUSTOM_NCCL_OPTION_1] Uncomment below stanza to install another NCCL version using the official # binaries. # +# Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to +# find out the prebuilt nccl version in the parent image. +# # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the # aws-ofi-ccnl. 
#################################################################################################### -#ENV NCCL_VERSION=2.19.3-1 +#ENV NCCL_VERSION=2.21.5-1 #RUN cd /opt && \ # wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.0-1_all.deb && \ # dpkg -i cuda-keyring_1.0-1_all.deb && \ @@ -101,17 +105,21 @@ ENV PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:$PATH #################################################################################################### -# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. The -# benefits of installing to the same location as the built-in version are: +# [CUSTOM_NCCL_OPTION_2] Install NCCL from source to the same location as the built-in ones. +# +# Please consult https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html to +# find out the prebuilt nccl version in the parent image. +# +# Installation mechanics: # -# 1. There's only ever a single libnccl version offered by this image, preventing application from -# mistakenly chooses a wrong version. -# 2. No longer needing extra settings for LD_LIBRARY_PATH or LD_PRELOAD. +# 1. Remove pre-installed nccl to ensure there's only ever a single libnccl version offered by this +# image, preventing applications from mistakenly choosing a wrong version. +# 2. Install to default location, so no more extra settings for LD_LIBRARY_PATH or LD_PRELOAD. # # NCCL EFA plugin (aws-ofi-nccl) depends on mpi, hence we must rebuild openmpi before building the # aws-ofi-ccnl. 
#################################################################################################### -ENV NCCL_VERSION=2.19.3-1 +ENV NCCL_VERSION=2.21.5-1 RUN apt-get remove -y libnccl2 libnccl-dev \ && cd /tmp \ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION} \ diff --git a/2.ami_and_containers/containers/pytorch/README.md b/2.ami_and_containers/containers/pytorch/README.md index 79d65795..f0fc5c14 100644 --- a/2.ami_and_containers/containers/pytorch/README.md +++ b/2.ami_and_containers/containers/pytorch/README.md @@ -13,6 +13,10 @@ With that said, feel free to explore the example. Happy coding, and experimentin ## 1. Essential software +Please note that software versions in the template are provided for illustration only. For +well-tested combinations, please refer to the various Dockerfiles under `3.test_cases/` and +`4.validation_and_observability/0.nccl_tests/`. + In principle, the reference `Dockerfile` does the following: - Provide PyTorch built for NVidia CUDA devices, by using a recent NVidia PyTorch image as the