Skip to content

Commit

Permalink
Templatize Dockerfiles & update workflows (#223)
Browse files Browse the repository at this point in the history
Images are now built by a script from shared Dockerfile templates.

---------

Co-authored-by: Binyang Li <[email protected]>
Co-authored-by: Saeed Maleki <[email protected]>
  • Loading branch information
3 people authored Nov 22, 2023
1 parent 15f6dcc commit dab19e0
Show file tree
Hide file tree
Showing 16 changed files with 113 additions and 222 deletions.
13 changes: 3 additions & 10 deletions .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ jobs:
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2

pool:
name: mscclpp
Expand All @@ -30,10 +30,8 @@ jobs:
inputs:
targetType: 'inline'
script: |
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand Down Expand Up @@ -122,10 +120,5 @@ jobs:
set -e
export PATH=/usr/local/mpi/bin:$PATH
python3 -m pip install .
if [[ '$(containerImage)' == *'cuda11'* ]]; then
pip3 install -r ./python/requirements_cu11.txt
else
pip3 install -r ./python/requirements_cu12.txt
fi
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
workingDirectory: '$(System.DefaultWorkingDirectory)'
8 changes: 3 additions & 5 deletions .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ jobs:
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2
pool:
name: mscclpp-it
container:
Expand All @@ -25,10 +25,8 @@ jobs:
inputs:
targetType: 'inline'
script: |
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
make -j
make pylib-copy
workingDirectory: '$(System.DefaultWorkingDirectory)'
Expand Down
15 changes: 4 additions & 11 deletions .azure-pipelines/ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ jobs:
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2

container:
image: $[ variables['containerImage'] ]
Expand All @@ -30,10 +30,8 @@ jobs:
inputs:
targetType: 'inline'
script: |
curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand Down Expand Up @@ -80,10 +78,5 @@ jobs:
set -e
export PATH=/usr/local/mpi/bin:$PATH
cd build && make pylib-copy
if [[ '$(containerImage)' == *'cuda11'* ]]; then
pip3 install -r ../python/requirements_cu11.txt
else
pip3 install -r ../python/requirements_cu12.txt
fi
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ../python/test/test_mscclpp.py -x
workingDirectory: '$(System.DefaultWorkingDirectory)'
6 changes: 3 additions & 3 deletions .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
name: Analyze
runs-on: 'ubuntu-latest'
container:
image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda-version }}

permissions:
actions: read
Expand All @@ -24,7 +24,7 @@ jobs:
fail-fast: false
matrix:
language: [ 'cpp', 'python' ]
cuda-version: [ 'cuda11.8', 'cuda12.1' ]
cuda-version: [ 'cuda11.8', 'cuda12.2' ]

steps:
- name: Checkout repository
Expand All @@ -45,7 +45,7 @@ jobs:
- name: Build
run: |
MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON .
cmake -DBYPASS_PEERMEM_CHECK=ON .
make -j
- name: Perform CodeQL Analysis
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/integration-test-backup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ jobs:
shell: bash
strategy:
matrix:
cuda: [ cuda11.8, cuda12.1 ]
cuda: [ cuda11.8, cuda12.2 ]

container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
Expand All @@ -23,7 +23,7 @@ jobs:
- name: Build
run: |
mkdir build && cd build
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
- name: Lock GPU clock frequency
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/ut-backup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ jobs:
timeout-minutes: 30
strategy:
matrix:
cuda: [ cuda11.8, cuda12.1 ]
cuda: [ cuda11.8, cuda12.2 ]

container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
Expand All @@ -24,7 +24,7 @@ jobs:
- name: Build
run: |
mkdir build && cd build
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}

Expand Down
59 changes: 0 additions & 59 deletions docker/base-cuda12.1.dockerfile

This file was deleted.

38 changes: 38 additions & 0 deletions docker/base-dev-x.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Development image template for MSCCL++.
#
# Extends a base image with developer tools (htop, lcov, vim), a pinned CMake
# release, and the Python requirements matching the CUDA major version.
#
# Build arguments:
#   BASE_IMAGE - image to extend; defaults to the cuda12.1 base image.
#   TARGET     - CUDA target such as "cuda12.1"; only its major version is
#                used, to pick python/requirements_cu<major>.txt.
ARG BASE_IMAGE=ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"

# `autoremove -y`: without -y apt-get stops at a confirmation prompt, which
# aborts a non-interactive build whenever there is something to remove.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        htop \
        lcov \
        vim \
    && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*

# Install cmake 3.26.4
# The tarball is downloaded to /tmp, unpacked under /usr/local, then deleted.
# NOTE(review): CMAKE_HOME and CMAKE_URL stay in the image environment even
# though CMAKE_HOME points at the (deleted) download location, not the install.
ENV CMAKE_VERSION="3.26.4"
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
# `curl -f`: fail on an HTTP error instead of saving the error page as the tarball.
RUN curl -fL "${CMAKE_URL}" -o "${CMAKE_HOME}.tar.gz" && \
    tar xzf "${CMAKE_HOME}.tar.gz" -C /usr/local && \
    rm -rf "${CMAKE_HOME}.tar.gz"
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"

# Install Python dependencies
# COPY (not ADD): this is a plain copy of the build context; none of ADD's
# URL-fetching or archive-extraction behavior is wanted here.
COPY . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda12.1"
# Extract the CUDA major version from TARGET ("cuda12.1" -> "12").
RUN cuda_major_version=$(echo "${TARGET}" | grep -oP 'cuda\K[0-9]+') && \
    python3 -m pip install --no-cache-dir -r "python/requirements_cu${cuda_major_version}.txt"

# Set PATH
# `>` truncates /etc/environment, so LD_LIBRARY_PATH is written back next to
# the updated PATH instead of being dropped from the file.
RUN echo PATH="${PATH}" > /etc/environment && \
    echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment

# Cleanup
# NOTE(review): this removes the sources from the final filesystem only; the
# earlier COPY layer still carries them.
RUN rm -rf /tmp/mscclpp
WORKDIR /
10 changes: 6 additions & 4 deletions docker/base-cuda11.8.dockerfile → docker/base-x.dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
FROM nvidia/cuda:11.8.0-devel-ubuntu20.04
ARG BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu20.04
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
Expand All @@ -7,8 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive

RUN rm -rf /opt/nvidia

RUN apt-get clean && \
apt-get update && \
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
Expand Down Expand Up @@ -50,10 +50,12 @@ RUN cd /tmp && \
cd .. && \
rm -rf /tmp/openmpi-${OPENMPI_VERSION}*

ARG EXTRA_LD_PATH=/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64
ENV PATH="/usr/local/mpi/bin:${PATH}" \
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-11.8/lib64:${LD_LIBRARY_PATH}"
LD_LIBRARY_PATH="/usr/local/mpi/lib:${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}"

RUN echo PATH="${PATH}" > /etc/environment && \
echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment

ENTRYPOINT []
WORKDIR /
46 changes: 46 additions & 0 deletions docker/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
#
# Build the MSCCL++ container images for one CUDA target.
#
# Usage: build.sh [cuda11.8|cuda12.1|cuda12.2]
#
# Two images are built from the repository root, in order:
#   1. <GHCR>:base-<target>      from docker/base-x.dockerfile
#   2. <GHCR>:base-dev-<target>  from docker/base-dev-x.dockerfile, on top of (1)
#
# Exits with status 1 when the target is missing or unknown.

set -euo pipefail

# CUDA target -> upstream NVIDIA image the base image is built from.
declare -A baseImageTable
baseImageTable=(
    ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
    ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
    ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
)

# CUDA target -> extra library directories passed to the base image as
# EXTRA_LD_PATH.
declare -A extraLdPathTable
extraLdPathTable=(
    ["cuda11.8"]="/usr/local/cuda-11.8/lib64"
    ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
    ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
)

GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
# Default to empty so a missing argument reaches the validation below instead
# of tripping `set -u`.
TARGET=${1:-}

print_usage() {
    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]"
}

# Reject a missing or unknown target. The empty check runs first because an
# empty associative-array subscript is itself a bash error; `${arr[key]+set}`
# tests for key existence without needing bash 4.3's `-v arr[key]`.
if [[ -z "${TARGET}" || -z "${baseImageTable[${TARGET}]+set}" ]]; then
    echo "Invalid target: ${TARGET}" >&2
    print_usage >&2
    exit 1
fi
echo "Target: ${TARGET}"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# Use the repository root as the build context; quoted so a checkout path
# containing spaces still works.
cd "${SCRIPT_DIR}/.."

docker build -t "${GHCR}:base-${TARGET}" \
    -f docker/base-x.dockerfile \
    --build-arg BASE_IMAGE="${baseImageTable[${TARGET}]}" \
    --build-arg EXTRA_LD_PATH="${extraLdPathTable[${TARGET}]}" \
    --build-arg TARGET="${TARGET}" .

# The dev image is layered on the base image that was just built.
docker build -t "${GHCR}:base-dev-${TARGET}" \
    -f docker/base-dev-x.dockerfile \
    --build-arg BASE_IMAGE="${GHCR}:base-${TARGET}" \
    --build-arg TARGET="${TARGET}" .
28 changes: 0 additions & 28 deletions docker/dev-cuda11.8.dockerfile

This file was deleted.

27 changes: 0 additions & 27 deletions docker/dev-cuda12.1.dockerfile

This file was deleted.

Loading

0 comments on commit dab19e0

Please sign in to comment.