diff --git a/.ci/docker/README.md b/.ci/docker/README.md
new file mode 100644
index 00000000..9be91510
--- /dev/null
+++ b/.ci/docker/README.md
@@ -0,0 +1,22 @@
+# Docker images for TorchTitan CI
+
+This directory contains everything needed to build the Docker images
+that are used in TorchTitan CI. The contents of this directory are copied
+from PyTorch CI https://github.com/pytorch/pytorch/tree/main/.ci/docker.
+It also uses the same directory structure as PyTorch.
+
+## Contents
+
+* `build.sh` -- dispatch script to launch all builds
+* `common` -- scripts used to execute individual Docker build stages
+* `ubuntu` -- Dockerfile for Ubuntu image for CPU build and test jobs
+
+## Usage
+
+```bash
+# Generic usage
+./build.sh "${IMAGE_NAME}" "${DOCKER_BUILD_PARAMETERS}"
+
+# Build a specific image
+./build.sh torchtitan-ubuntu-20.04-clang12 -t myimage:latest
+```
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
new file mode 100755
index 00000000..f1d3d94e
--- /dev/null
+++ b/.ci/docker/build.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Build one of the TorchTitan CI Docker images.
+#
+# Usage: ./build.sh IMAGE_NAME [DOCKER_BUILD_PARAMETERS...]
+#
+# Must be run from this directory: the Dockerfile path and the build context
+# (".") are both resolved relative to the current working directory.
+
+set -exu
+
+# Fail with a usage message instead of a bare "unbound variable" error.
+IMAGE_NAME="${1:?Usage: $0 IMAGE_NAME [DOCKER_BUILD_PARAMETERS...]}"
+shift
+
+echo "Building ${IMAGE_NAME} Docker image"
+
+OS=ubuntu
+OS_VERSION=20.04
+CLANG_VERSION=""
+PYTHON_VERSION=3.11
+MINICONDA_VERSION=24.3.0-0
+
+# Per-image overrides of the defaults above.
+case "${IMAGE_NAME}" in
+  torchtitan-ubuntu-20.04-clang12)
+    CLANG_VERSION=12
+    ;;
+  *)
+    echo "Invalid image name ${IMAGE_NAME}" >&2
+    exit 1
+    ;;
+esac
+
+# Remaining arguments ("$@") are forwarded verbatim to `docker build`.
+docker build \
+  --no-cache \
+  --progress=plain \
+  --build-arg "OS_VERSION=${OS_VERSION}" \
+  --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
+  --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
+  --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \
+  -f "${OS}"/Dockerfile \
+  "$@" \
+  .
diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh
new file mode 100755
index 00000000..cbca22cf
--- /dev/null
+++ b/.ci/docker/common/install_base.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+# Install the base apt packages (build tools, VCS, download and debug utilities).
+install_ubuntu() {
+  apt-get update
+
+  apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    curl \
+    git \
+    wget \
+    sudo \
+    vim \
+    jq \
+    unzip \
+    gdb \
+    rsync \
+    libssl-dev \
+    zip
+
+  # Cleanup package manager
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+}
+
+# Install base packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    install_ubuntu
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
diff --git a/.ci/docker/common/install_clang.sh b/.ci/docker/common/install_clang.sh
new file mode 100755
index 00000000..ea0848a8
--- /dev/null
+++ b/.ci/docker/common/install_clang.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+install_ubuntu() {
+  apt-get update
+
+  apt-get install -y --no-install-recommends clang-"$CLANG_VERSION"
+  apt-get install -y --no-install-recommends llvm-"$CLANG_VERSION"
+  # Also require LLD linker from llvm and libomp to build PyTorch from source
+  apt-get install -y --no-install-recommends lld "libomp-${CLANG_VERSION}-dev"
+
+  # Use update-alternatives to make this version the default
+  update-alternatives --install /usr/bin/clang clang /usr/bin/clang-"$CLANG_VERSION" 50
+  update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-"$CLANG_VERSION" 50
+  # Override cc/c++ to clang as well
+  update-alternatives --install /usr/bin/cc cc /usr/bin/clang 50
+  update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang++ 50
+
+  # Cleanup package manager
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+}
+
+if [ -n "$CLANG_VERSION" ]; then
+  # Install clang depending on the base OS; skipped when CLANG_VERSION is empty
+  ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+  case "$ID" in
+    ubuntu)
+      install_ubuntu
+      ;;
+    *)
+      echo "Unable to determine OS..."
+      exit 1
+      ;;
+  esac
+fi
diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh
new file mode 100755
index 00000000..c89e21b6
--- /dev/null
+++ b/.ci/docker/common/install_conda.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+# shellcheck source=/dev/null
+source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
+
+install_miniconda() {
+  local BASE_URL="https://repo.anaconda.com/miniconda"
+  local CONDA_FILE="Miniconda3-py${PYTHON_VERSION//./}_${MINICONDA_VERSION}-Linux-x86_64.sh"
+
+  mkdir -p /opt/conda
+  chown ci-user:ci-user /opt/conda
+
+  pushd /tmp
+  wget -q "${BASE_URL}/${CONDA_FILE}"
+  # Install miniconda
+  as_ci_user bash "${CONDA_FILE}" -b -f -p "/opt/conda"
+  # Clean up the download file
+  rm "${CONDA_FILE}"
+  popd
+
+  sed -e 's|PATH="\(.*\)"|PATH="/opt/conda/bin:\1"|g' -i /etc/environment
+  export PATH="/opt/conda/bin:$PATH"
+}
+
+install_python() {
+  pushd /opt/conda
+  # Install the correct Python version
+  as_ci_user conda create -n "py_${PYTHON_VERSION}" -y --file /opt/conda/conda-env-ci.txt python="${PYTHON_VERSION}"
+  popd
+}
+
+install_pip_dependencies() {
+  pushd /opt/conda
+  # Install all Python dependencies
+  pip_install -r /opt/conda/dev-requirements.txt
+  pip_install -r /opt/conda/requirements.txt
+  popd
+}
+
+fix_conda_ubuntu_libstdcxx() {
+  cat /etc/issue
+  # WARNING: This is a HACK from PyTorch core to be able to build PyTorch on 22.04.
+  # Specifically, Ubuntu 20.04+ all come with a libstdc++ newer than 3.30, but anaconda
+  # is stuck with 3.29. So, remove libstdc++6.so.3.29 as installed by
+  # https://anaconda.org/anaconda/libstdcxx-ng/files?version=11.2.0
+  #
+  # PyTorch sev: https://github.com/pytorch/pytorch/issues/105248
+  # Ref: https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_conda.sh
+  if grep -q -e '2[02]\.04' /etc/issue; then  # dots escaped: match 20.04 / 22.04 literally
+    rm -f "/opt/conda/envs/py_${PYTHON_VERSION}/lib/libstdc++.so.6"  # -f: fine if already absent
+  fi
+}
+
+install_miniconda
+install_python
+install_pip_dependencies
+fix_conda_ubuntu_libstdcxx
diff --git a/.ci/docker/common/install_gcc.sh b/.ci/docker/common/install_gcc.sh
new file mode 100755
index 00000000..0a1f2025
--- /dev/null
+++ b/.ci/docker/common/install_gcc.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+# Optional step: does nothing unless GCC_VERSION is set to a non-empty value.
+if [ -n "$GCC_VERSION" ]; then
+
+  apt-get update
+  apt-get install -y --no-install-recommends g++-"$GCC_VERSION"
+  update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
+  update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
+  update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
+
+  # Cleanup package manager
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+fi
diff --git a/.ci/docker/common/install_user.sh b/.ci/docker/common/install_user.sh
new file mode 100755
index 00000000..f454c5d3
--- /dev/null
+++ b/.ci/docker/common/install_user.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -ex
+
+# Same as ec2-user
+echo "ci-user:x:1000:1000::/var/lib/ci-user:" >> /etc/passwd
+echo "ci-user:x:1000:" >> /etc/group
+# Needed on Focal or newer
+echo "ci-user:*:19110:0:99999:7:::" >> /etc/shadow
+
+# Create $HOME
+mkdir -p /var/lib/ci-user
+chown ci-user:ci-user /var/lib/ci-user
+
+# Allow sudo
+echo 'ci-user ALL=(ALL) NOPASSWD:ALL' > /etc/sudoers.d/ci-user
+
+# Test that sudo works
+sudo -u ci-user sudo -v
diff --git a/.ci/docker/common/utils.sh b/.ci/docker/common/utils.sh
new file mode 100644
index 00000000..427f4201
--- /dev/null
+++ b/.ci/docker/common/utils.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+as_ci_user() {
+  # NB: unsetting the environment variables works around a conda bug
+  # https://github.com/conda/conda/issues/6576
+  # NB: Pass on PATH and LD_LIBRARY_PATH to sudo invocation
+  # NB: This must be run from a directory that the user has access to
+  sudo -E -H -u ci-user env -u SUDO_UID -u SUDO_GID -u SUDO_COMMAND -u SUDO_USER env "PATH=${PATH}" "LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}" "$@"
+}
+
+conda_install() {
+  # Ensure that the install command doesn't upgrade/downgrade Python
+  # This should be called as
+  #   conda_install pkg1 pkg2 ... [-c channel]
+  as_ci_user conda install -q -n "py_${PYTHON_VERSION}" -y python="${PYTHON_VERSION}" "$@"
+}
+
+conda_run() {
+  as_ci_user conda run -n "py_${PYTHON_VERSION}" --no-capture-output "$@"
+}
+
+pip_install() {
+  conda_run pip install --progress-bar off "$@"
+}
diff --git a/.ci/docker/conda-env-ci.txt b/.ci/docker/conda-env-ci.txt
new file mode 100644
index 00000000..9ebfe654
--- /dev/null
+++ b/.ci/docker/conda-env-ci.txt
@@ -0,0 +1,2 @@
+cmake=3.22.1
+ninja=1.10.2
diff --git a/.ci/docker/dev-requirements.txt b/.ci/docker/dev-requirements.txt
new file mode 100644
index 00000000..1f960b0b
--- /dev/null
+++ b/.ci/docker/dev-requirements.txt
@@ -0,0 +1,3 @@
+pytest
+pytest-cov
+pre-commit
diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt
new file mode 100644
index 00000000..b82120a6
--- /dev/null
+++ b/.ci/docker/requirements.txt
@@ -0,0 +1,7 @@
+torch >= 2.2.0.dev
+datasets
+tomli >= 1.1.0 ; python_version < "3.11"
+tensorboard
+sentencepiece
+tiktoken
+blobfile
diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile
new file mode 100644
index 00000000..ba276c29
--- /dev/null
+++ b/.ci/docker/ubuntu/Dockerfile
@@ -0,0 +1,40 @@
+ARG OS_VERSION
+
+FROM nvidia/cuda:12.1.0-cudnn8-runtime-ubuntu${OS_VERSION}
+
+ARG OS_VERSION
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install common dependencies
+COPY ./common/install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
+
+# Install clang
+ARG CLANG_VERSION
+COPY ./common/install_clang.sh install_clang.sh
+RUN bash ./install_clang.sh && rm install_clang.sh
+
+# Install gcc
+ARG GCC_VERSION
+COPY ./common/install_gcc.sh install_gcc.sh
+RUN bash ./install_gcc.sh && rm install_gcc.sh
+
+# Setup user
+COPY ./common/install_user.sh install_user.sh
+RUN bash ./install_user.sh && rm install_user.sh
+
+# Install conda and other dependencies
+ARG MINICONDA_VERSION
+ARG PYTHON_VERSION
+ENV PYTHON_VERSION=$PYTHON_VERSION
+ENV PATH=/opt/conda/envs/py_$PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY dev-requirements.txt /opt/conda/
+COPY requirements.txt /opt/conda/
+COPY conda-env-ci.txt /opt/conda/
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/utils.sh utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh utils.sh /opt/conda/dev-requirements.txt /opt/conda/requirements.txt /opt/conda/conda-env-ci.txt
+
+USER ci-user
+CMD ["bash"]
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
new file mode 100644
index 00000000..11ff5390
--- /dev/null
+++ b/.github/workflows/docker-builds.yml
@@ -0,0 +1,64 @@
+name: docker-builds
+
+on:
+  workflow_dispatch:
+  pull_request:
+    paths:
+      - .ci/docker/**
+      - .github/workflows/docker-builds.yml
+  push:
+    branches:
+      - main
+      - release/*
+    paths:
+      - .ci/docker/**
+      - .github/workflows/docker-builds.yml
+  schedule:
+    - cron: 1 3 * * 3
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  docker-build:
+    runs-on: [self-hosted, linux.2xlarge]
+    timeout-minutes: 240
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - docker-image-name: torchtitan-ubuntu-20.04-clang12
+    env:
+      DOCKER_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/torchtitan/${{ matrix.docker-image-name }}
+    steps:
+      - name: Clean workspace
+        shell: bash
+        run: |
+          echo "${GITHUB_WORKSPACE}"
+          sudo rm -rf "${GITHUB_WORKSPACE}"
+          mkdir "${GITHUB_WORKSPACE}"
+
+      - name: Setup SSH (Click me for login details)
+        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        with:
+          github-secret: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Checkout the repo
+        uses: actions/checkout@v3
+
+      - name: Setup Linux
+        uses: pytorch/test-infra/.github/actions/setup-linux@main
+
+      - name: Build docker image
+        id: build-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: ${{ matrix.docker-image-name }}
+          always-rebuild: true
+          push: true
+          force-push: true
+
+      - name: Teardown Linux
+        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        if: always()
diff --git a/.github/workflows/unit_test_4gpu.yaml b/.github/workflows/unit_test_4gpu.yaml
index bef4238f..9c1977ca 100644
--- a/.github/workflows/unit_test_4gpu.yaml
+++ b/.github/workflows/unit_test_4gpu.yaml
@@ -18,17 +18,16 @@ jobs:
       gpu-arch-version: "12.1"
       # This image is faster to clone than the default, but it lacks CC needed by triton
       # (1m25s vs 2m37s).
-      docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
-      repository: "pytorch/torchtitan"
-      upload-artifact: "outputs"
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
+      upload-artifact: outputs
       script: |
-        conda install -y -q git clang clangxx
-        export CC=clang
-        export CXX=clangxx
-        pip config --user set global.progress_bar off
-        pip install --upgrade pip
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
         python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
-        python -m pip install -r requirements.txt
-        python -m pip install -r dev-requirements.txt
         mkdir artifacts-to-be-uploaded
         python ./test_runner.py artifacts-to-be-uploaded
diff --git a/.github/workflows/unit_test_cpu.yaml b/.github/workflows/unit_test_cpu.yaml
index ccb706ce..e7380bf1 100644
--- a/.github/workflows/unit_test_cpu.yaml
+++ b/.github/workflows/unit_test_cpu.yaml
@@ -13,10 +13,14 @@ jobs:
   build-test:
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
-      docker-image: "pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime"
-      repository: "pytorch/torchtitan"
+      docker-image: torchtitan-ubuntu-20.04-clang12
+      repository: pytorch/torchtitan
       script: |
-        pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
-        python -m pip install -r requirements.txt
-        python -m pip install -r dev-requirements.txt
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
         pytest test --cov=. --cov-report=xml --durations=20 -vv
diff --git a/dev-requirements.txt b/dev-requirements.txt
deleted file mode 100644
index 1f960b0b..00000000
--- a/dev-requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-pytest
-pytest-cov
-pre-commit
diff --git a/dev-requirements.txt b/dev-requirements.txt
new file mode 120000
index 00000000..d5ba075d
--- /dev/null
+++ b/dev-requirements.txt
@@ -0,0 +1 @@
+.ci/docker/dev-requirements.txt
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index b82120a6..00000000
--- a/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-torch >= 2.2.0.dev
-datasets
-tomli >= 1.1.0 ; python_version < "3.11"
-tensorboard
-sentencepiece
-tiktoken
-blobfile
diff --git a/requirements.txt b/requirements.txt
new file mode 120000
index 00000000..72b541c1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+.ci/docker/requirements.txt
\ No newline at end of file