From b6d12d88c9efa8c60df2e7b7178102dcda3210d4 Mon Sep 17 00:00:00 2001 From: atalman Date: Wed, 25 Sep 2024 20:39:56 +0000 Subject: [PATCH] Use amazon linux 2023 runners for Docker builds (#136544) Migrate these builds to Amazon Linux 2023. We want to build and test the Docker images in CD. Looks like we are hitting this issue: https://github.com/docker/buildx/issues/379 when trying to build Docker images on Amazon Linux 2023. The Conda Docker build is timing out, while the Manywheel build executes but fails because BUILDKIT is turned off: https://github.com/pytorch/pytorch/actions/runs/11036043157/job/30653543264?pr=136544 The proposed solution is to fix it in user_data. Please see: https://github.com/pytorch/test-infra/issues/5712 The Docker builds execute successfully here: https://github.com/pytorch/pytorch/actions/runs/11040149229/job/30667448668?pr=136544 Work around the timeout problem (reported in https://bugzilla.redhat.com/show_bug.cgi?id=1537564) by setting the limit on the number of open files per container to 1048576. Pull Request resolved: https://github.com/pytorch/pytorch/pull/136544 Approved by: https://github.com/ZainRizvi Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com> --- .ci/docker/conda/build.sh | 6 ++++++ .ci/docker/manywheel/Dockerfile | 1 + .ci/docker/manywheel/build.sh | 9 ++++++++- .github/workflows/build-conda-images.yml | 2 +- .github/workflows/build-manywheel-images.yml | 6 +++--- 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/.ci/docker/conda/build.sh b/.ci/docker/conda/build.sh index 6e8a1c37ff9fb9..b613e195ade0d9 100755 --- a/.ci/docker/conda/build.sh +++ b/.ci/docker/conda/build.sh @@ -37,6 +37,12 @@ esac ( set -x + # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 + # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. 
+ sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service + sudo systemctl daemon-reload + sudo systemctl restart docker + docker build \ --target final \ --progress plain \ diff --git a/.ci/docker/manywheel/Dockerfile b/.ci/docker/manywheel/Dockerfile index 39b5d04b4d20ad..a4fb127d178e2a 100644 --- a/.ci/docker/manywheel/Dockerfile +++ b/.ci/docker/manywheel/Dockerfile @@ -10,6 +10,7 @@ ENV LANG en_US.UTF-8 ENV LANGUAGE en_US.UTF-8 ARG DEVTOOLSET_VERSION=9 + # Note: This is required patch since CentOS have reached EOL # otherwise any yum install setp will fail RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo diff --git a/.ci/docker/manywheel/build.sh b/.ci/docker/manywheel/build.sh index 0cfb88ef72fb6a..2ea02c4eb1d005 100755 --- a/.ci/docker/manywheel/build.sh +++ b/.ci/docker/manywheel/build.sh @@ -124,7 +124,14 @@ if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then fi ( set -x - DOCKER_BUILDKIT=1 docker build \ + + # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 + # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. 
+ sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service + sudo systemctl daemon-reload + sudo systemctl restart docker + + DOCKER_BUILDKIT=1 docker build \ ${DOCKER_GPU_BUILD_ARG} \ --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ --target "${TARGET}" \ diff --git a/.github/workflows/build-conda-images.yml b/.github/workflows/build-conda-images.yml index 4962276321cc6f..4d2f146a7577d3 100644 --- a/.github/workflows/build-conda-images.yml +++ b/.github/workflows/build-conda-images.yml @@ -32,7 +32,7 @@ concurrency: jobs: build-docker: environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} - runs-on: am2.linux.9xlarge.ephemeral + runs-on: linux.9xlarge.ephemeral strategy: matrix: cuda_version: ["11.8", "12.1", "12.4", "cpu"] diff --git a/.github/workflows/build-manywheel-images.yml b/.github/workflows/build-manywheel-images.yml index 750ee99d52e38d..7ecf278c585759 100644 --- a/.github/workflows/build-manywheel-images.yml +++ b/.github/workflows/build-manywheel-images.yml @@ -45,7 +45,7 @@ jobs: build-docker-cuda: environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} needs: get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral" + runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral" strategy: matrix: cuda_version: ["12.4", "12.1", "11.8"] @@ -156,7 +156,7 @@ jobs: build-docker-rocm: environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} needs: get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral" + runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral" strategy: matrix: rocm_version: ["6.1", "6.2"] @@ -192,7 +192,7 @@ jobs: build-docker-cpu: environment: ${{ (github.ref == 
'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} needs: get-label-type - runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral" + runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral" steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main