From f2aedcb1d13e4b2657898b06c47cf4d49dd9dea2 Mon Sep 17 00:00:00 2001 From: Nicholas Sielicki Date: Thu, 15 Aug 2024 09:23:47 -0700 Subject: [PATCH] feat(ci): add package generation stack-info: PR: https://github.com/aws/aws-ofi-nccl/pull/592, branch: aws-nslick/stack/33 Signed-off-by: Nicholas Sielicki --- .docker/cfg/use-cluster.hcl | 0 .docker/cfg/use-github.hcl | 0 .docker/cfg/use-local.hcl | 22 ++ .docker/containers/Dockerfile.cache_efa | 49 +++ .docker/containers/Dockerfile.dnf | 44 +++ .docker/containers/Dockerfile.dpkg | 65 ++++ .../containers/Dockerfile.dpkg_add_cuda_repo | 11 + .docker/containers/Dockerfile.install_efa | 17 + .docker/containers/Dockerfile.makedist | 23 ++ .docker/containers/Dockerfile.srpm | 18 ++ .docker/containers/Dockerfile.yum | 42 +++ .docker/eks/cluster.yml | 32 ++ .docker/eks/nodepools.yml | 88 ++++++ .docker/eks/provisioner.yml | 20 ++ .docker/eks/scale.yml | 27 ++ .dockerignore | 81 +++++ .github/workflows/packages.yaml | 94 ++++++ .gitignore | 1 + .packit.yml | 20 ++ .packit/libnccl-net-ofi.spec | 98 ++++++ .version | 1 + Makefile.am | 1 + configure.ac | 2 +- docker-bake.hcl | 291 ++++++++++++++++++ 24 files changed, 1046 insertions(+), 1 deletion(-) create mode 100644 .docker/cfg/use-cluster.hcl create mode 100644 .docker/cfg/use-github.hcl create mode 100644 .docker/cfg/use-local.hcl create mode 100644 .docker/containers/Dockerfile.cache_efa create mode 100644 .docker/containers/Dockerfile.dnf create mode 100644 .docker/containers/Dockerfile.dpkg create mode 100644 .docker/containers/Dockerfile.dpkg_add_cuda_repo create mode 100644 .docker/containers/Dockerfile.install_efa create mode 100644 .docker/containers/Dockerfile.makedist create mode 100644 .docker/containers/Dockerfile.srpm create mode 100644 .docker/containers/Dockerfile.yum create mode 100644 .docker/eks/cluster.yml create mode 100644 .docker/eks/nodepools.yml create mode 100644 .docker/eks/provisioner.yml create mode 100644 .docker/eks/scale.yml create mode 100644 
.dockerignore create mode 100644 .github/workflows/packages.yaml create mode 100644 .packit.yml create mode 100644 .packit/libnccl-net-ofi.spec create mode 100644 .version create mode 100644 docker-bake.hcl diff --git a/.docker/cfg/use-cluster.hcl b/.docker/cfg/use-cluster.hcl new file mode 100644 index 000000000..e69de29bb diff --git a/.docker/cfg/use-github.hcl b/.docker/cfg/use-github.hcl new file mode 100644 index 000000000..e69de29bb diff --git a/.docker/cfg/use-local.hcl b/.docker/cfg/use-local.hcl new file mode 100644 index 000000000..a52b219fd --- /dev/null +++ b/.docker/cfg/use-local.hcl @@ -0,0 +1,22 @@ +# +# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. +# +# See LICENSE.txt for license information +# +# What is this? +# This overrides defaults for caching, targets, tags, platforms, etc. This +# file is expected to be symlinked into the root of the tree to select the +# builder used by `docker buildx bake'. +# +# This file configures local builds. +# +# See https://docs.docker.com/build/bake/reference/#file-format + +variable "VERSION" { default = "master" } + +target "efainstaller" { + platforms = [ "linux/amd64", "linux/arm64" ] + context = "." + dockerfile = ".docker/containers/Dockerfile.efa" + output = ["type=cacheonly"] +} diff --git a/.docker/containers/Dockerfile.cache_efa b/.docker/containers/Dockerfile.cache_efa new file mode 100644 index 000000000..49642e888 --- /dev/null +++ b/.docker/containers/Dockerfile.cache_efa @@ -0,0 +1,49 @@ +# +# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. +# +# See LICENSE.txt for license information +# + +FROM alpine:latest AS extractor +RUN apk add --no-cache tar + +FROM extractor AS extracted +ARG INSTALLER_PREFIX +ENV INSTALLER_PREFIX=${INSTALLER_PREFIX} +COPY --from=efa_installer_tarball aws-efa-installer*.tar.gz . 
+RUN tar xvf *.tar.gz \
+    --wildcards "aws-efa-installer/*.txt" \
+    --wildcards "aws-efa-installer/*.sh" \
+    --wildcards "aws-efa-installer/**/${INSTALLER_PREFIX}/$(uname -m)" && \
+    rm -rf *.tar.gz
+
+FROM distro_image AS installed
+ARG ENABLE_EFA_INSTALLER_DEBUG_INFO=0
+ARG ENABLE_MPI4=0
+ARG ENABLE_MPI5=0
+
+ENV ENABLE_EFA_INSTALLER_DEBUG_INFO=${ENABLE_EFA_INSTALLER_DEBUG_INFO}
+ENV ENABLE_MPI4=${ENABLE_MPI4}
+ENV ENABLE_MPI5=${ENABLE_MPI5}
+
+RUN mkdir /aws-efa-installer
+COPY --from=extracted /aws-efa-installer /aws-efa-installer
+
+# XXX: the EFA installer does not refresh the package caches itself, so refresh them first.
+# XXX: the EFA installer needs getopt (util-linux), which many containers don't have.
+RUN ((command -v apt-get && apt-get update -y) || /bin/true ) && \
+    ((command -v yum && yum update -y ) || /bin/true ) && \
+    ((command -v dnf && dnf -y update ) || /bin/true ) && \
+    ( ! command -v getopt && ( apt-get install -y util-linux || \
+        dnf -y install util-linux || \
+        yum install -y util-linux ) || /bin/true) && \
+    cd /aws-efa-installer && \
+    ./efa_installer.sh -y -n -l -k -g \
+        $(test "$ENABLE_EFA_INSTALLER_DEBUG_INFO" -eq "1" && echo "-d") \
+        $(test "$ENABLE_MPI4" -eq "1" && echo "--mpi openmpi4") \
+        $(test "$ENABLE_MPI5" -eq "1" && echo "--mpi openmpi5") && \
+    cd && rm -rf /aws-efa-installer && \
+    ((command -v apt-get && apt-get purge -y && apt-get clean -y) || /bin/true ) && \
+    ((command -v dnf && dnf clean all -y) || /bin/true ) && \
+    ((command -v yum && yum clean all -y) || /bin/true ) && \
+    ((command -v zypper && zypper clean ) || /bin/true )
diff --git a/.docker/containers/Dockerfile.dnf b/.docker/containers/Dockerfile.dnf
new file mode 100644
index 000000000..67bd052a4
--- /dev/null
+++ b/.docker/containers/Dockerfile.dnf
@@ -0,0 +1,44 @@
+#
+# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+ARG FAMILY=fedora
+ARG VERSION=rawhide
+ARG VARIANT=cuda
+ARG CUDA_DISTRO
+ARG AWS_BUILD
+ARG ENABLE_POWERTOOLS
+
+# Install EFA-installer deps.
+FROM ${FAMILY}:${VERSION} AS builder
+ARG CUDA_DISTRO
+ARG ENABLE_POWERTOOLS
+ENV CUDA_DISTRO=${CUDA_DISTRO}
+ENV ENABLE_POWERTOOLS=${ENABLE_POWERTOOLS}
+# Add NVIDIA repo for CUDA builds.
+COPY --from=efainstaller / /
+RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
+    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
+    bash -c "cd /aws-efa-installer && dnf install -y gcc rpmdevtools rpmlint dnf-plugins-core util-linux && ./efa_installer.sh -n -l -k -d -y && rm -rf /aws-efa-installer" && \
+    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo && \
+    ( test "${ENABLE_POWERTOOLS}" = "1" && sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/Rocky-PowerTools.repo || /bin/true ) && \
+    dnf -y upgrade
+RUN rpmdev-setuptree
+
+FROM builder AS environment
+ARG VARIANT
+ARG AWS_BUILD
+ARG TOOLKIT_VERSION=12-6
+ENV VARIANT=${VARIANT} AWS_BUILD=${AWS_BUILD} TOOLKIT_VERSION=${TOOLKIT_VERSION}
+COPY --from=srpm . .
+RUN echo "%with_${VARIANT} 1" >> ~/.rpmmacros
+RUN echo "%with_platform_aws ${AWS_BUILD}" >> ~/.rpmmacros
+RUN echo "%_cuda_toolkit_version ${TOOLKIT_VERSION}" >> ~/.rpmmacros
+RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
+    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
+    dnf -y install cuda-cudart-devel-${TOOLKIT_VERSION} && dnf -y builddep *.src.rpm && rpmbuild --rebuild *.src.rpm
+
+FROM scratch
+COPY --from=environment /root/rpmbuild/RPMS/**/* /
diff --git a/.docker/containers/Dockerfile.dpkg b/.docker/containers/Dockerfile.dpkg
new file mode 100644
index 000000000..475cad99e
--- /dev/null
+++ b/.docker/containers/Dockerfile.dpkg
@@ -0,0 +1,65 @@
+#
+# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+ARG FAMILY=ubuntu
+ARG VERSION=latest
+ARG CUDA_DISTRO
+ARG DEBIAN_FRONTEND=noninteractive
+ARG AWS_BUILD
+
+FROM ${FAMILY}:${VERSION} AS build
+ARG CUDA_DISTRO
+ARG AWS_BUILD=0
+ARG DEBIAN_FRONTEND
+ENV CUDA_DISTRO=${CUDA_DISTRO} AWS_BUILD=${AWS_BUILD}
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    apt-get update -y && apt-get install wget -y
+
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-keyring_1.1-1_all.deb
+
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    dpkg -i cuda-keyring_1.1-1_all.deb
+
+COPY --from=efainstaller / .
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    bash -c "apt-get update -y && cd /aws-efa-installer && ./efa_installer.sh -n -l -k -d -y && apt-get install -y autoconf automake libtool gcc g++ git libhwloc-dev make && rm -rf /aws-efa-installer"
+
+COPY --from=makedist / .
+RUN tar xvf ./aws-ofi-nccl*.tar.gz -C .
+RUN cd aws-ofi-nccl*/ && \
+    ./configure --$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes") \
+        --prefix=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \
+        --with-libfabric=/opt/amazon/efa \
+        --disable-tests \
+        --$(test "$AWS_BUILD" -eq 0 && echo -n "disable" || echo -n "enable")-platform-aws \
+        --with-mpi=no && make -j"$(nproc)" && make install
+
+FROM ubuntu:latest AS packager
+ARG FAMILY
+ARG VERSION
+ARG AWS_BUILD=0
+ARG DEBIAN_FRONTEND
+ENV AWS_BUILD=${AWS_BUILD} FAMILY=${FAMILY} VERSION=${VERSION}
+COPY --from=build /opt/amazon/ /opt/amazon/
+RUN find /opt/amazon/ -name '*.la' -delete
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    apt-get update -y && \
+    apt-get install -y ruby tar squashfs-tools binutils && gem install fpm
+RUN fpm \
+    -s dir -t deb \
+    --license Apache2.0 \
+    -p /libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")-${FAMILY}-${VERSION}.deb \
+    --name nccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \
+    /opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")/=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")
+
+FROM scratch
+COPY --from=packager /libnccl-net-ofi* /
+
diff --git a/.docker/containers/Dockerfile.dpkg_add_cuda_repo b/.docker/containers/Dockerfile.dpkg_add_cuda_repo
new file mode 100644
index 000000000..474aab5d2
--- /dev/null
+++ b/.docker/containers/Dockerfile.dpkg_add_cuda_repo
@@ -0,0 +1,11 @@
+FROM alpine:latest AS downloader
+ARG CUDA_DISTRO
+RUN wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-keyring_1.1-1_all.deb
+
+FROM base_image
+ARG CUDA_TOOLKIT_VERSION_SUFFIX
+ENV CUDA_TOOLKIT_VERSION_SUFFIX=${CUDA_TOOLKIT_VERSION_SUFFIX}
+COPY --from=downloader cuda-keyring*.deb .
+RUN apt-get update -y && apt-get install -y ca-certificates && dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring*.deb && \
+    apt-get update -y && apt-get install -y cuda-cudart-dev-${CUDA_TOOLKIT_VERSION_SUFFIX} && \
+    apt-get purge -y && apt-get clean -y
diff --git a/.docker/containers/Dockerfile.install_efa b/.docker/containers/Dockerfile.install_efa
new file mode 100644
index 000000000..668e5894c
--- /dev/null
+++ b/.docker/containers/Dockerfile.install_efa
@@ -0,0 +1,17 @@
+FROM distro_image
+RUN mkdir /aws-efa-installer
+COPY --from=efa_installer_contents /aws-efa-installer /aws-efa-installer
+
+# XXX: EFA installer doesn't refresh the package caches if they're unpopulated,
+# as they always are in container images, so refresh them before installing anything.
+#
+# XXX: EFA installer needs getopt (util-linux), which many containers don't have.
+RUN (command -v apt-get && apt-get update -y || /bin/true ) && \
+    (! command -v yum || yum update -y ) && \
+    (command -v getopt || apt-get install -y util-linux 2>/dev/null || \
+        dnf -y install util-linux 2>/dev/null || yum -y install util-linux 2>/dev/null) && \
+    cd /aws-efa-installer && \
+    ./efa_installer.sh -d -y -n -l -k -g --mpi openmpi4,openmpi5 && \
+    cd && rm -rf /aws-efa-installer && (command -v apt-get && apt-get purge -y && apt-get clean -y || /bin/true ) && \
+    (command -v dnf && dnf clean all -y || /bin/true ) && \
+    (command -v yum && yum clean all -y || /bin/true )
diff --git a/.docker/containers/Dockerfile.makedist b/.docker/containers/Dockerfile.makedist
new file mode 100644
index 000000000..2f979177b
--- /dev/null
+++ b/.docker/containers/Dockerfile.makedist
@@ -0,0 +1,23 @@
+#
+# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+FROM base_image AS buildenv
+RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
+    --mount=type=cache,target=/var/lib/apt,sharing=locked \
+    apt-get update -y && apt-get install -y automake autoconf libtool libhwloc-dev
+
+FROM buildenv AS distbuilder
+ARG ACCELERATOR
+ENV ACCELERATOR=${ACCELERATOR}
+COPY . /proj
+WORKDIR /proj
+RUN autoreconf -ivf && \
+    ./configure --with-libfabric=/opt/amazon/efa \
+        --$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes") && \
+    make -j dist
+
+FROM scratch
+COPY --from=distbuilder /proj/aws-ofi-nccl*.tar.gz /
diff --git a/.docker/containers/Dockerfile.srpm b/.docker/containers/Dockerfile.srpm
new file mode 100644
index 000000000..09e1138b5
--- /dev/null
+++ b/.docker/containers/Dockerfile.srpm
@@ -0,0 +1,18 @@
+#
+# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+FROM fedora:rawhide AS packitimg
+RUN dnf install -y packit mock && dnf clean all
+
+FROM packitimg AS srpm
+RUN mkdir /proj
+WORKDIR /proj
+COPY --from=src . .
+COPY --from=makedist . .
+RUN packit srpm
+
+FROM scratch
+COPY --from=srpm /proj/*.src.rpm /
diff --git a/.docker/containers/Dockerfile.yum b/.docker/containers/Dockerfile.yum
new file mode 100644
index 000000000..4d02d7580
--- /dev/null
+++ b/.docker/containers/Dockerfile.yum
@@ -0,0 +1,42 @@
+#
+# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# See LICENSE.txt for license information
+#
+
+ARG FAMILY=amazonlinux
+ARG VERSION=2
+ARG VARIANT=cuda
+ARG CUDA_DISTRO
+ARG AWS_BUILD
+
+# Install EFA-installer deps.
+FROM ${FAMILY}:${VERSION} AS builder
+ARG CUDA_DISTRO
+ENV CUDA_DISTRO=${CUDA_DISTRO}
+# Add NVIDIA repo for CUDA builds.
+COPY --from=efainstaller / / +RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \ + --mount=type=cache,target=/var/cache/dnf,sharing=locked \ + bash -c "cd /aws-efa-installer && yum install -y gcc rpmdevtools rpmlint yum-utils util-linux && ./efa_installer.sh -n -l -k -d -y && rm -rf /aws-efa-installer" && \ + yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo && \ + yum update -y +RUN rpmdev-setuptree + +FROM builder AS environment +ARG VARIANT +ARG AWS_BUILD +ARG TOOLKIT_VERSION=12-6 +ENV VARIANT=${VARIANT} +ENV AWS_BUILD=${AWS_BUILD} +ENV TOOLKIT_VERSION=${TOOLKIT_VERSION} +COPY --from=srpm . . +RUN echo "%with_${VARIANT} 1" >> ~/.rpmmacros +RUN echo "%with_platform_aws ${AWS_BUILD}" >> ~/.rpmmacros +RUN echo "%_cuda_toolkit_version ${TOOLKIT_VERSION}" >> ~/.rpmmacros +RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \ + --mount=type=cache,target=/var/cache/dnf,sharing=locked \ + yum install -y cuda-cudart-devel-${TOOLKIT_VERSION} && yum-builddep -y *.src.rpm && rpmbuild --rebuild *.src.rpm + +FROM scratch +COPY --from=environment /root/rpmbuild/RPMS/**/* / diff --git a/.docker/eks/cluster.yml b/.docker/eks/cluster.yml new file mode 100644 index 000000000..32393c012 --- /dev/null +++ b/.docker/eks/cluster.yml @@ -0,0 +1,32 @@ +--- +apiVersion: eksctl.io/v1alpha5 +kind: ClusterConfig +metadata: + name: ect-build + region: us-west-1 + version: "1.30" + tags: + karpenter.sh/discovery: ect-build +karpenter: + version: '1.0.2' + withSpotInterruptionQueue: true + createServiceAccount: true +iam: + withOIDC: true +availabilityZones: +- us-west-1b +- us-west-1c +managedNodeGroups: +- name: mgmt-ng-1 + amiFamily: Bottlerocket + desiredCapacity: 1 + minSize: 1 + maxSize: 2 + labels: { role: management } + tags: + nodegroup-role: management + instanceSelector: + cpuArchitecture: arm64 + vCPUs: 8 +addons: +- name: eks-pod-identity-agent diff --git 
a/.docker/eks/nodepools.yml b/.docker/eks/nodepools.yml
new file mode 100644
index 000000000..10e3a4ccf
--- /dev/null
+++ b/.docker/eks/nodepools.yml
@@ -0,0 +1,88 @@
+---
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: pool-amd64
+  annotations:
+    kubernetes.io/description: "Build nodepool for amd64 workloads"
+spec:
+  disruption:
+    consolidateAfter: 1m0s
+    consolidationPolicy: WhenEmptyOrUnderutilized
+    #expireAfter: 8h
+  limits:
+    cpu: 2048
+  template:
+    metadata: {}
+    spec:
+      requirements:
+        - key: kubernetes.io/arch
+          operator: In
+          values: ["amd64"]
+        - key: kubernetes.io/os
+          operator: In
+          values: ["linux"]
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values: ["spot"]
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: bottlerocket
+---
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: pool-arm64
+  annotations:
+    kubernetes.io/description: "Build nodepool for arm64 workloads"
+spec:
+  disruption:
+    consolidateAfter: 1m0s
+    consolidationPolicy: WhenEmptyOrUnderutilized
+    #expireAfter: 8h
+  limits:
+    cpu: 2048
+  template:
+    metadata: {}
+    spec:
+      requirements:
+        - key: kubernetes.io/arch
+          operator: In
+          values: ["arm64"]
+        - key: kubernetes.io/os
+          operator: In
+          values: ["linux"]
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values: ["spot"]
+      nodeClassRef:
+        group: karpenter.k8s.aws
+        kind: EC2NodeClass
+        name: bottlerocket
+---
+apiVersion: karpenter.k8s.aws/v1
+kind: EC2NodeClass
+metadata:
+  name: bottlerocket
+spec:
+  role: "KarpenterNodeRole-ect-build"
+  subnetSelectorTerms:
+    - tags:
+        karpenter.sh/discovery: "ect-build"
+  securityGroupSelectorTerms:
+    - tags:
+        karpenter.sh/discovery: "ect-build"
+  amiSelectorTerms:
+    - alias: bottlerocket@latest
+  blockDeviceMappings:
+    - deviceName: /dev/xvda
+      ebs:
+        volumeType: gp3
+        volumeSize: 10Gi
+        deleteOnTermination: true
+    - deviceName: /dev/xvdb
+      ebs:
+        volumeType: gp3
+        volumeSize: 40Gi
+        deleteOnTermination: true
diff --git 
a/.docker/eks/provisioner.yml b/.docker/eks/provisioner.yml new file mode 100644 index 000000000..9492e705d --- /dev/null +++ b/.docker/eks/provisioner.yml @@ -0,0 +1,20 @@ +--- +apiVersion: karpenter.sh/v1 +kind: Provisioner +metadata: + name: management +spec: + requirements: + - key: karpenter.sh/capacity-type + operator: In + values: ["on-demand"] + limits: + resources: + cpu: 1000 + provider: + instanceProfile: eksctl-KarpenterNodeInstanceProfile-ect-build + subnetSelector: + karpenter.sh/discovery: ect-build + securityGroupSelector: + karpenter.sh/discovery: ect-build + ttlSecondsAfterEmpty: 30 diff --git a/.docker/eks/scale.yml b/.docker/eks/scale.yml new file mode 100644 index 000000000..2b7ba3c62 --- /dev/null +++ b/.docker/eks/scale.yml @@ -0,0 +1,27 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: inflate +spec: + replicas: 0 + selector: + matchLabels: + app: inflate + template: + metadata: + labels: + app: inflate + spec: + terminationGracePeriodSeconds: 0 + securityContext: + runAsUser: 1000 + runAsGroup: 3000 + fsGroup: 2000 + containers: + - name: inflate + image: public.ecr.aws/eks-distro/kubernetes/pause:3.7 + resources: + requests: + cpu: 1 + securityContext: + allowPrivilegeEscalation: false diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..f9a9bc2ca --- /dev/null +++ b/.dockerignore @@ -0,0 +1,81 @@ +*~ + +src/*.o +src/tuner/*.o +src/*.lo +src/*.la +libnccl-net.so +tags + +src/tuner/.dirstamp +src/tuner/*.lo + +tests/functional/*.o +tests/unit/*.o +tests/unit/*.log +tests/unit/*.trs +tests/functional/nccl_connection +tests/functional/nccl_message_transfer +tests/functional/ring +tests/functional/cuda_check +tests/unit/msgbuff +tests/unit/freelist +tests/unit/deque +tests/unit/scheduler +tests/unit/idpool +tests/unit/show_tuner_decisions +tests/unit/show_tuner_costs +tests/unit/ep_addr_list +tests/unit/mr + +# http://www.gnu.org/software/automake +.deps/ +Makefile.in +Makefile +/ar-lib +/mdate-sh 
+/py-compile +/test-driver +/ylwrap + +# http://www.gnu.org/software/autoconf +build-aux/ +autom4te.cache +/autoscan.log +/autoscan-*.log +/aclocal.m4 +/compile +/config.guess +/config.log +/config.status +/config.sub +/config.cache +/configure +/configure.scan +/depcomp +/install-sh +/missing +/stamp-h1 +/include/stamp-h1 +/include/config.h +/include/config.h.in + +# https://www.gnu.org/software/libtool/ +/ltmain.sh +.libs/ +libtool + +# http://www.gnu.org/software/m4/ +m4/libtool.m4 +m4/ltoptions.m4 +m4/ltsugar.m4 +m4/ltversion.m4 +m4/lt~obsolete.m4 + +# other +.idea/ +.devenv/ +.direnv +*.src.rpm +dockerbld +result diff --git a/.github/workflows/packages.yaml b/.github/workflows/packages.yaml new file mode 100644 index 000000000..1a6eb00fb --- /dev/null +++ b/.github/workflows/packages.yaml @@ -0,0 +1,94 @@ +name: Package Generation +on: + workflow_dispatch: + push: + branches: + - master + - main + - v* + pull_request: + +jobs: + dist: + name: Call make dist + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + platforms: linux/amd64,linux/arm64 + - name: docker buildx bake makedist + uses: docker/bake-action@v5 + with: + set: | + *.cache-from=type=gha + *.cache-to=type=gha,mode=max + push: true + targets: makedist + srpm: + name: Generate a universal SRPM + needs: [ dist ] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + platforms: linux/amd64,linux/arm64 + - name: docker buildx bake srpm + uses: docker/bake-action@v5 + with: + set: | + *.cache-from=type=gha + *.cache-to=type=gha,mode=max + push: true + targets: srpm + debs: + name: Generate Debian-like Packages + needs: [ dist ] + runs-on: ubuntu-latest + steps: + - name: Checkout + 
uses: actions/checkout@v4 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + platforms: linux/amd64,linux/arm64 + - name: docker buildx bake debs + uses: docker/bake-action@v5 + with: + set: | + *.cache-from=type=gha + *.cache-to=type=gha,mode=max + push: ${{ github.event_name != 'pull_request' }} + targets: debs + rpms: + name: Generate RPM-like Packages + needs: [ srpm ] + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + platforms: linux/amd64,linux/arm64 + - name: docker buildx bake rpms + uses: docker/bake-action@v5 + with: + set: | + *.cache-from=type=gha + *.cache-to=type=gha,mode=max + push: ${{ github.event_name != 'pull_request' }} + targets: rpms diff --git a/.gitignore b/.gitignore index df184fc99..0fe83c9a2 100644 --- a/.gitignore +++ b/.gitignore @@ -77,3 +77,4 @@ m4/lt~obsolete.m4 .devenv/ .direnv result +dockerbld diff --git a/.packit.yml b/.packit.yml new file mode 100644 index 000000000..823daacb1 --- /dev/null +++ b/.packit.yml @@ -0,0 +1,20 @@ +--- +# vi:ts=2 sw=2 et: +# +# Docs: https://packit.dev/docs/ + + +srpm_build_deps: + - git + +actions: + get-current-version: + - bash -c "cat .version" + create-archive: + - bash -c "echo ./aws-ofi-nccl-${PACKIT_PROJECT_VERSION}.tar.gz" + +specfile_path: .packit/libnccl-net-ofi.spec +upstream_package_name: libnccl-net-ofi +downstream_package_name: libnccl-net-ofi +release_suffix: "{PACKIT_PROJECT_BRANCH}" +update_release: false diff --git a/.packit/libnccl-net-ofi.spec b/.packit/libnccl-net-ofi.spec new file mode 100644 index 000000000..de412133a --- /dev/null +++ b/.packit/libnccl-net-ofi.spec @@ -0,0 +1,98 @@ +# Whether to build with cuda support. 
Default: on if neuron +%if "%{with_cuda}" == "1" && "%{with_neuron}" == "1" +%{error:Neuron and CUDA must not be enabled together} +%endif + +%if "%{with_cuda}" == "0" && "%{with_neuron}" == "0" +%{error:One of Neuron or CUDA must be enabled} +%endif + +%if "%{with_cuda}" == "1" +%{!?target: %global target nccl} +%endif +%if "%{with_neuron}" == "1" +%{!?target: %global target nccom} +%endif + +%global pname_base lib%{!?with_neuron:nccl}%{?with_neuron:nccom}-net-ofi +%global pname %{pname_base}%{?with_platform_aws:-aws} + +%if "%{with_platform_aws}" +%global _prefix /opt/amazon/%{pname_base} +%endif + +# (CUDA only) what toolkit package to declare a build dependency on. Default: 12-6 +%{!?_cuda_toolkit_version: %global _cuda_toolkit_version 12-6} + +Name: %{pname} +Version: null +Release: 0%{dist} +Summary: NCCL + libfabric compatibility layer +License: Apache-2.0 +URL: https://github.com/aws/aws-ofi-nccl +Source0: null +%if "%{_vendor}" == "debbuild" +Group: devel +%else +Group: Development/Tools%{?suse_version:/Building} +BuildRequires: hwloc-devel +BuildRequires: make +BuildRequires: gcc +BuildRequires: gcc-c++ +%if "%{with_platform_aws}" +BuildRequires: libfabric-aws-devel +Requires: libfabric-aws +%else +BuildRequires: libfabric1-devel +Requires: libfabric +%endif +%if "%{with_cuda}" == "1" +BuildRequires: cuda-cudart-devel-%{_cuda_toolkit_version} +%endif +%endif +Requires: hwloc + +%description +This is a plugin which lets EC2 developers use libfabric as network provider +while running NCCL applications. 
+ + +%prep +%setup +%build +%configure \ + --prefix="%{_prefix}" \ + --disable-tests \ + --with-mpi=no \ +%if "%{with_cuda}" == "1" + --with-cuda=/usr/local/cuda-12 \ + --enable-neuron=no \ +%else + --with-cuda=no \ + --enable-neuron=yes \ +%endif +%if "%{with_platform_aws}" == "1" + --enable-platform-aws \ + --with-libfabric=/opt/amazon/efa +%else + --disable-platform-aws +%endif +%make_build + + +%install +%make_install +find %{buildroot} -name '*.la' -exec rm -f {} ';' +%ldconfig_scriptlets + + +%files +%{_libdir}/*.so +%{_datadir}/aws-ofi-nccl/xml/*.xml +%license LICENSE NOTICE +%doc + + +%changelog +* Thu Aug 08 2024 Nicholas Sielicki +Initial Package diff --git a/.version b/.version new file mode 100644 index 000000000..f32954fbd --- /dev/null +++ b/.version @@ -0,0 +1 @@ +1.12.0pre diff --git a/Makefile.am b/Makefile.am index 5b57c7216..d5278ce22 100644 --- a/Makefile.am +++ b/Makefile.am @@ -7,6 +7,7 @@ ACLOCAL_AMFLAGS = -I m4 SUBDIRS = include 3rd-party src topology tests EXTRA_DIST = \ + .version \ autogen.sh \ CODE_OF_CONDUCT.md \ CONTRIBUTING.md \ diff --git a/configure.ac b/configure.ac index d40840404..c535140be 100644 --- a/configure.ac +++ b/configure.ac @@ -6,7 +6,7 @@ # # Initialization -AC_INIT([aws-ofi-nccl], [GitHub-dev], [al-ofi-nccl-team@amazon.com], , [http://github.com/aws/aws-ofi-nccl]) +AC_INIT([aws-ofi-nccl], m4_normalize(m4_include([.version])), [al-ofi-nccl-team@amazon.com], , [http://github.com/aws/aws-ofi-nccl]) AC_PREREQ([2.69]) AC_CONFIG_SRCDIR([src/nccl_ofi_net.c]) AC_CONFIG_AUX_DIR([build-aux]) diff --git a/docker-bake.hcl b/docker-bake.hcl new file mode 100644 index 000000000..cf023fe79 --- /dev/null +++ b/docker-bake.hcl @@ -0,0 +1,291 @@ +# +# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved. 
+# +# See LICENSE.txt for license information +# +# +# Usage: https://docs.docker.com/reference/cli/docker/buildx/bake/ + +# Notes: +# * arm64 builds will use qemu by default, but requires containerd snapshotting +# to be enabled in docker's daemon.json, or explicit creation of an arm64 +# capable context. +# +# * developers should strongly consider standing up an eks cluster and +# configuring a k8s builder for native arm64 builds: +# https://docs.docker.com/build/builders/drivers/kubernetes/ + +group "default" { targets = [ "rpms", "debs" ] } + +variable "base_images" { + type = list(string) + default = [ + "amazonlinux:2", + "amazonlinux:2023", + "rockylinux:8", + "rockylinux:9", + "opensuse/leap:15", + # Intentionaly not included + # "centos:centos7" + # Debian + "debian:10", + "debian:11", + # "debian:12" # not supported by EFA installer. + "ubuntu:20.04", + "ubuntu:22.04", + "ubuntu:24.04", + ] +} + + +function "efa_installer_dir_name" { + params = [base_image] + result = "${replace(replace(replace(replace(upper(replace(replace(base_image, ".", ""), ":", "")), "MAZON", ""), "/", ""), "OPEN", ""), "LEAP15", "")}" +} + +function "plaintext_image_name" { + params = [base_image] + result = "${replace(replace(replace(base_image, "/", "_"), ":", ""), ".", "")}" +} + +function "baseimage_to_cuda_repo_name" { + params = [base_image] + result = "${replace(replace(base_image, ":", ""), ".", "")}" +} + +# Caches efa installer packages, without actually installing them. +target "efa_installer_base_images" { + name = "${plaintext_image_name(base_image)}_base_efa-installer-${replace(efa_installer_version, ".", "-")}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? "-mpi5" : ""}" + tags = [ "982534352369.dkr.ecr.us-west-1.amazonaws.com/common/efa_installer/${plaintext_image_name(base_image)}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? 
"-mpi5" : ""}:${efa_installer_version}" ]
+  matrix = {
+    base_image = base_images,
+    # "amazonlinux:2",
+    # "amazonlinux:2023",
+    # "rockylinux:8",
+    # "rockylinux:9",
+    # "opensuse/leap:15",
+    # # Intentionally not included
+    # # "centos:centos7"
+    # # Debian
+    # "debian:10",
+    # "debian:11",
+    # # "debian:12" # not supported by EFA installer.
+    # "ubuntu:20.04",
+    # "ubuntu:22.04",
+    # "ubuntu:24.04",
+    #]
+    efa_installer_version = [
+      "1.34.0",
+      "1.33.0",
+    ]
+    item = [
+      { mpi4_enabled = 1, mpi5_enabled = 0 },
+      { mpi4_enabled = 0, mpi5_enabled = 1 },
+      { mpi4_enabled = 0, mpi5_enabled = 0 },
+    ]
+    debug_enabled = [ 0, 1 ]
+  }
+  contexts = {
+    efa_installer_tarball = "https://efa-installer.amazonaws.com/aws-efa-installer-${efa_installer_version}.tar.gz"
+    distro_image = "docker-image://${base_image}"
+  }
+  platforms = [ "linux/amd64", "linux/arm64" ]
+  args = {
+    INSTALLER_PREFIX = "${efa_installer_dir_name(base_image)}"
+    ENABLE_EFA_INSTALLER_DEBUG_INFO = debug_enabled
+    ENABLE_MPI4 = item.mpi4_enabled,
+    ENABLE_MPI5 = item.mpi5_enabled,
+  }
+  dockerfile = ".docker/containers/Dockerfile.cache_efa"
+  #output = ["type=image,push=true"]
+}
+
+target "cuda_enabled_build_images" {
+  name = "${plaintext_image_name(base_image)}_efa-installer-${replace(efa_installer_version, ".", "-")}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? "-mpi5" : ""}"
+  tags = [ "982534352369.dkr.ecr.us-west-1.amazonaws.com/common/efa_installer/${plaintext_image_name(base_image)}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? 
"-mpi5" : ""}:${efa_installer_version}-cuda${cuda_version}" ] + dockerfile = ".docker/containers/Dockerfile.dpkg_add_cuda_repo" + output = ["type=cacheonly"] + args = { CUDA_DISTRO = "${baseimage_to_cuda_repo_name(base_image)}", CUDA_TOOLKIT_VERSION_SUFFIX = "${replace(CUDA_VERSION, ".", "-")}" } + contexts = { + base_image = "target:${plaintext_image_name(base_image)}_base_efa-installer-${replace(efa_installer_version, ".", "-")}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? "-mpi5" : ""}" + } + matrix = { + base_image = [ + "amazonlinux:2", + "amazonlinux:2023", + "rockylinux:8", + "rockylinux:9", + "opensuse/leap:15", + # Intentionaly not included + # "centos:centos7" + "debian:10", + "debian:11", + #"debian:12", + "ubuntu:20.04", + "ubuntu:22.04", + "ubuntu:24.04", + ] + efa_installer_version = [ "1.34.0", "1.33.0" ] + cuda_version = [ + "12.6", + "12.5", + "12.4", + "12.3", + "12.2", + "12.1", + "12.0", + "11.8", + "11.7", + ] + } +} + +# Generate a `make dist` tarball. Note that this requires ./configure to be +# called, and that the contents of this "dist tarball" may differ depending on +# the configuration options passed. Requires dependencies to be installed as +# ./configure aborts if they cannot resolve. +#target "makedist" { +# name = "makedist-${item.accelerator}" +# matrix = { +# item = [ +# { accelerator = "neuron", base_image = "target:ubuntu2204_efa-installer-${replace(, ".", "-")}" }, +# { accelerator = "cuda", base_image = "target:ubuntu2204_efa-installer-${replace(EFA_INSTALLER_VERSION, ".", "-")}" }, +# ] +# } +# contexts = { src = ".", base_image = "${item.base_image}" } +# args = { ACCELERATOR = item.accelerator } +# dockerfile = ".docker/containers/Dockerfile.makedist" +# output = ["type=local,dest=dockerbld/tarball"] +#} + +# # Generate a universal srpm using packit. 
+# target "srpm" { +# contexts = { src = ".", makedist = "target:makedist-neuron" } +# dockerfile = ".docker/containers/Dockerfile.srpm" +# output = ["type=local,dest=dockerbld/srpm"] +# } +# +# # Generate RPMs from the srpm above. +# target "rpms" { +# name = "pkg${item.aws == "1" ? "-aws" : ""}-${replace(item.family, "/", "_")}-${replace(item.version, ".", "_")}" +# matrix = { +# item = [ +# { +# family = "amazonlinux", +# package_frontend = "dnf", +# version = "2023", +# efa = "latest", +# cuda_distro = "amzn2023", +# toolkit_version = "12-6", +# accelerator = "cuda", +# enable_powertools = "0", +# aws = "1" +# }, +# { +# family = "amazonlinux", +# package_frontend = "yum", +# version = "2", +# efa = "latest", +# cuda_distro = "rhel7", +# toolkit_version = "12-3", +# accelerator = "cuda", +# enable_powertools = "0", +# aws = "1" +# }, +# { +# family = "rockylinux", +# package_frontend = "dnf", +# version = "8", +# efa = "latest", +# cuda_distro = "rhel8", +# toolkit_version = "12-6", +# accelerator = "cuda", +# enable_powertools = "1", +# aws = "1" +# }, +# { +# family = "rockylinux", +# package_frontend = "dnf", +# version = "9", +# efa = "latest", +# cuda_distro = "rhel9", +# toolkit_version = "12-6", +# accelerator = "cuda", +# enable_powertools = "0", +# aws = "1" +# }, +# ] +# } +# contexts = { +# efainstaller = "target:efainstaller" +# srpm = "target:srpm" +# } +# dockerfile = ".docker/containers/Dockerfile.${item.package_frontend}" +# output = ["type=local,dest=dockerbld/pkgs"] +# args = { +# FAMILY = item.family, +# VERSION = item.version +# EFA_INSTALLER_VERSION = item.efa +# CUDA_DISTRO = item.cuda_distro +# VARIANT = item.accelerator +# AWS_BUILD = item.aws +# TOOLKIT_VERSION = item.toolkit_version +# ENABLE_POWERTOOLS = item.enable_powertools +# } +# } +# +# # Build and package for debian-like distributions by building and invoking fpm. +# target "debs" { +# name = "pkg-${item.accelerator}${item.aws == "1" ? 
"-aws" : ""}-${replace(item.family, "/", "_")}-${replace(item.version, ".", "_")}" +# matrix = { +# item = [ +# { accelerator = "cuda", aws = "1", family = "debian", version = "oldstable", cuda_distro = "debian11" }, +# # XXX: EFA Installer lacks support. +# #{ accelerator = "cuda", aws = "1", platform = "amd64", family = "debian", version = "stable", cuda_distro = "debian11" }, +# { accelerator = "cuda", aws = "1", family = "ubuntu", version = "20.04", cuda_distro = "ubuntu2004" }, +# { accelerator = "cuda", aws = "1", family = "ubuntu", version = "22.04", cuda_distro = "ubuntu2204" }, +# { accelerator = "cuda", aws = "1", family = "ubuntu", version = "24.04", cuda_distro = "ubuntu2404" }, +# { accelerator = "cuda", aws = "0", family = "debian", version = "oldstable", cuda_distro = "debian11" }, +# # XXX: EFA Installer lacks support. +# #{ accelerator = "cuda", aws = "0", family = "debian", version = "stable", cuda_distro = "debian11" }, +# { accelerator = "cuda", aws = "0", family = "ubuntu", version = "20.04", cuda_distro = "ubuntu2004" }, +# { accelerator = "cuda", aws = "0", family = "ubuntu", version = "22.04", cuda_distro = "ubuntu2204" }, +# { accelerator = "cuda", aws = "0", family = "ubuntu", version = "24.04", cuda_distro = "ubuntu2404" }, +# +# # XXX: todo +# # { accelerator = "neuron", aws = "1", platform = "amd64", family = "debian", version = "oldstable", cuda_distro = "debian11" }, +# # #{ accelerator = "neuron", aws = "1", platform = "amd64", family = "debian", version = "stable", cuda_distro = "debian11" }, +# # { accelerator = "neuron", aws = "1", platform = "amd64", family = "ubuntu", version = "20.04", cuda_distro = "ubuntu2004" }, +# # { accelerator = "neuron", aws = "1", platform = "amd64", family = "ubuntu", version = "22.04", cuda_distro = "ubuntu2204" }, +# # { accelerator = "neuron", aws = "1", platform = "amd64", family = "ubuntu", version = "24.04", cuda_distro = "ubuntu2404" }, +# +# # { accelerator = "neuron", aws = "0", platform = 
"amd64", family = "debian", version = "oldstable", cuda_distro = "debian11" }, +# # #{ accelerator = "neuron", aws = "0", platform = "amd64", family = "debian", version = "stable", cuda_distro = "debian11" }, +# # { accelerator = "neuron", aws = "0", platform = "amd64", family = "ubuntu", version = "20.04", cuda_distro = "ubuntu2004" }, +# # { accelerator = "neuron", aws = "0", platform = "amd64", family = "ubuntu", version = "22.04", cuda_distro = "ubuntu2204" }, +# # { accelerator = "neuron", aws = "0", platform = "amd64", family = "ubuntu", version = "24.04", cuda_distro = "ubuntu2404" }, +# ] +# } +# contexts = { +# efainstaller = "target:efainstaller" +# makedist = "target:makedist-${item.accelerator}" +# } +# dockerfile = ".docker/containers/Dockerfile.dpkg" +# output = ["type=local,dest=dockerbld/pkgs"] +# args = { +# FAMILY = item.family, +# VERSION = item.version +# CUDA_DISTRO = item.cuda_distro +# AWS_BUILD = item.aws +# } +# } +# +# target "nccl_tests" { +# matrix = { +# +# } +# contexts = { +# nccl_tests = "https://github.com/NVIDIA/nccl-tests.git" +# } +# } +#