Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(ci): add package generation
Browse files Browse the repository at this point in the history
stack-info: PR: aws#592, branch: aws-nslick/stack/33
Signed-off-by: Nicholas Sielicki <[email protected]>
aws-nslick committed Sep 27, 2024
1 parent 4122c6c commit 6755d76
Showing 24 changed files with 1,046 additions and 1 deletion.
Empty file added .docker/cfg/use-cluster.hcl
Empty file.
Empty file added .docker/cfg/use-github.hcl
Empty file.
22 changes: 22 additions & 0 deletions .docker/cfg/use-local.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#
# What is this?
# This overrides defaults for caching, targets, tags, platforms, etc. This
# file is expected to be symlinked into the root of the tree to select the
# builder used by `docker buildx bake'.
#
# This file configures local builds.
#
# See https://docs.docker.com/build/bake/reference/#file-format

# Bake variable VERSION; defaults to "master".
# NOTE(review): nothing in this file reads VERSION - presumably it is consumed
# by the main bake file or overridden on the command line; confirm.
variable "VERSION" { default = "master" }

# Local-build definition of the "efainstaller" target: built for both amd64
# and arm64 from the repository root, with the result kept only in the build
# cache (type=cacheonly exports nothing).
# NOTE(review): the Dockerfiles added next to this file are named
# Dockerfile.cache_efa and Dockerfile.install_efa; confirm that
# Dockerfile.efa is the intended path.
target "efainstaller" {
platforms = [ "linux/amd64", "linux/arm64" ]
context = "."
dockerfile = ".docker/containers/Dockerfile.efa"
output = ["type=cacheonly"]
}
49 changes: 49 additions & 0 deletions .docker/containers/Dockerfile.cache_efa
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Stage "extractor": small helper image that provides the `tar` package
# (the extraction below relies on --wildcards).
# Pinned instead of :latest so rebuilds are reproducible.
FROM alpine:3.20 AS extractor
RUN apk add --no-cache tar

# Stage "extracted": unpack only the installer scripts, text files and the
# package directory matching INSTALLER_PREFIX and the build architecture.
# The tarball is supplied through the `efa_installer_tarball` named context.
FROM extractor AS extracted
ARG INSTALLER_PREFIX
ENV INSTALLER_PREFIX=${INSTALLER_PREFIX}
COPY --from=efa_installer_tarball aws-efa-installer*.tar.gz .
RUN tar xvf *.tar.gz \
        --wildcards "aws-efa-installer/*.txt" \
        --wildcards "aws-efa-installer/*.sh" \
        --wildcards "aws-efa-installer/**/${INSTALLER_PREFIX}/$(uname -m)" && \
    rm -rf *.tar.gz

# Stage "installed": run the EFA installer on top of the distro image supplied
# through the `distro_image` named context.
#   ENABLE_EFA_INSTALLER_DEBUG_INFO=1  adds -d
#   ENABLE_MPI4=1 / ENABLE_MPI5=1      add --mpi openmpi4 / --mpi openmpi5
FROM distro_image AS installed
ARG ENABLE_EFA_INSTALLER_DEBUG_INFO=0
ARG ENABLE_MPI4=0
ARG ENABLE_MPI5=0

ENV ENABLE_EFA_INSTALLER_DEBUG_INFO=${ENABLE_EFA_INSTALLER_DEBUG_INFO}
ENV ENABLE_MPI4=${ENABLE_MPI4}
ENV ENABLE_MPI5=${ENABLE_MPI5}

RUN mkdir /aws-efa-installer
COPY --from=extracted /aws-efa-installer /aws-efa-installer

# XXX: the EFA installer script should refresh the package caches itself.
# XXX: the EFA installer depends on util-linux, which many containers don't have.
#
# The package metadata is refreshed BEFORE the util-linux fallback install:
# container images ship with empty apt lists, so installing first could only
# fail (silently, because every step here is best-effort). `apt-get` is used
# instead of `apt`, whose CLI is not meant for scripts.
RUN ((command -v apt-get && apt-get update -y) || /bin/true) && \
    ((command -v yum && yum update -y) || /bin/true) && \
    ((command -v dnf && dnf -y update) || /bin/true) && \
    (command -v getopt || apt-get install -y util-linux || \
        dnf -y install util-linux || \
        yum install -y util-linux || /bin/true) && \
    cd /aws-efa-installer && \
    ./efa_installer.sh -y -n -l -k -g \
        $(test "$ENABLE_EFA_INSTALLER_DEBUG_INFO" -eq "1" && echo "-d") \
        $(test "$ENABLE_MPI4" -eq "1" && echo "--mpi openmpi4") \
        $(test "$ENABLE_MPI5" -eq "1" && echo "--mpi openmpi5") && \
    cd && rm -rf /aws-efa-installer && \
    ((command -v apt-get && apt-get purge -y && apt-get clean -y) || /bin/true) && \
    ((command -v dnf && dnf clean all -y) || /bin/true) && \
    ((command -v yum && yum clean all -y) || /bin/true) && \
    ((command -v zypper && zypper clean) || /bin/true)
44 changes: 44 additions & 0 deletions .docker/containers/Dockerfile.dnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Global build arguments (visible to FROM lines; redeclared inside the stages
# that use them).
ARG FAMILY=fedora
ARG VERSION=rawhide
ARG VARIANT=cuda
ARG CUDA_DISTRO
ARG AWS_BUILD
ARG ENABLE_POWERTOOLS

# Stage "builder": install the RPM tooling and run the EFA installer copied in
# from the `efainstaller` named context.
FROM ${FAMILY}:${VERSION} AS builder
ARG CUDA_DISTRO
ARG ENABLE_POWERTOOLS
ENV CUDA_DISTRO=${CUDA_DISTRO}
ENV ENABLE_POWERTOOLS=${ENABLE_POWERTOOLS}
COPY --from=efainstaller / /
# After the EFA install: add the NVIDIA repo for CUDA builds, optionally enable
# the Rocky PowerTools repo (ENABLE_POWERTOOLS=1), then update the system.
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    bash -c "cd /aws-efa-installer && dnf install -y gcc rpmdevtools rpmlint dnf-plugins-core util-linux && ./efa_installer.sh -n -l -k -d -y && rm -rf /aws-efa-installer" && \
    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo && \
    ( test "${ENABLE_POWERTOOLS}" = "1" && sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/Rocky-PowerTools.repo || /bin/true ) && \
    dnf -y update && dnf -y upgrade
RUN rpmdev-setuptree

# Stage "environment": rebuild the source RPM from the `srpm` named context.
# TOOLKIT_VERSION selects the CUDA runtime development package and is also
# exported to the spec as _cuda_toolkit_version, mirroring Dockerfile.yum.
# The default keeps the previously hard-coded 12-6.
FROM builder AS environment
ARG VARIANT
ARG AWS_BUILD
ARG TOOLKIT_VERSION=12-6
ENV VARIANT=${VARIANT}
ENV AWS_BUILD=${AWS_BUILD}
ENV TOOLKIT_VERSION=${TOOLKIT_VERSION}
COPY --from=srpm . .
# NOTE(review): looks like a leftover diagnostic; confirm before removing.
RUN yum search hwloc
RUN echo "%with_${VARIANT} 1" >> ~/.rpmmacros
RUN echo "%with_platform_aws ${AWS_BUILD}" >> ~/.rpmmacros
RUN echo "%_cuda_toolkit_version ${TOOLKIT_VERSION}" >> ~/.rpmmacros
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    dnf -y install cuda-cudart-devel-${TOOLKIT_VERSION} && dnf -y builddep *.src.rpm && rpmbuild --rebuild *.src.rpm

# Final stage: export only the built RPMs.
FROM scratch
COPY --from=environment /root/rpmbuild/RPMS/**/* /
65 changes: 65 additions & 0 deletions .docker/containers/Dockerfile.dpkg
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Global build arguments. An ARG declared before the first FROM is only usable
# in FROM lines; each stage redeclares the ones it needs.
ARG FAMILY=ubuntu
ARG VERSION=latest
ARG CUDA_DISTRO
ARG DEBIAN_FRONTEND=noninteractive
ARG AWS_BUILD

# Stage "build": install the EFA stack and build the plugin from the dist
# tarball supplied by the `makedist` named context.
FROM ${FAMILY}:${VERSION} AS build
ARG CUDA_DISTRO
ENV CUDA_DISTRO=${CUDA_DISTRO}
ARG AWS_BUILD=0
ENV AWS_BUILD=${AWS_BUILD}
# Redeclared so apt-get really runs non-interactively in this stage.
ARG DEBIAN_FRONTEND
# Selects the ./configure accelerator flags below. It was previously read
# without ever being declared, so it was always empty; leaving it unset keeps
# that behaviour (neuron flags), passing ACCELERATOR=cuda selects CUDA.
# NOTE(review): this file never installs a CUDA toolkit under /usr/local/cuda;
# confirm before passing ACCELERATOR=cuda.
ARG ACCELERATOR

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y && apt-get install wget -y

RUN wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-keyring_1.1-1_all.deb

RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    dpkg -i cuda-keyring_1.1-1_all.deb

COPY --from=efainstaller / .
# The installer is invoked with options only; a stray "/efa_installer.sh"
# positional argument was removed to match the other Dockerfiles.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    bash -c "apt-get update -y && cd /aws-efa-installer && ./efa_installer.sh -n -l -k -d -y && apt-get install -y autoconf automake libtool gcc g++ git libhwloc-dev make && rm -rf /aws-efa-installer"

COPY --from=makedist / .
RUN tar xvf ./aws-ofi-nccl*.tar.gz -C .
# The trailing slash makes the glob match only the extracted directory, not
# the tarball that sits next to it.
RUN cd aws-ofi-nccl*/ && \
    ./configure --$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes") \
        --prefix=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \
        --with-libfabric=/opt/amazon/efa \
        --disable-tests \
        --$(test "$AWS_BUILD" -eq 0 && echo -n "disable" || echo -n "enable")-platform-aws \
        --with-mpi=no && make -j"$(nproc)" && make install

# Stage "packager": wrap the installed tree into a .deb with fpm.
# Pinned instead of :latest so rebuilds are reproducible.
FROM ubuntu:24.04 AS packager
ARG FAMILY
ARG VERSION
ARG AWS_BUILD=0
ARG DEBIAN_FRONTEND
ENV AWS_BUILD=${AWS_BUILD}
ENV FAMILY=${FAMILY}
ENV VERSION=${VERSION}
COPY --from=build /opt/amazon/ /opt/amazon/
# Remove libtool archives. find -exec succeeds when nothing matches and needs
# no regex quoting, unlike the previous `grep -E \.la$ | xargs rm` pipeline.
RUN find /opt/amazon/ -name '*.la' -exec rm -f {} +
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y && apt-get install -y ruby tar squashfs-tools binutils && gem install fpm
RUN fpm \
    -s dir -t deb \
    --license Apache2.0 \
    -p /libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")-${FAMILY}-${VERSION}.deb \
    --name nccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \
    /opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")/=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")

# Final stage: export only the generated package(s).
FROM scratch
COPY --from=packager /libnccl-net-ofi* /

11 changes: 11 additions & 0 deletions .docker/containers/Dockerfile.dpkg_add_cuda_repo
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Stage "downloader": fetch the NVIDIA CUDA keyring package for CUDA_DISTRO.
# Pinned instead of :latest so rebuilds are reproducible.
FROM alpine:3.20 AS downloader
ARG CUDA_DISTRO
# NOTE(review): $(uname -m) is "aarch64" on arm64 builders; confirm NVIDIA
# publishes a repo directory under that name for every CUDA_DISTRO in use.
RUN wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-keyring_1.1-1_all.deb

# Final stage: register the CUDA apt repository on top of the `base_image`
# named context and install the CUDA runtime development package selected by
# CUDA_TOOLKIT_VERSION_SUFFIX (for example 12-6).
FROM base_image
ARG CUDA_TOOLKIT_VERSION_SUFFIX
ENV CUDA_TOOLKIT_VERSION_SUFFIX=${CUDA_TOOLKIT_VERSION_SUFFIX}
COPY --from=downloader cuda-keyring*.deb .
# The first update is needed to install ca-certificates, the second picks up
# the CUDA repository registered by the keyring package. The apt lists are
# removed in the same layer so they do not end up in the image; the previous
# `apt-get purge -y` with no package names removed nothing.
RUN apt-get update -y && apt-get install -y ca-certificates && dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring*.deb && \
    apt-get update -y && apt-get install -y cuda-cudart-dev-${CUDA_TOOLKIT_VERSION_SUFFIX} && \
    apt-get clean -y && rm -rf /var/lib/apt/lists/*
17 changes: 17 additions & 0 deletions .docker/containers/Dockerfile.install_efa
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Installs the EFA stack (debug info plus both OpenMPI 4 and 5) on top of the
# `distro_image` named context, using installer contents taken from the
# `efa_installer_contents` named context.
FROM distro_image
RUN mkdir /aws-efa-installer
COPY --from=efa_installer_contents /aws-efa-installer /aws-efa-installer

# XXX: EFA installer doesn't refresh the package caches if they're unpopulated,
# as they always are in container images.
#
# XXX: EFA installer depends on util-linux, which many containers don't have.
#
# Fixes relative to the previous revision:
#  * package metadata is refreshed before util-linux is installed (apt lists
#    are empty in container images, so installing first cannot succeed);
#  * `apt-get` replaces `apt`, whose CLI is not meant for scripts;
#  * the trailing cleanup groups are joined with `&&` (they were separated only
#    by line continuations, which is a shell syntax error);
#  * `dnf clean` / `yum clean` are given the `all` argument they require.
RUN ((command -v apt-get && apt-get update -y) || /bin/true) && \
    (! command -v yum || yum update -y) && \
    (command -v getopt || apt-get install -y util-linux 2>/dev/null || \
        dnf -y install util-linux 2>/dev/null || yum -y install util-linux 2>/dev/null) && \
    cd /aws-efa-installer && \
    ./efa_installer.sh -d -y -n -l -k -g --mpi openmpi4,openmpi5 && \
    cd && rm -rf /aws-efa-installer && \
    ((command -v apt-get && apt-get purge -y && apt-get clean -y) || /bin/true) && \
    ((command -v dnf && dnf clean all -y) || /bin/true) && \
    ((command -v yum && yum clean all -y) || /bin/true)
23 changes: 23 additions & 0 deletions .docker/containers/Dockerfile.makedist
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Stage "buildenv": autotools and hwloc headers on top of the `base_image`
# named context.
FROM base_image AS buildenv
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y && apt-get install -y automake autoconf libtool libhwloc-dev

# Stage "distbuilder": configure the tree and produce the `make dist` tarball.
# ACCELERATOR=cuda selects --with-cuda; any other value selects neuron.
FROM buildenv AS distbuilder
ARG ACCELERATOR
ENV ACCELERATOR=${ACCELERATOR}
# Sources cannot come from outside the build context: the builder strips a
# leading "../", so the previous `COPY ../ /proj` already copied the context
# root. Spell that out explicitly.
COPY . /proj
WORKDIR /proj
RUN autoreconf -ivf && \
    ./configure --with-libfabric=/opt/amazon/efa \
        --$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes") && \
    make -j dist

# Final stage: export only the dist tarball.
FROM scratch
COPY --from=distbuilder /proj/aws-ofi-nccl*.tar.gz /
18 changes: 18 additions & 0 deletions .docker/containers/Dockerfile.srpm
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Stage "packitimg": Fedora image with packit and mock installed.
FROM fedora:rawhide AS packitimg
RUN dnf install -y packit mock

# Stage "srpm": overlay the `src` and `makedist` named contexts into /proj
# (WORKDIR creates the directory) and let packit generate the source RPM.
FROM packitimg AS srpm
WORKDIR /proj
COPY --from=src . .
COPY --from=makedist . .
RUN packit srpm

# Final stage: export only the generated source RPM.
FROM scratch
COPY --from=srpm /proj/*.src.rpm /
42 changes: 42 additions & 0 deletions .docker/containers/Dockerfile.yum
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Global build arguments (usable in FROM lines; redeclared per stage below).
ARG FAMILY=amazonlinux
ARG VERSION=2
ARG VARIANT=cuda
ARG CUDA_DISTRO
ARG AWS_BUILD

# Stage "builder": install the RPM tooling and run the EFA installer copied in
# from the `efainstaller` named context, then add the NVIDIA repo for CUDA
# builds and update the system.
FROM ${FAMILY}:${VERSION} AS builder
ARG CUDA_DISTRO
ENV CUDA_DISTRO=${CUDA_DISTRO}
COPY --from=efainstaller / /
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    bash -c "cd /aws-efa-installer && yum install -y gcc rpmdevtools rpmlint yum-utils util-linux && ./efa_installer.sh -n -l -k -d -y && rm -rf /aws-efa-installer" && \
    yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo && \
    yum update -y
RUN rpmdev-setuptree

# Stage "environment": record the build switches as RPM macros and rebuild the
# source RPM supplied by the `srpm` named context.
FROM builder AS environment
ARG VARIANT
ENV VARIANT=${VARIANT}
ARG AWS_BUILD
ENV AWS_BUILD=${AWS_BUILD}
ARG TOOLKIT_VERSION=12-6
ENV TOOLKIT_VERSION=${TOOLKIT_VERSION}
COPY --from=srpm . .
RUN { echo "%with_${VARIANT} 1"; \
      echo "%with_platform_aws ${AWS_BUILD}"; \
      echo "%_cuda_toolkit_version ${TOOLKIT_VERSION}"; } >> ~/.rpmmacros
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    yum install -y cuda-cudart-devel-${TOOLKIT_VERSION} && yum-builddep -y *.src.rpm && rpmbuild --rebuild *.src.rpm

# Final stage: export only the built RPMs.
FROM scratch
COPY --from=environment /root/rpmbuild/RPMS/**/* /
32 changes: 32 additions & 0 deletions .docker/eks/cluster.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
# eksctl ClusterConfig for the "ect-build" cluster in us-west-1.
# The karpenter.sh/discovery tag uses the same "ect-build" value that the
# selector terms in nodepools.yml and provisioner.yml match on.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
name: ect-build
region: us-west-1
version: "1.30"
tags:
karpenter.sh/discovery: ect-build
# Karpenter 1.0.2, with a spot interruption queue and a service account.
karpenter:
version: '1.0.2'
withSpotInterruptionQueue: true
createServiceAccount: true
iam:
withOIDC: true
availabilityZones:
- us-west-1b
- us-west-1c
# One managed node group labelled role=management: Bottlerocket, arm64,
# 8 vCPUs, between 1 and 2 nodes.
managedNodeGroups:
- name: mgmt-ng-1
amiFamily: Bottlerocket
desiredCapacity: 1
minSize: 1
maxSize: 2
labels: { role: management }
tags:
nodegroup-role: management
instanceSelector:
cpuArchitecture: arm64
vCPUs: 8
addons:
- name: eks-pod-identity-agent
88 changes: 88 additions & 0 deletions .docker/eks/nodepools.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
---
# Karpenter NodePool for amd64 Linux spot capacity, capped at 2048 CPUs.
# Nodes use the "bottlerocket" EC2NodeClass.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
name: pool-amd64
annotations:
kubernetes.io/description: "Build nodepool for amd64 workloads"
spec:
# Consolidate empty or underutilized nodes after one minute.
disruption:
consolidateAfter: 1m0s
consolidationPolicy: WhenEmptyOrUnderutilized
#expireAfter: 8h
limits:
cpu: 2048
template:
metadata: {}
spec:
requirements:
- key: kubernetes.io/arch
operator: In
values: ["amd64"]
- key: kubernetes.io/os
operator: In
values: ["linux"]
- key: karpenter.sh/capacity-type
operator: In
values: ["spot"]
nodeClassRef:
group: karpenter.k8s.aws
kind: EC2NodeClass
name: bottlerocket
---
# Karpenter NodePool for arm64 Linux spot capacity, capped at 2048 CPUs.
# Nodes use the "bottlerocket" EC2NodeClass.
apiVersion: karpenter.sh/v1
kind: NodePool
metadata:
  name: pool-arm64
  annotations:
    # Previously said "amd64" (copy-paste from the amd64 pool).
    kubernetes.io/description: "Build nodepool for arm64 workloads"
spec:
  # Consolidate empty or underutilized nodes after one minute.
  disruption:
    consolidateAfter: 1m0s
    consolidationPolicy: WhenEmptyOrUnderutilized
    #expireAfter: 8h
  limits:
    cpu: 2048
  template:
    metadata: {}
    spec:
      requirements:
        - key: kubernetes.io/arch
          operator: In
          values: ["arm64"]
        - key: kubernetes.io/os
          operator: In
          values: ["linux"]
        - key: karpenter.sh/capacity-type
          operator: In
          values: ["spot"]
      nodeClassRef:
        group: karpenter.k8s.aws
        kind: EC2NodeClass
        name: bottlerocket
---
# EC2NodeClass referenced by both NodePools above.
# Subnets and security groups are selected by the karpenter.sh/discovery tag
# with value "ect-build"; the AMI is the latest Bottlerocket alias.
apiVersion: karpenter.k8s.aws/v1
kind: EC2NodeClass
metadata:
name: bottlerocket
spec:
role: "KarpenterNodeRole-ect-build"
subnetSelectorTerms:
- tags:
karpenter.sh/discovery: "ect-build"
securityGroupSelectorTerms:
- tags:
karpenter.sh/discovery: "ect-build"
amiSelectorTerms:
- alias: bottlerocket@latest
# Two gp3 volumes, both deleted on termination: 10Gi on /dev/xvda and
# 40Gi on /dev/xvdb.
blockDeviceMappings:
- deviceName: /dev/xvda
ebs:
volumeType: gp3
volumeSize: 10Gi
deleteOnTermination: true
- deviceName: /dev/xvdb
ebs:
volumeType: gp3
volumeSize: 40Gi
deleteOnTermination: true
20 changes: 20 additions & 0 deletions .docker/eks/provisioner.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
# On-demand "management" provisioner limited to 1000 CPUs; empty nodes are
# removed after 30 seconds.
# NOTE(review): the Provisioner kind and the provider / ttlSecondsAfterEmpty
# fields look like the older karpenter.sh/v1alpha5 API, while this file
# declares karpenter.sh/v1 and cluster.yml installs Karpenter 1.0.2 - confirm
# whether this should be expressed as a NodePool plus EC2NodeClass instead.
apiVersion: karpenter.sh/v1
kind: Provisioner
metadata:
name: management
spec:
requirements:
- key: karpenter.sh/capacity-type
operator: In
values: ["on-demand"]
limits:
resources:
cpu: 1000
provider:
instanceProfile: eksctl-KarpenterNodeInstanceProfile-ect-build
subnetSelector:
karpenter.sh/discovery: ect-build
securityGroupSelector:
karpenter.sh/discovery: ect-build
ttlSecondsAfterEmpty: 30
27 changes: 27 additions & 0 deletions .docker/eks/scale.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# "inflate" Deployment: pause containers that each request one CPU, deployed
# with zero replicas.
# NOTE(review): presumably scaled up by hand to make Karpenter provision
# nodes - confirm intended usage.
apiVersion: apps/v1
kind: Deployment
metadata:
name: inflate
spec:
replicas: 0
selector:
matchLabels:
app: inflate
template:
metadata:
labels:
app: inflate
spec:
terminationGracePeriodSeconds: 0
# Run as a non-root user/group.
securityContext:
runAsUser: 1000
runAsGroup: 3000
fsGroup: 2000
containers:
- name: inflate
image: public.ecr.aws/eks-distro/kubernetes/pause:3.7
resources:
requests:
cpu: 1
securityContext:
allowPrivilegeEscalation: false
81 changes: 81 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
*~

src/*.o
src/tuner/*.o
src/*.lo
src/*.la
libnccl-net.so
tags

src/tuner/.dirstamp
src/tuner/*.lo

tests/functional/*.o
tests/unit/*.o
tests/unit/*.log
tests/unit/*.trs
tests/functional/nccl_connection
tests/functional/nccl_message_transfer
tests/functional/ring
tests/functional/cuda_check
tests/unit/msgbuff
tests/unit/freelist
tests/unit/deque
tests/unit/scheduler
tests/unit/idpool
tests/unit/show_tuner_decisions
tests/unit/show_tuner_costs
tests/unit/ep_addr_list
tests/unit/mr

# http://www.gnu.org/software/automake
.deps/
Makefile.in
Makefile
/ar-lib
/mdate-sh
/py-compile
/test-driver
/ylwrap

# http://www.gnu.org/software/autoconf
build-aux/
autom4te.cache
/autoscan.log
/autoscan-*.log
/aclocal.m4
/compile
/config.guess
/config.log
/config.status
/config.sub
/config.cache
/configure
/configure.scan
/depcomp
/install-sh
/missing
/stamp-h1
/include/stamp-h1
/include/config.h
/include/config.h.in

# https://www.gnu.org/software/libtool/
/ltmain.sh
.libs/
libtool

# http://www.gnu.org/software/m4/
m4/libtool.m4
m4/ltoptions.m4
m4/ltsugar.m4
m4/ltversion.m4
m4/lt~obsolete.m4

# other
# Editor/tooling directories, generated source RPMs, and local build output
# (the bake file's type=local outputs are written under dockerbld/).
.idea/
.devenv/
.direnv
*.src.rpm
dockerbld
result
94 changes: 94 additions & 0 deletions .github/workflows/packages.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Builds the dist tarball, SRPM, and distro packages via `docker buildx bake`.
name: Package Generation
# Runs on manual dispatch, on pushes to master, main and v* branches, and on
# every pull request.
on:
workflow_dispatch:
push:
branches:
- master
- main
- v*
pull_request:

jobs:
# Job "dist": bake the `makedist` target with QEMU-backed buildx for amd64 and
# arm64, using the GitHub Actions cache for layers.
dist:
  name: Call make dist
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Set up QEMU
      uses: docker/setup-qemu-action@v3
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
      with:
        platforms: linux/amd64,linux/arm64
    - name: docker buildx bake makedist
      uses: docker/bake-action@v5
      with:
        set: |
          *.cache-from=type=gha
          *.cache-to=type=gha,mode=max
        # Never push from pull requests; same condition as the debs and rpms
        # jobs (this job previously pushed unconditionally).
        push: ${{ github.event_name != 'pull_request' }}
        targets: makedist
# Job "srpm": runs after `dist` and bakes the `srpm` target with QEMU-backed
# buildx for amd64 and arm64, using the GitHub Actions cache for layers.
srpm:
  name: Generate a universal SRPM
  needs: [ dist ]
  runs-on: ubuntu-latest
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Set up QEMU
      uses: docker/setup-qemu-action@v3
    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3
      with:
        platforms: linux/amd64,linux/arm64
    - name: docker buildx bake srpm
      uses: docker/bake-action@v5
      with:
        set: |
          *.cache-from=type=gha
          *.cache-to=type=gha,mode=max
        # Never push from pull requests; same condition as the debs and rpms
        # jobs (this job previously pushed unconditionally).
        push: ${{ github.event_name != 'pull_request' }}
        targets: srpm
# Job "debs": runs after `dist` and bakes the `debs` target; pushes only when
# the triggering event is not a pull request.
debs:
name: Generate Debian-like Packages
needs: [ dist ]
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
platforms: linux/amd64,linux/arm64
- name: docker buildx bake debs
uses: docker/bake-action@v5
with:
set: |
*.cache-from=type=gha
*.cache-to=type=gha,mode=max
push: ${{ github.event_name != 'pull_request' }}
targets: debs
# Job "rpms": runs after `srpm` and bakes the `rpms` target; pushes only when
# the triggering event is not a pull request.
rpms:
name: Generate RPM-like Packages
needs: [ srpm ]
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
platforms: linux/amd64,linux/arm64
- name: docker buildx bake rpms
uses: docker/bake-action@v5
with:
set: |
*.cache-from=type=gha
*.cache-to=type=gha,mode=max
push: ${{ github.event_name != 'pull_request' }}
targets: rpms
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -76,3 +76,4 @@ m4/lt~obsolete.m4
.devenv/
.direnv
result
dockerbld
20 changes: 20 additions & 0 deletions .packit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
# vi:ts=2 sw=2 et:
#
# Docs: https://packit.dev/docs/
#
# Packit configuration used to generate the source RPM for libnccl-net-ofi.


srpm_build_deps:
- git

# Action overrides:
#  * get-current-version prints the contents of the .version file;
#  * create-archive only echoes the path of an aws-ofi-nccl tarball named after
#    the project version - it does not build one, so the tarball is expected to
#    already be present in the working directory.
actions:
get-current-version:
- bash -c "cat .version"
create-archive:
- bash -c "echo ./aws-ofi-nccl-${PACKIT_PROJECT_VERSION}.tar.gz"

specfile_path: .packit/libnccl-net-ofi.spec
upstream_package_name: libnccl-net-ofi
downstream_package_name: libnccl-net-ofi
release_suffix: "{PACKIT_PROJECT_BRANCH}"
update_release: false
98 changes: 98 additions & 0 deletions .packit/libnccl-net-ofi.spec
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Build switches, expected to be supplied by the builder (for example through
# ~/.rpmmacros):
#   with_cuda / with_neuron   accelerator selection; at most one may be 1
#   with_platform_aws         1 selects the AWS-specific build
%if "%{with_cuda}" == "1" && "%{with_neuron}" == "1"
%{error:Neuron and CUDA must not be enabled together}
%endif

%if "%{with_cuda}" == "0" && "%{with_neuron}" == "0"
%{error:One of Neuron or CUDA must be enabled}
%endif

%if "%{with_cuda}" == "1"
%{!?target: %global target nccl}
%endif
%if "%{with_neuron}" == "1"
%{!?target: %global target nccom}
%endif

%global pname_base lib%{!?with_neuron:nccl}%{?with_neuron:nccom}-net-ofi

# The AWS variant is selected only when with_platform_aws is exactly 1, the
# same test the build section uses for --enable-platform-aws. Testing the bare
# string, or whether the macro is merely defined, was also true for a value
# of 0.
%if "%{?with_platform_aws}" == "1"
%global pname %{pname_base}-aws
%global _prefix /opt/amazon/%{pname_base}
%else
%global pname %{pname_base}
%endif

# (CUDA only) what toolkit package to declare a build dependency on. Default: 12-6
%{!?_cuda_toolkit_version: %global _cuda_toolkit_version 12-6}

# Package header. Version and Source0 are placeholders here.
# NOTE(review): presumably filled in by packit when the SRPM is generated;
# confirm.
Name: %{pname}
Version: null
# The conditional form expands to nothing when dist is undefined instead of
# leaving the macro text in the release string.
Release: 0%{?dist}
Summary: NCCL + libfabric compatibility layer
License: Apache-2.0
URL: https://github.com/aws/aws-ofi-nccl
Source0: null
%if "%{_vendor}" == "debbuild"
Group: devel
%else
Group: Development/Tools%{?suse_version:/Building}
BuildRequires: hwloc-devel
BuildRequires: make
BuildRequires: gcc
BuildRequires: gcc-c++
# Depend on the AWS libfabric packages only when with_platform_aws is exactly
# 1; a bare string test is true for any non-empty value, including 0.
%if "%{?with_platform_aws}" == "1"
BuildRequires: libfabric-aws-devel
Requires: libfabric-aws
%else
BuildRequires: libfabric1-devel
Requires: libfabric
%endif
%if "%{with_cuda}" == "1"
BuildRequires: cuda-cudart-devel-%{_cuda_toolkit_version}
%endif
%endif
Requires: hwloc

%description
This is a plugin which lets EC2 developers use libfabric as network provider
while running NCCL applications.


%prep
# Unpack the source archive.
%setup
%build
# Configure without tests or MPI. The accelerator flags follow with_cuda and
# the platform flags follow with_platform_aws.
# NOTE(review): the CUDA path is fixed at /usr/local/cuda-12 while the toolkit
# package version is configurable; confirm the two stay in sync.
%configure \
--prefix="%{_prefix}" \
--disable-tests \
--with-mpi=no \
%if "%{with_cuda}" == "1"
--with-cuda=/usr/local/cuda-12 \
--enable-neuron=no \
%else
--with-cuda=no \
--enable-neuron=yes \
%endif
%if "%{with_platform_aws}" == "1"
--enable-platform-aws \
--with-libfabric=/opt/amazon/efa
%else
--disable-platform-aws
%endif
%make_build


%install
%make_install
# Drop libtool archives from the build root; they are not listed in the files
# section below.
find %{buildroot} -name '*.la' -exec rm -f {} ';'
%ldconfig_scriptlets


%files
# Shared objects from the library directory and the XML files installed under
# the aws-ofi-nccl data directory.
%{_libdir}/*.so
%{_datadir}/aws-ofi-nccl/xml/*.xml
%license LICENSE NOTICE
%doc


%changelog
* Thu Aug 08 2024 Nicholas Sielicki <nslick@amazon.com>
Initial Package
1 change: 1 addition & 0 deletions .version
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1.12.0pre
1 change: 1 addition & 0 deletions Makefile.am
Original file line number Diff line number Diff line change
@@ -7,6 +7,7 @@
ACLOCAL_AMFLAGS = -I m4
SUBDIRS = include src topology tests
EXTRA_DIST = \
.version \
autogen.sh \
CODE_OF_CONDUCT.md \
CONTRIBUTING.md \
2 changes: 1 addition & 1 deletion configure.ac
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@
#

# Initialization
AC_INIT([aws-ofi-nccl], [GitHub-dev], [al-ofi-nccl-team@amazon.com], , [http://github.com/aws/aws-ofi-nccl])
AC_INIT([aws-ofi-nccl], m4_normalize(m4_include([.version])), [al-ofi-nccl-team@amazon.com], , [http://github.com/aws/aws-ofi-nccl])
AC_PREREQ([2.69])
AC_CONFIG_SRCDIR([src/nccl_ofi_net.c])
AC_CONFIG_AUX_DIR([build-aux])
291 changes: 291 additions & 0 deletions docker-bake.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#
#
# Usage: https://docs.docker.com/reference/cli/docker/buildx/bake/

# Notes:
# * arm64 builds will use qemu by default, but requires containerd snapshotting
# to be enabled in docker's daemon.json, or explicit creation of an arm64
# capable context.
#
# * developers should strongly consider standing up an eks cluster and
# configuring a k8s builder for native arm64 builds:
# https://docs.docker.com/build/builders/drivers/kubernetes/

# Targets built when `docker buildx bake` is run without arguments.
# NOTE(review): the "rpms" and "debs" targets are commented out further down in
# this file; confirm they are provided by the symlinked override file,
# otherwise this group refers to undefined targets.
group "default" { targets = [ "rpms", "debs" ] }

# Distro images used as the base_image matrix axis of the
# efa_installer_base_images target.
variable "base_images" {
type = list(string)
default = [
"amazonlinux:2",
"amazonlinux:2023",
"rockylinux:8",
"rockylinux:9",
"opensuse/leap:15",
# Intentionally not included
# "centos:centos7"
# Debian
"debian:10",
"debian:11",
# "debian:12" # not supported by EFA installer.
"ubuntu:20.04",
"ubuntu:22.04",
"ubuntu:24.04",
]
}


# Maps an image reference to the value passed as INSTALLER_PREFIX: strips "."
# and ":", upper-cases, then removes "MAZON", "/", "OPEN" and "LEAP15".
# Examples: "amazonlinux:2" -> "ALINUX2", "rockylinux:8" -> "ROCKYLINUX8",
# "ubuntu:22.04" -> "UBUNTU2204", "opensuse/leap:15" -> "SUSE".
function "efa_installer_dir_name" {
params = [base_image]
result = "${replace(replace(replace(replace(upper(replace(replace(base_image, ".", ""), ":", "")), "MAZON", ""), "/", ""), "OPEN", ""), "LEAP15", "")}"
}

# Maps an image reference to a string used in target names and tags: "/"
# becomes "_", while ":" and "." are removed.
# Examples: "ubuntu:22.04" -> "ubuntu2204", "opensuse/leap:15" -> "opensuse_leap15".
function "plaintext_image_name" {
params = [base_image]
result = "${replace(replace(replace(base_image, "/", "_"), ":", ""), ".", "")}"
}

# Maps an image reference to the value passed as CUDA_DISTRO: removes ":" and
# "." but leaves "/" untouched.
# Examples: "ubuntu:22.04" -> "ubuntu2204", "debian:11" -> "debian11".
function "baseimage_to_cuda_repo_name" {
params = [base_image]
result = "${replace(replace(base_image, ":", ""), ".", "")}"
}

# Caches efa installer packages, without actually installing them.
# One generated target per combination of base image, EFA installer version,
# MPI selection and debug-symbol setting. Each builds
# .docker/containers/Dockerfile.cache_efa with the installer tarball and the
# distro image supplied as named contexts.
target "efa_installer_base_images" {
  name = "${plaintext_image_name(base_image)}_base_efa-installer-${replace(efa_installer_version, ".", "-")}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? "-mpi5" : ""}"
  tags = [ "982534352369.dkr.ecr.us-west-1.amazonaws.com/common/efa_installer/${plaintext_image_name(base_image)}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? "-mpi5" : ""}:${efa_installer_version}" ]
  matrix = {
    # The image list lives in the "base_images" variable.
    base_image = base_images,
    efa_installer_version = [
      "1.34.0",
      "1.33.0",
    ]
    item = [
      { mpi4_enabled = 1, mpi5_enabled = 0 },
      { mpi4_enabled = 0, mpi5_enabled = 1 },
      { mpi4_enabled = 0, mpi5_enabled = 0 },
    ]
    debug_enabled = [ 0, 1 ]
  }
  contexts = {
    efa_installer_tarball = "https://efa-installer.amazonaws.com/aws-efa-installer-${efa_installer_version}.tar.gz"
    distro_image = "docker-image://${base_image}"
  }
  # Build platforms. This attribute was previously spelled "targets", which is
  # a group attribute and is not accepted inside a target block.
  platforms = [ "linux/amd64", "linux/arm64" ]
  args = {
    INSTALLER_PREFIX = "${efa_installer_dir_name(base_image)}"
    ENABLE_EFA_INSTALLER_DEBUG_INFO = debug_enabled
    ENABLE_MPI4 = item.mpi4_enabled,
    ENABLE_MPI5 = item.mpi5_enabled,
  }
  dockerfile = ".docker/containers/Dockerfile.cache_efa"
  #output = ["type=image,push=true"]
}

# Adds the CUDA repository and runtime development package on top of a
# matching efa_installer_base_images target (supplied as the base_image named
# context).
#
# Fixes relative to the previous revision:
#  * `item` and `debug_enabled` are now matrix axes; name, tags and the
#    base_image context referenced them without their being defined here;
#  * the toolkit suffix is derived from the `cuda_version` axis (it referenced
#    an undefined CUDA_VERSION);
#  * the generated name carries the CUDA version so every matrix combination
#    gets a distinct target name.
# NOTE(review): the Dockerfile used here is apt/dpkg based while the matrix
# also lists rpm-based images; confirm those entries are intended.
target "cuda_enabled_build_images" {
  name = "${plaintext_image_name(base_image)}_efa-installer-${replace(efa_installer_version, ".", "-")}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? "-mpi5" : ""}-cuda${replace(cuda_version, ".", "-")}"
  tags = [ "982534352369.dkr.ecr.us-west-1.amazonaws.com/common/efa_installer/${plaintext_image_name(base_image)}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? "-mpi5" : ""}:${efa_installer_version}-cuda${cuda_version}" ]
  dockerfile = ".docker/containers/Dockerfile.dpkg_add_cuda_repo"
  output = ["type=cacheonly"]
  args = { CUDA_DISTRO = "${baseimage_to_cuda_repo_name(base_image)}", CUDA_TOOLKIT_VERSION_SUFFIX = "${replace(cuda_version, ".", "-")}" }
  contexts = {
    base_image = "target:${plaintext_image_name(base_image)}_base_efa-installer-${replace(efa_installer_version, ".", "-")}${debug_enabled == 1 ? "-debugsyms" : ""}${ item.mpi4_enabled == 1 ? "-mpi4" : "" }${ item.mpi5_enabled == 1 ? "-mpi5" : ""}"
  }
  matrix = {
    base_image = [
      "amazonlinux:2",
      "amazonlinux:2023",
      "rockylinux:8",
      "rockylinux:9",
      "opensuse/leap:15",
      # Intentionally not included
      # "centos:centos7"
      "debian:10",
      "debian:11",
      #"debian:12",
      "ubuntu:20.04",
      "ubuntu:22.04",
      "ubuntu:24.04",
    ]
    efa_installer_version = [ "1.34.0", "1.33.0" ]
    # Same MPI / debug axes as efa_installer_base_images, so that every
    # base_image context reference resolves to an existing target.
    item = [
      { mpi4_enabled = 1, mpi5_enabled = 0 },
      { mpi4_enabled = 0, mpi5_enabled = 1 },
      { mpi4_enabled = 0, mpi5_enabled = 0 },
    ]
    debug_enabled = [ 0, 1 ]
    cuda_version = [
      "12.6",
      "12.5",
      "12.4",
      "12.3",
      "12.2",
      "12.1",
      "12.0",
      "11.8",
      "11.7",
    ]
  }
}

# Generate a `make dist` tarball. Note that this requires ./configure to be
# called, and that the contents of this "dist tarball" may differ depending on
# the configuration options passed. Requires dependencies to be installed as
# ./configure aborts if they cannot resolve.
#target "makedist" {
# name = "makedist-${item.accelerator}"
# matrix = {
# item = [
# { accelerator = "neuron", base_image = "target:ubuntu2204_efa-installer-${replace(, ".", "-")}" },
# { accelerator = "cuda", base_image = "target:ubuntu2204_efa-installer-${replace(EFA_INSTALLER_VERSION, ".", "-")}" },
# ]
# }
# contexts = { src = ".", base_image = "${item.base_image}" }
# args = { ACCELERATOR = item.accelerator }
# dockerfile = ".docker/containers/Dockerfile.makedist"
# output = ["type=local,dest=dockerbld/tarball"]
#}

# # Generate a universal srpm using packit.
# target "srpm" {
# contexts = { src = ".", makedist = "target:makedist-neuron" }
# dockerfile = ".docker/containers/Dockerfile.srpm"
# output = ["type=local,dest=dockerbld/srpm"]
# }
#
# # Generate RPMs from the srpm above.
# target "rpms" {
# name = "pkg${item.aws == "1" ? "-aws" : ""}-${replace(item.family, "/", "_")}-${replace(item.version, ".", "_")}"
# matrix = {
# item = [
# {
# family = "amazonlinux",
# package_frontend = "dnf",
# version = "2023",
# efa = "latest",
# cuda_distro = "amzn2023",
# toolkit_version = "12-6",
# accelerator = "cuda",
# enable_powertools = "0",
# aws = "1"
# },
# {
# family = "amazonlinux",
# package_frontend = "yum",
# version = "2",
# efa = "latest",
# cuda_distro = "rhel7",
# toolkit_version = "12-3",
# accelerator = "cuda",
# enable_powertools = "0",
# aws = "1"
# },
# {
# family = "rockylinux",
# package_frontend = "dnf",
# version = "8",
# efa = "latest",
# cuda_distro = "rhel8",
# toolkit_version = "12-6",
# accelerator = "cuda",
# enable_powertools = "1",
# aws = "1"
# },
# {
# family = "rockylinux",
# package_frontend = "dnf",
# version = "9",
# efa = "latest",
# cuda_distro = "rhel9",
# toolkit_version = "12-6",
# accelerator = "cuda",
# enable_powertools = "0",
# aws = "1"
# },
# ]
# }
# contexts = {
# efainstaller = "target:efainstaller"
# srpm = "target:srpm"
# }
# dockerfile = ".docker/containers/Dockerfile.${item.package_frontend}"
# output = ["type=local,dest=dockerbld/pkgs"]
# args = {
# FAMILY = item.family,
# VERSION = item.version
# EFA_INSTALLER_VERSION = item.efa
# CUDA_DISTRO = item.cuda_distro
# VARIANT = item.accelerator
# AWS_BUILD = item.aws
# TOOLKIT_VERSION = item.toolkit_version
# ENABLE_POWERTOOLS = item.enable_powertools
# }
# }
#
# # Build and package for debian-like distributions by building and invoking fpm.
# target "debs" {
# name = "pkg-${item.accelerator}${item.aws == "1" ? "-aws" : ""}-${replace(item.family, "/", "_")}-${replace(item.version, ".", "_")}"
# matrix = {
# item = [
# { accelerator = "cuda", aws = "1", family = "debian", version = "oldstable", cuda_distro = "debian11" },
# # XXX: EFA Installer lacks support.
# #{ accelerator = "cuda", aws = "1", platform = "amd64", family = "debian", version = "stable", cuda_distro = "debian11" },
# { accelerator = "cuda", aws = "1", family = "ubuntu", version = "20.04", cuda_distro = "ubuntu2004" },
# { accelerator = "cuda", aws = "1", family = "ubuntu", version = "22.04", cuda_distro = "ubuntu2204" },
# { accelerator = "cuda", aws = "1", family = "ubuntu", version = "24.04", cuda_distro = "ubuntu2404" },
# { accelerator = "cuda", aws = "0", family = "debian", version = "oldstable", cuda_distro = "debian11" },
# # XXX: EFA Installer lacks support.
# #{ accelerator = "cuda", aws = "0", family = "debian", version = "stable", cuda_distro = "debian11" },
# { accelerator = "cuda", aws = "0", family = "ubuntu", version = "20.04", cuda_distro = "ubuntu2004" },
# { accelerator = "cuda", aws = "0", family = "ubuntu", version = "22.04", cuda_distro = "ubuntu2204" },
# { accelerator = "cuda", aws = "0", family = "ubuntu", version = "24.04", cuda_distro = "ubuntu2404" },
#
# # XXX: todo
# # { accelerator = "neuron", aws = "1", platform = "amd64", family = "debian", version = "oldstable", cuda_distro = "debian11" },
# # #{ accelerator = "neuron", aws = "1", platform = "amd64", family = "debian", version = "stable", cuda_distro = "debian11" },
# # { accelerator = "neuron", aws = "1", platform = "amd64", family = "ubuntu", version = "20.04", cuda_distro = "ubuntu2004" },
# # { accelerator = "neuron", aws = "1", platform = "amd64", family = "ubuntu", version = "22.04", cuda_distro = "ubuntu2204" },
# # { accelerator = "neuron", aws = "1", platform = "amd64", family = "ubuntu", version = "24.04", cuda_distro = "ubuntu2404" },
#
# # { accelerator = "neuron", aws = "0", platform = "amd64", family = "debian", version = "oldstable", cuda_distro = "debian11" },
# # #{ accelerator = "neuron", aws = "0", platform = "amd64", family = "debian", version = "stable", cuda_distro = "debian11" },
# # { accelerator = "neuron", aws = "0", platform = "amd64", family = "ubuntu", version = "20.04", cuda_distro = "ubuntu2004" },
# # { accelerator = "neuron", aws = "0", platform = "amd64", family = "ubuntu", version = "22.04", cuda_distro = "ubuntu2204" },
# # { accelerator = "neuron", aws = "0", platform = "amd64", family = "ubuntu", version = "24.04", cuda_distro = "ubuntu2404" },
# ]
# }
# contexts = {
# efainstaller = "target:efainstaller"
# makedist = "target:makedist-${item.accelerator}"
# }
# dockerfile = ".docker/containers/Dockerfile.dpkg"
# output = ["type=local,dest=dockerbld/pkgs"]
# args = {
# FAMILY = item.family,
# VERSION = item.version
# CUDA_DISTRO = item.cuda_distro
# AWS_BUILD = item.aws
# }
# }
#
# target "nccl_tests" {
# matrix = {
#
# }
# contexts = {
# nccl_tests = "https://github.com/NVIDIA/nccl-tests.git"
# }
# }
#

0 comments on commit 6755d76

Please sign in to comment.