Skip to content

Commit

Permalink
ci: add package generation
Browse files Browse the repository at this point in the history
  • Loading branch information
aws-nslick committed Aug 30, 2024
1 parent 12c280b commit 494d0fd
Show file tree
Hide file tree
Showing 15 changed files with 601 additions and 72 deletions.
44 changes: 44 additions & 0 deletions .docker/Dockerfile.dnf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Build arguments shared by the stages below. ARGs declared before the first
# FROM are only visible to FROM lines, so each stage redeclares what it uses.
ARG FAMILY=fedora
ARG VERSION=rawhide
ARG VARIANT=cuda
ARG CUDA_DISTRO
ARG AWS_BUILD
ARG ENABLE_POWERTOOLS

# Stage "builder": distro base image with the EFA installer payload applied,
# RPM packaging tools installed and the NVIDIA CUDA repository configured.
FROM ${FAMILY}:${VERSION} AS builder
ARG CUDA_DISTRO
ARG ENABLE_POWERTOOLS
ENV CUDA_DISTRO=${CUDA_DISTRO}
ENV ENABLE_POWERTOOLS=${ENABLE_POWERTOOLS}
# "efainstaller" is not defined in this file; presumably it is supplied as a
# named build context that provides /aws-efa-installer -- verify against the
# build invocation.
COPY --from=efainstaller / /
# One layer that:
#   1. installs the packaging toolchain and runs the EFA installer, then
#      removes the installer tree;
#   2. adds the NVIDIA repository for ${CUDA_DISTRO} and the host architecture;
#   3. best-effort enables the Rocky PowerTools repo when ENABLE_POWERTOOLS=1
#      (a failing sed is deliberately ignored via "|| /bin/true");
#   4. upgrades all packages. "dnf update" is an alias of "dnf upgrade", so
#      the transaction is run once instead of twice.
# NOTE(review): "dnf config-manager --add-repo" is the dnf4 plugin syntax;
# confirm it is accepted by the dnf shipped in the default fedora:rawhide base.
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    bash -c "cd /aws-efa-installer && dnf install -y gcc rpmdevtools rpmlint dnf-plugins-core util-linux && ./efa_installer.sh -n -l -k -d -y && rm -rf /aws-efa-installer" && \
    dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo && \
    ( test "${ENABLE_POWERTOOLS}" = "1" && sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/Rocky-PowerTools.repo || /bin/true ) && \
    dnf -y upgrade
# Create the ~/rpmbuild directory tree used by rpmbuild.
RUN rpmdev-setuptree

# Stage "environment": rebuild the source RPM into binary RPMs.
FROM builder AS environment
ARG VARIANT
ARG AWS_BUILD
# CUDA toolkit version suffix as used in NVIDIA package names. The default
# matches the version that was previously hard-coded here; Dockerfile.yum
# exposes the same knob.
ARG TOOLKIT_VERSION=12-6
ENV VARIANT=${VARIANT}
ENV AWS_BUILD=${AWS_BUILD}
ENV TOOLKIT_VERSION=${TOOLKIT_VERSION}
# "srpm" is not defined in this file; presumably a named build context that
# carries the *.src.rpm -- verify against the build invocation.
COPY --from=srpm . .
# rpm macros consumed by the spec file during the rebuild.
RUN echo "%with_${VARIANT} 1" >> ~/.rpmmacros
RUN echo "%with_platform_aws ${AWS_BUILD}" >> ~/.rpmmacros
# Same macro Dockerfile.yum writes, so both RPM flows see the toolkit version.
RUN echo "%_cuda_toolkit_version ${TOOLKIT_VERSION}" >> ~/.rpmmacros
# Install the CUDA runtime headers, resolve the SRPM's BuildRequires, and
# rebuild it; results land under /root/rpmbuild/RPMS/<arch>/.
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    dnf -y install cuda-cudart-devel-${TOOLKIT_VERSION} && dnf -y builddep *.src.rpm && rpmbuild --rebuild *.src.rpm

# Export stage: an otherwise empty image that contains only the built RPMs.
FROM scratch
COPY --from=environment /root/rpmbuild/RPMS/**/* /
55 changes: 55 additions & 0 deletions .docker/Dockerfile.dpkg
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Build arguments shared by the stages below. ARGs declared before the first
# FROM are only visible to FROM lines, so each stage redeclares what it uses.
ARG FAMILY=ubuntu
ARG VERSION=latest
ARG CUDA_DISTRO
ARG DEBIAN_FRONTEND=noninteractive
ARG AWS_BUILD

# Stage "build": configure, compile and install the plugin under /opt/amazon.
FROM ${FAMILY}:${VERSION} AS build
ARG CUDA_DISTRO
ENV CUDA_DISTRO=${CUDA_DISTRO}
ARG AWS_BUILD=0
ENV AWS_BUILD=${AWS_BUILD}
# Redeclared so the global default actually reaches the apt-get calls below;
# kept as ARG so it is not baked into the image environment.
ARG DEBIAN_FRONTEND
# Selects the configure branch below ("cuda" -> --with-cuda, anything else ->
# --enable-neuron). It was previously never declared in this file, so a
# --build-arg ACCELERATOR=... was ignored. Left without a default so the
# default behaviour (neuron branch) is unchanged.
# NOTE(review): this stage installs only the CUDA keyring, not CUDA headers;
# confirm /usr/local/cuda is populated before building with ACCELERATOR=cuda.
ARG ACCELERATOR
# Register NVIDIA's apt keyring for ${CUDA_DISTRO}; the downloaded .deb is
# removed in the same layer.
RUN apt-get update -y && apt-get install wget -y && \
    wget https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-keyring_1.1-1_all.deb && \
    dpkg -i cuda-keyring_1.1-1_all.deb && \
    rm -f cuda-keyring_1.1-1_all.deb
# "efainstaller" is not defined in this file; presumably a named build context
# that provides /aws-efa-installer. The destination is absolute because the
# next step uses the absolute path /aws-efa-installer.
COPY --from=efainstaller / /
# NOTE(review): the extra positional argument "/efa_installer.sh" passed to
# efa_installer.sh is kept as-is; confirm whether it is intentional.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    bash -c "apt-get update -y && cd /aws-efa-installer && ./efa_installer.sh /efa_installer.sh -n -l -k -d -y && apt-get install -y autoconf automake libtool gcc git libhwloc-dev make && rm -rf /aws-efa-installer"

# "makedist" is not defined in this file; presumably a named build context
# that carries the aws-ofi-nccl*.tar.gz distribution tarball.
COPY --from=makedist / .
RUN tar xvf ./aws-ofi-nccl*.tar.gz -C .
# The trailing slash restricts the glob to the extracted directory; without it
# the pattern also matches the tarball sitting next to it.
# Parallelism is bounded to the number of available CPUs.
RUN cd aws-ofi-nccl*/ && \
    ./configure --$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes") \
    --prefix=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \
    --with-libfabric=/opt/amazon/efa \
    --disable-tests \
    --$(test "$AWS_BUILD" -eq 0 && echo -n "disable" || echo -n "enable")-platform-aws \
    --with-mpi=no && make -j"$(nproc)" && make install

# Stage "packager": wrap the installed tree from the "build" stage into a .deb
# using fpm.
FROM ubuntu:latest AS packager
ARG FAMILY
ARG VERSION
ARG AWS_BUILD=0
ENV AWS_BUILD=${AWS_BUILD}
ENV FAMILY=${FAMILY}
ENV VERSION=${VERSION}
COPY --from=build /opt/amazon/ /opt/amazon/
# Drop libtool archives before packaging. find matches the literal "*.la"
# suffix and succeeds when nothing matches; the previous
# `grep -E \.la$ | xargs rm` lost its backslash to the shell (pattern ".la$")
# and failed the build on an empty match.
RUN find /opt/amazon/ -name '*.la' ! -type d -delete
RUN apt-get update -y && apt-get install -y ruby tar squashfs-tools binutils && gem install fpm
# Package name, output file name and packaged prefix all get an "-aws" suffix
# when AWS_BUILD is non-zero.
RUN fpm \
    -s dir -t deb \
    --license Apache2.0 \
    -p /libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")-${FAMILY}-${VERSION}.deb \
    --name nccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws") \
    /opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")/=/opt/amazon/libnccl-net-ofi$(test "$AWS_BUILD" -eq 0 || echo -n "-aws")

# Export stage: an otherwise empty image that contains only the built .deb.
FROM scratch
COPY --from=packager /libnccl-net-ofi* /

15 changes: 15 additions & 0 deletions .docker/Dockerfile.efa
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Stage "efa_installer_extracted": download and unpack the EFA installer
# tarball for EFA_INSTALLER_VERSION into /aws-efa-installer.
FROM alpine:latest AS efa_installer_extracted
ARG EFA_INSTALLER_VERSION=latest
ENV EFA_INSTALLER_VERSION=${EFA_INSTALLER_VERSION}
# Make a failing curl fail the whole "curl | tar" pipeline instead of being
# masked by tar's exit status.
SHELL ["/bin/ash", "-o", "pipefail", "-c"]
RUN apk add --no-cache curl tar
# -f turns HTTP errors into a non-zero exit; -S still prints the error while
# -s suppresses the progress meter.
RUN curl -fsSL https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz | tar -xvzf - -C /

# Export stage: an otherwise empty image that contains only the unpacked
# installer tree.
FROM scratch
COPY --from=efa_installer_extracted /aws-efa-installer /aws-efa-installer
26 changes: 26 additions & 0 deletions .docker/Dockerfile.makedist
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# ACCELERATOR selects the configure branch ("cuda" -> --with-cuda, anything
# else -> --enable-neuron); BASE_IMAGE is the image the tarball is built on.
ARG ACCELERATOR
ARG BASE_IMAGE=ubuntu:22.04

# Stage "distbuilder": run autoreconf/configure/make dist on the project
# sources to produce the aws-ofi-nccl*.tar.gz distribution tarball.
FROM ${BASE_IMAGE} AS distbuilder
ARG ACCELERATOR
ENV ACCELERATOR=${ACCELERATOR}
# Build-time only: keeps apt-get from prompting during package configuration.
ARG DEBIAN_FRONTEND=noninteractive
# "efainstaller" is not defined in this file; presumably a named build context
# that provides /aws-efa-installer. COPY creates the destination directory.
COPY --from=efainstaller /aws-efa-installer /aws-efa-installer
# NOTE(review): the extra positional argument "/efa_installer.sh" passed to
# efa_installer.sh is kept as-is; confirm whether it is intentional.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    bash -c "apt-get update -y && cd /aws-efa-installer && ./efa_installer.sh /efa_installer.sh -n -l -k -d -y && apt-get install -y autoconf automake libtool gcc git libhwloc-dev make && rm -rf /aws-efa-installer"
# COPY sources cannot point outside the build context, so the context root is
# referenced as "." rather than "../".
COPY . /proj
WORKDIR /proj
RUN autoreconf -ivf
RUN ./configure --with-libfabric=/opt/amazon/efa --$(test "$ACCELERATOR" = "cuda" && echo "with-cuda=/usr/local/cuda" || echo "enable-neuron=yes")
RUN make dist

# Export stage: an otherwise empty image that contains only the tarball.
FROM scratch
COPY --from=distbuilder /proj/aws-ofi-nccl*.tar.gz /
18 changes: 18 additions & 0 deletions .docker/Dockerfile.srpm
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Stage "packitimg": Fedora image with packit and mock installed. Package
# manager metadata is cleaned in the same layer that created it.
FROM fedora:rawhide AS packitimg
RUN dnf install -y packit mock && dnf clean all

# Stage "srpm": combine the project sources with the distribution tarball and
# run `packit srpm` in /proj.
FROM packitimg AS srpm
# WORKDIR creates /proj if it does not exist.
WORKDIR /proj
# "src" and "makedist" are not defined in this file; presumably named build
# contexts holding the source tree and the dist tarball respectively.
COPY --from=src . .
COPY --from=makedist . .
RUN packit srpm

# Export stage: an otherwise empty image that contains only the source RPM.
FROM scratch
COPY --from=srpm /proj/*.src.rpm /
42 changes: 42 additions & 0 deletions .docker/Dockerfile.yum
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#
# Copyright (c) 2024, Amazon.com, Inc. or its affiliates. All rights reserved.
#
# See LICENSE.txt for license information
#

# Build arguments shared by the stages below; stages redeclare what they use.
ARG FAMILY=amazonlinux
ARG VERSION=2
ARG VARIANT=cuda
ARG CUDA_DISTRO
ARG AWS_BUILD

# Stage "builder": distro base image with the EFA installer payload applied,
# RPM packaging tools installed and the NVIDIA CUDA repository configured.
FROM ${FAMILY}:${VERSION} AS builder
ARG CUDA_DISTRO
ENV CUDA_DISTRO=${CUDA_DISTRO}
# "efainstaller" is not defined in this file; presumably a named build context
# that provides /aws-efa-installer.
COPY --from=efainstaller / /
# Single layer: install the packaging toolchain, run the EFA installer and
# remove its tree, add the NVIDIA repo for ${CUDA_DISTRO} and the host
# architecture, then update all packages.
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    bash -c "cd /aws-efa-installer \
        && yum install -y gcc rpmdevtools rpmlint yum-utils util-linux \
        && ./efa_installer.sh -n -l -k -d -y \
        && rm -rf /aws-efa-installer" \
    && yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/${CUDA_DISTRO}/$(uname -m)/cuda-${CUDA_DISTRO}.repo \
    && yum update -y
# Create the ~/rpmbuild directory tree used by rpmbuild.
RUN rpmdev-setuptree

# Stage "environment": rebuild the source RPM into binary RPMs.
FROM builder AS environment
ARG VARIANT
ARG AWS_BUILD
ARG TOOLKIT_VERSION=12-6
ENV VARIANT=${VARIANT} \
    AWS_BUILD=${AWS_BUILD} \
    TOOLKIT_VERSION=${TOOLKIT_VERSION}
# "srpm" is not defined in this file; presumably a named build context that
# carries the *.src.rpm.
COPY --from=srpm . .
# Append the rpm macros consumed by the spec file, one per line.
RUN { \
      echo "%with_${VARIANT} 1"; \
      echo "%with_platform_aws ${AWS_BUILD}"; \
      echo "%_cuda_toolkit_version ${TOOLKIT_VERSION}"; \
    } >> ~/.rpmmacros
# Install the CUDA runtime headers, resolve the SRPM's BuildRequires, and
# rebuild it.
RUN --mount=type=cache,target=/var/cache/yum,sharing=locked \
    --mount=type=cache,target=/var/cache/dnf,sharing=locked \
    yum install -y cuda-cudart-devel-${TOOLKIT_VERSION} \
    && yum-builddep -y *.src.rpm \
    && rpmbuild --rebuild *.src.rpm

# Export stage: an otherwise empty image that contains only the built RPMs.
FROM scratch
COPY --from=environment /root/rpmbuild/RPMS/**/* /
82 changes: 11 additions & 71 deletions .github/workflows/distcheck.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,75 +16,13 @@ concurrency:
group: ${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
amazonlinux:
distcheck:
runs-on: ubuntu-22.04
strategy:
matrix:
sdk:
- cuda
amazonlinux:
- al2023
- al2
efainstaller:
- latest
- 1.25.0
include:
- amazonlinux: al2023
efainstallerdir: ALINUX2023
nvidiadistro: fedora37
configmanager: dnf config-manager
cudapackages: cuda-cudart-devel-12-3 cuda-driver-devel-12-3

- amazonlinux: al2
efainstallerdir: ALINUX2
nvidiadistro: rhel7
configmanager: yum-config-manager
cudapackages: cuda-cudart-devel-12-3 cuda-driver-devel-12-3

runs-on: codebuild-ghactions-${{ matrix.amazonlinux }}-${{ github.run_id }}-${{ github.run_attempt }}
name: ${{matrix.amazonlinux}}/${{ matrix.sdk }}/efa@${{ matrix.efainstaller }}/makeinstall
steps:
# note, do not bump to v4: https://github.com/actions/checkout/issues/1590
- uses: actions/checkout@v3
- uses: actions/cache@v3
with:
key: aws-efa-installer-${{ matrix.efainstaller }}
path: aws-efa-installer
- name: Install EFA Dependencies
run: |
cd aws-efa-installer/RPMS/${{ matrix.efainstallerdir }}/x86_64
find . | grep rpm$ | xargs sudo yum -y localinstall
- name: Install hwloc, utilities.
run: |
sudo yum -y install hwloc-devel yum-utils
- name: Install CUDA
run: |
sudo ${{ matrix.configmanager }} --add-repo \
http://developer.download.nvidia.com/compute/cuda/repos/${{ matrix.nvidiadistro }}/x86_64/cuda-${{ matrix.nvidiadistro }}.repo \
--save
sudo yum -y clean expire-cache
sudo yum -y install ${{ matrix.cudapackages }}
- name: Call `autoreconf -ivf`
run: ./autogen.sh

- name: Call `./configure`
run: |
./configure --prefix=/opt/aws-ofi-nccl --with-mpi=/opt/amazon/openmpi \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-tests=no \
--enable-platform-aws
- name: Call `make`
run: make -j

- name: Call `make install`
run: sudo make install

distcheck:
runs-on: ubuntu-22.04
strategy:
matrix:
cc-variant:
- latest
- legacy
Expand All @@ -93,15 +31,15 @@ jobs:
- clang
tracing:
- lttng
- nvtx
- none
sdk:
- cuda
- neuron

include:
- cc-variant: latest
cc: clang
cc-version: 19
cc-version: 20
- cc-variant: latest
cc: gcc
cc-version: 13
Expand Down Expand Up @@ -170,9 +108,9 @@ jobs:
packages: liblttng-ust-dev
version: lttng

- uses: actions/cache@v3
- uses: actions/cache@v4
with:
key: aws-efa-installer
key: aws-efa-installer-latest
path: aws-efa-installer

- name: Install EFA Installer Dependencies
Expand Down Expand Up @@ -231,7 +169,7 @@ jobs:
include:
- cc-variant: latest
cc: clang
cc-version: 19
cc-version: 20
- cc-variant: latest
cc: gcc
cc-version: 13
Expand Down Expand Up @@ -293,10 +231,11 @@ jobs:
packages: aws-neuronx-runtime-lib
version: neuron-packages

- uses: actions/cache@v3
- uses: actions/cache@v4
with:
key: aws-efa-installer-latest
path: aws-efa-installer

- name: Install EFA Installer Dependencies
run: |
pushd aws-efa-installer/
Expand Down Expand Up @@ -390,10 +329,11 @@ jobs:
packages: cppcheck
version: codechecker-cppcheck

- uses: actions/cache@v3
- uses: actions/cache@v4
with:
key: aws-efa-installer-latest
path: aws-efa-installer

- name: Install EFA Installer Dependencies
run: |
pushd aws-efa-installer/
Expand Down
Loading

0 comments on commit 494d0fd

Please sign in to comment.