From 1336a70c001c5cfdca33bc33c13d934f1fffbdb4 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Sat, 18 Mar 2023 12:51:18 +0100 Subject: [PATCH 1/9] {mpi}[GCC/12.2.0] OpenMPI v5.0.0rc10, PMIx v5.0.0rc1 --- .../o/OpenMPI/OpenMPI-5.0.0rc10-GCC-12.2.0.eb | 35 +++++++++++++++ .../p/PMIx/PMIx-5.0.0rc1-GCCcore-12.2.0.eb | 45 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.0rc10-GCC-12.2.0.eb create mode 100644 easybuild/easyconfigs/p/PMIx/PMIx-5.0.0rc1-GCCcore-12.2.0.eb diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.0rc10-GCC-12.2.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.0rc10-GCC-12.2.0.eb new file mode 100644 index 00000000000..88a9dacce86 --- /dev/null +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.0rc10-GCC-12.2.0.eb @@ -0,0 +1,35 @@ +name = 'OpenMPI' +version = '5.0.0rc10' + +homepage = 'https://www.open-mpi.org/' +description = """The Open MPI Project is an open source MPI-3 implementation.""" + +toolchain = {'name': 'GCC', 'version': '12.2.0'} + +source_urls = ['https://www.open-mpi.org/software/ompi/v%(version_major_minor)s/downloads'] +sources = [SOURCELOWER_TAR_BZ2] +checksums = ['fdaf320c5d2a4cc83023e09e4489d86367bb8089b77f46848df5193865313b1d'] + +builddependencies = [ + ('pkgconf', '1.9.3'), + ('Perl', '5.36.0'), + ('Autotools', '20220317'), +] + +dependencies = [ + ('zlib', '1.2.12'), + ('hwloc', '2.8.0'), + ('libevent', '2.1.12'), + ('UCX', '1.13.1'), + ('libfabric', '1.16.1'), + ('PMIx', '5.0.0rc1'), + ('UCC', '1.1.0'), +] + +# disable MPI1 compatibility for now, see what breaks... 
+# configopts += '--enable-mpi1-compatibility ' + +# to enable SLURM integration (site-specific) +# configopts += '--with-slurm --with-pmi=/usr/include/slurm --with-pmi-libdir=/usr' + +moduleclass = 'mpi' diff --git a/easybuild/easyconfigs/p/PMIx/PMIx-5.0.0rc1-GCCcore-12.2.0.eb b/easybuild/easyconfigs/p/PMIx/PMIx-5.0.0rc1-GCCcore-12.2.0.eb new file mode 100644 index 00000000000..c4eef27fded --- /dev/null +++ b/easybuild/easyconfigs/p/PMIx/PMIx-5.0.0rc1-GCCcore-12.2.0.eb @@ -0,0 +1,45 @@ +easyblock = 'ConfigureMake' + +name = 'PMIx' +version = '5.0.0rc1' + +homepage = 'https://pmix.org/' +description = """Process Management for Exascale Environments +PMI Exascale (PMIx) represents an attempt to +provide an extended version of the PMI standard specifically designed +to support clusters up to and including exascale sizes. The overall +objective of the project is not to branch the existing pseudo-standard +definitions - in fact, PMIx fully supports both of the existing PMI-1 +and PMI-2 APIs - but rather to (a) augment and extend those APIs to +eliminate some current restrictions that impact scalability, and (b) +provide a reference implementation of the PMI-server that demonstrates +the desired level of scalability. 
+""" + +toolchain = {'name': 'GCCcore', 'version': '12.2.0'} +toolchainopts = {'pic': True} + +source_urls = ['https://github.com/openpmix/openpmix/releases/download/v%(version)s'] +sources = ['%(namelower)s-%(version)s.tar.bz2'] +checksums = ['9e681a380dee68ff597919ab1a878265ee645ed10b079533918a726ef673b905'] + +builddependencies = [('binutils', '2.39')] + +dependencies = [ + ('libevent', '2.1.12'), + ('zlib', '1.2.12'), + ('hwloc', '2.8.0'), +] + +configopts = ' --with-libevent=$EBROOTLIBEVENT --with-zlib=$EBROOTZLIB' +configopts += ' --with-hwloc=$EBROOTHWLOC' +configopts += ' --enable-pmix-binaries' + +buildopts = 'V=1' + +sanity_check_paths = { + 'files': ['bin/pevent', 'bin/plookup', 'bin/pmix_info', 'bin/pps'], + 'dirs': ['etc', 'include', 'lib', 'share'] +} + +moduleclass = 'lib' From b70202c3f1265cff9878c456a56ccaca0160c04e Mon Sep 17 00:00:00 2001 From: Sebastian Achilles Date: Sat, 20 Jan 2024 15:06:47 +0100 Subject: [PATCH 2/9] bump PMIx and OpenMPI to 5.0.1 and use GCC 13.2.0 --- ...-12.2.0.eb => OpenMPI-5.0.1-GCC-13.2.0.eb} | 22 +++++++++---------- ...12.2.0.eb => PMIx-5.0.1-GCCcore-13.2.0.eb} | 12 +++++----- 2 files changed, 17 insertions(+), 17 deletions(-) rename easybuild/easyconfigs/o/OpenMPI/{OpenMPI-5.0.0rc10-GCC-12.2.0.eb => OpenMPI-5.0.1-GCC-13.2.0.eb} (64%) rename easybuild/easyconfigs/p/PMIx/{PMIx-5.0.0rc1-GCCcore-12.2.0.eb => PMIx-5.0.1-GCCcore-13.2.0.eb} (83%) diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.0rc10-GCC-12.2.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.1-GCC-13.2.0.eb similarity index 64% rename from easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.0rc10-GCC-12.2.0.eb rename to easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.1-GCC-13.2.0.eb index 88a9dacce86..903f8d4c167 100644 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.0rc10-GCC-12.2.0.eb +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.1-GCC-13.2.0.eb @@ -1,29 +1,29 @@ name = 'OpenMPI' -version = '5.0.0rc10' +version = '5.0.1' homepage = 
'https://www.open-mpi.org/' description = """The Open MPI Project is an open source MPI-3 implementation.""" -toolchain = {'name': 'GCC', 'version': '12.2.0'} +toolchain = {'name': 'GCC', 'version': '13.2.0'} source_urls = ['https://www.open-mpi.org/software/ompi/v%(version_major_minor)s/downloads'] sources = [SOURCELOWER_TAR_BZ2] -checksums = ['fdaf320c5d2a4cc83023e09e4489d86367bb8089b77f46848df5193865313b1d'] +checksums = ['e357043e65fd1b956a47d0dae6156a90cf0e378df759364936c1781f1a25ef80'] builddependencies = [ - ('pkgconf', '1.9.3'), - ('Perl', '5.36.0'), + ('pkgconf', '2.0.3'), + ('Perl', '5.38.0'), ('Autotools', '20220317'), ] dependencies = [ - ('zlib', '1.2.12'), - ('hwloc', '2.8.0'), + ('zlib', '1.2.13'), + ('hwloc', '2.9.2'), ('libevent', '2.1.12'), - ('UCX', '1.13.1'), - ('libfabric', '1.16.1'), - ('PMIx', '5.0.0rc1'), - ('UCC', '1.1.0'), + ('UCX', '1.15.0'), + ('libfabric', '1.19.0'), + ('PMIx', '5.0.1'), + ('UCC', '1.2.0'), ] # disable MPI1 compatibility for now, see what breaks... diff --git a/easybuild/easyconfigs/p/PMIx/PMIx-5.0.0rc1-GCCcore-12.2.0.eb b/easybuild/easyconfigs/p/PMIx/PMIx-5.0.1-GCCcore-13.2.0.eb similarity index 83% rename from easybuild/easyconfigs/p/PMIx/PMIx-5.0.0rc1-GCCcore-12.2.0.eb rename to easybuild/easyconfigs/p/PMIx/PMIx-5.0.1-GCCcore-13.2.0.eb index c4eef27fded..fc786b047a8 100644 --- a/easybuild/easyconfigs/p/PMIx/PMIx-5.0.0rc1-GCCcore-12.2.0.eb +++ b/easybuild/easyconfigs/p/PMIx/PMIx-5.0.1-GCCcore-13.2.0.eb @@ -1,7 +1,7 @@ easyblock = 'ConfigureMake' name = 'PMIx' -version = '5.0.0rc1' +version = '5.0.1' homepage = 'https://pmix.org/' description = """Process Management for Exascale Environments @@ -16,19 +16,19 @@ provide a reference implementation of the PMI-server that demonstrates the desired level of scalability. 
""" -toolchain = {'name': 'GCCcore', 'version': '12.2.0'} +toolchain = {'name': 'GCCcore', 'version': '13.2.0'} toolchainopts = {'pic': True} source_urls = ['https://github.com/openpmix/openpmix/releases/download/v%(version)s'] sources = ['%(namelower)s-%(version)s.tar.bz2'] -checksums = ['9e681a380dee68ff597919ab1a878265ee645ed10b079533918a726ef673b905'] +checksums = ['d4371792d4ba4c791e1010100b4bf9a65500ababaf5ff25d681f938527a67d4a'] -builddependencies = [('binutils', '2.39')] +builddependencies = [('binutils', '2.40')] dependencies = [ ('libevent', '2.1.12'), - ('zlib', '1.2.12'), - ('hwloc', '2.8.0'), + ('zlib', '1.2.13'), + ('hwloc', '2.9.2'), ] configopts = ' --with-libevent=$EBROOTLIBEVENT --with-zlib=$EBROOTZLIB' From 0793144ec74eb61b239fb88f3053efea32262b08 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 15 Feb 2024 15:42:04 +0000 Subject: [PATCH 3/9] Bump Open MPI to 5.0.2 and add internal CUDA patch This patch has changed since libcuda is no longer dlopen()'ed by Open MPI. Instead we can generate a stub library, and at runtime the CUDA-dependent DSO's (but not the main libmpi.so library) load libcuda.so. 
This is then consistent with https://docs.open-mpi.org/en/v5.0.x/tuning-apps/networking/cuda.html (but --enable-mca-dso= is done by default already) --- ...-13.2.0.eb => OpenMPI-5.0.2-GCC-13.2.0.eb} | 14 +- ....0.2_build-with-internal-cuda-header.patch | 139 ++++++++++++++++++ 2 files changed, 150 insertions(+), 3 deletions(-) rename easybuild/easyconfigs/o/OpenMPI/{OpenMPI-5.0.1-GCC-13.2.0.eb => OpenMPI-5.0.2-GCC-13.2.0.eb} (58%) create mode 100644 easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2_build-with-internal-cuda-header.patch diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.1-GCC-13.2.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb similarity index 58% rename from easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.1-GCC-13.2.0.eb rename to easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb index 903f8d4c167..e291e3e2c26 100644 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.1-GCC-13.2.0.eb +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb @@ -1,5 +1,5 @@ name = 'OpenMPI' -version = '5.0.1' +version = '5.0.2' homepage = 'https://www.open-mpi.org/' description = """The Open MPI Project is an open source MPI-3 implementation.""" @@ -8,11 +8,15 @@ toolchain = {'name': 'GCC', 'version': '13.2.0'} source_urls = ['https://www.open-mpi.org/software/ompi/v%(version_major_minor)s/downloads'] sources = [SOURCELOWER_TAR_BZ2] -checksums = ['e357043e65fd1b956a47d0dae6156a90cf0e378df759364936c1781f1a25ef80'] +patches = [('OpenMPI-5.0.2_build-with-internal-cuda-header.patch', 1)] +checksums = [ + {'openmpi-5.0.2.tar.bz2': 'ee46ad8eeee2c3ff70772160bff877cbf38c330a0bc3b3ddc811648b3396698f'}, + {'OpenMPI-5.0.2_build-with-internal-cuda-header.patch': + 'f52dc470543f35efef10d651dd159c771ae25f8f76a420d20d87abf4dc769ed7'}, +] builddependencies = [ ('pkgconf', '2.0.3'), - ('Perl', '5.38.0'), ('Autotools', '20220317'), ] @@ -26,6 +30,10 @@ dependencies = [ ('UCC', '1.2.0'), ] +# CUDA related patches and custom configure option can be 
removed if CUDA support isn't wanted. +preconfigopts = 'gcc -Iopal/mca/cuda/include -shared opal/mca/cuda/lib/cuda.c -o opal/mca/cuda/lib/libcuda.so && ' +configopts = '--with-cuda=%(start_dir)s/opal/mca/cuda ' + # disable MPI1 compatibility for now, see what breaks... # configopts += '--enable-mpi1-compatibility ' diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2_build-with-internal-cuda-header.patch b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2_build-with-internal-cuda-header.patch new file mode 100644 index 00000000000..2d935fda64a --- /dev/null +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2_build-with-internal-cuda-header.patch @@ -0,0 +1,139 @@ +Allow building Open MPI with an internal CUDA header and stub library via +--with-cuda=%(start_dir)s/opal/mca/cuda +by providing an internal minimal cuda.h header file, and function stubs. +This eliminates the CUDA (build)dependency; as long as the runtime CUDA version is 8.0+, +the system's libcuda.so will be used successfully by dynamically loaded plugins in +$EBROOTOPENMPI/lib/openmpi, not by the main libmpi.so. 
+ +Author: Bart Oldeman +diff -urN openmpi-5.0.2.orig/opal/mca/cuda/cuda.c openmpi-5.0.2/opal/mca/cuda/cuda.c +--- openmpi-5.0.2.orig/opal/mca/cuda/lib/cuda.c 1970-01-01 00:00:00.000000000 +0000 ++++ openmpi-5.0.2/opal/mca/cuda/lib/cuda.c 2024-02-15 01:39:24.969142045 +0000 +@@ -0,0 +1,28 @@ ++#include "cuda.h" ++ ++CUresult cuPointerGetAttribute(void *, CUpointer_attribute, CUdeviceptr) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuMemcpyAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuMemAlloc(CUdeviceptr *, size_t) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuMemFree(CUdeviceptr buf) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuCtxGetCurrent(void *cuContext) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuStreamCreate(CUstream *, int) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuEventCreate(CUevent *, int) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuEventRecord(CUevent, CUstream) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuEventQuery(CUevent) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuEventDestroy(CUevent) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuMemHostRegister(void *, size_t, unsigned int) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuMemHostUnregister(void *) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuMemGetAddressRange(CUdeviceptr *, size_t *, CUdeviceptr) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuIpcGetEventHandle(CUipcEventHandle *, CUevent) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuIpcOpenEventHandle(CUevent *, CUipcEventHandle) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuIpcOpenMemHandle(CUdeviceptr *, CUipcMemHandle, unsigned int) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuIpcCloseMemHandle(CUdeviceptr) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuIpcGetMemHandle(CUipcMemHandle *, CUdeviceptr) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuCtxGetDevice(CUdevice *) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice) { return CUDA_ERROR_UNKNOWN; } ++CUresult 
cuCtxSetCurrent(CUcontext) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuStreamSynchronize(CUstream) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuStreamDestroy(CUstream) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuPointerSetAttribute(const void *, CUpointer_attribute, CUdeviceptr) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuDeviceGetPCIBusId(char*, int, CUdevice) { return CUDA_ERROR_UNKNOWN; } ++CUresult cuPointerGetAttributes(unsigned int, CUpointer_attribute *, void **, CUdeviceptr) { return CUDA_ERROR_UNKNOWN; } +diff -urN openmpi-5.0.2.orig/opal/mca/cuda/include/cuda.h openmpi-5.0.2/opal/mca/cuda/include/cuda.h +--- openmpi-5.0.2.orig/opal/mca/cuda/include/cuda.h 1970-01-01 00:00:00.000000000 +0000 ++++ openmpi-5.0.2/opal/mca/cuda/include/cuda.h 2024-02-15 03:07:26.480531383 +0000 +@@ -0,0 +1,95 @@ ++/* This header provides minimal parts of the CUDA Driver API, without having to ++ rely on the proprietary CUDA toolkit. ++ ++ References (to avoid copying from NVidia's proprietary cuda.h): ++ https://github.com/gcc-mirror/gcc/blob/master/include/cuda/cuda.h ++ https://github.com/Theano/libgpuarray/blob/master/src/loaders/libcuda.h ++ https://github.com/CPFL/gdev/blob/master/cuda/driver/cuda.h ++ https://github.com/CudaWrangler/cuew/blob/master/include/cuew.h ++*/ ++ ++#ifndef OMPI_CUDA_H ++#define OMPI_CUDA_H ++ ++#include <stddef.h> ++ ++#define CUDA_VERSION 8000 ++ ++typedef void *CUcontext; ++typedef int CUdevice; ++#if defined(__LP64__) || defined(_WIN64) ++typedef unsigned long long CUdeviceptr; ++#else ++typedef unsigned CUdeviceptr; ++#endif ++typedef void *CUevent; ++typedef void *CUstream; ++ ++typedef enum { ++ CUDA_SUCCESS = 0, ++ CUDA_ERROR_INVALID_VALUE = 1, ++ CUDA_ERROR_NOT_INITIALIZED = 3, ++ CUDA_ERROR_DEINITIALIZED = 4, ++ CUDA_ERROR_ALREADY_MAPPED = 208, ++ CUDA_ERROR_NOT_READY = 600, ++ CUDA_ERROR_UNKNOWN = 999, ++} CUresult; ++ ++enum { ++ CU_EVENT_DISABLE_TIMING = 0x2, ++ CU_EVENT_INTERPROCESS = 0x4, ++}; ++ ++enum { ++ CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS 
= 0x1, ++}; ++ ++typedef enum { ++ CU_POINTER_ATTRIBUTE_CONTEXT = 1, ++ CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, ++ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, ++ CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, ++ CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, ++} CUpointer_attribute; ++ ++typedef enum { ++ CU_MEMORYTYPE_HOST = 0x01, ++} CUmemorytype; ++ ++#define CU_IPC_HANDLE_SIZE 64 ++typedef struct CUipcEventHandle_st { ++ char reserved[CU_IPC_HANDLE_SIZE]; ++} CUipcEventHandle; ++ ++typedef struct CUipcMemHandle_st { ++ char reserved[CU_IPC_HANDLE_SIZE]; ++} CUipcMemHandle; ++ ++CUresult cuPointerGetAttribute(void *, CUpointer_attribute, CUdeviceptr); ++CUresult cuMemcpyAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream); ++CUresult cuMemAlloc(CUdeviceptr *, size_t); ++CUresult cuMemFree(CUdeviceptr buf); ++CUresult cuCtxGetCurrent(void *cuContext); ++CUresult cuStreamCreate(CUstream *, int); ++CUresult cuEventCreate(CUevent *, int); ++CUresult cuEventRecord(CUevent, CUstream); ++CUresult cuEventQuery(CUevent); ++CUresult cuEventDestroy(CUevent); ++CUresult cuMemHostRegister(void *, size_t, unsigned int); ++CUresult cuMemHostUnregister(void *); ++CUresult cuMemGetAddressRange(CUdeviceptr *, size_t *, CUdeviceptr); ++CUresult cuIpcGetEventHandle(CUipcEventHandle *, CUevent); ++CUresult cuIpcOpenEventHandle(CUevent *, CUipcEventHandle); ++CUresult cuIpcOpenMemHandle(CUdeviceptr *, CUipcMemHandle, unsigned int); ++CUresult cuIpcCloseMemHandle(CUdeviceptr); ++CUresult cuIpcGetMemHandle(CUipcMemHandle *, CUdeviceptr); ++CUresult cuCtxGetDevice(CUdevice *); ++CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice); ++CUresult cuCtxSetCurrent(CUcontext); ++CUresult cuStreamSynchronize(CUstream); ++CUresult cuStreamDestroy(CUstream); ++CUresult cuPointerSetAttribute(const void *, CUpointer_attribute, CUdeviceptr); ++CUresult cuDeviceGetPCIBusId(char*, int, CUdevice); ++CUresult cuPointerGetAttributes(unsigned int, CUpointer_attribute *, void **, CUdeviceptr); ++ ++#endif From 
2d2f1638c3aa24d45b7a8435c842277b6488e533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bob=20Dr=C3=B6ge?= Date: Tue, 20 Feb 2024 14:33:12 +0100 Subject: [PATCH 4/9] add patch based on https://github.com/open-mpi/ompi/pull/12343 --- .../OpenMPI-5.0.x_add_atomic_wmb.patch | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.x_add_atomic_wmb.patch diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.x_add_atomic_wmb.patch b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.x_add_atomic_wmb.patch new file mode 100644 index 00000000000..044ec75cdb5 --- /dev/null +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.x_add_atomic_wmb.patch @@ -0,0 +1,35 @@ +From 0ebea598a5023200bcae8a647478e2f297bc1a41 Mon Sep 17 00:00:00 2001 +From: Luke Robison +Date: Wed, 14 Feb 2024 21:14:29 +0000 +Subject: [PATCH] btl/smcuda: Add atomic_wmb() before sm_fifo_write + +This change fixes https://github.com/open-mpi/ompi/issues/12270 + +Testing on c7g instance type (arm64) confirms this change elminates +hangs and crashes that were previously observed in 1 in 30 runs of +IMB alltoall benchmark. Tested with over 300 runs and no failures. + +The write memory barrier prevents other CPUs from observing the fifo +get updated before they observe the updated contents of the header +itself. Without the barrier, uninitialized header contents caused +the crashes and invalid data. 
+ +Signed-off-by: Luke Robison +(cherry picked from commit 71f378d28cb89dd80379dbad570849b297594cde) +--- + opal/mca/btl/smcuda/btl_smcuda_fifo.h | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/opal/mca/btl/smcuda/btl_smcuda_fifo.h b/opal/mca/btl/smcuda/btl_smcuda_fifo.h +index ca1257b5c56..56369dba9b3 100644 +--- a/opal/mca/btl/smcuda/btl_smcuda_fifo.h ++++ b/opal/mca/btl/smcuda/btl_smcuda_fifo.h +@@ -85,6 +85,8 @@ static void add_pending(struct mca_btl_base_endpoint_t *ep, void *data, bool res + #define MCA_BTL_SMCUDA_FIFO_WRITE(endpoint_peer, my_smp_rank, peer_smp_rank, hdr, resend, \ + retry_pending_sends, rc) \ + do { \ ++ /* memory barrier: ensure writes to the hdr have completed */ \ ++ opal_atomic_wmb(); \ + sm_fifo_t *fifo = &(mca_btl_smcuda_component.fifo[peer_smp_rank][FIFO_MAP(my_smp_rank)]); \ + \ + if (retry_pending_sends) { \ From 85e836b0600c9033052ed3e2f95ceca66472a490 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bob=20Dr=C3=B6ge?= Date: Tue, 20 Feb 2024 14:33:27 +0100 Subject: [PATCH 5/9] add OpenMPI-5.0.x_add_atomic_wmb.patch --- easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb index e291e3e2c26..956b36afc2e 100644 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb @@ -8,11 +8,15 @@ toolchain = {'name': 'GCC', 'version': '13.2.0'} source_urls = ['https://www.open-mpi.org/software/ompi/v%(version_major_minor)s/downloads'] sources = [SOURCELOWER_TAR_BZ2] -patches = [('OpenMPI-5.0.2_build-with-internal-cuda-header.patch', 1)] +patches = [ + ('OpenMPI-5.0.2_build-with-internal-cuda-header.patch', 1), + 'OpenMPI-5.0.x_add_atomic_wmb.patch' +] checksums = [ {'openmpi-5.0.2.tar.bz2': 'ee46ad8eeee2c3ff70772160bff877cbf38c330a0bc3b3ddc811648b3396698f'}, 
{'OpenMPI-5.0.2_build-with-internal-cuda-header.patch': 'f52dc470543f35efef10d651dd159c771ae25f8f76a420d20d87abf4dc769ed7'}, + {'OpenMPI-5.0.x_add_atomic_wmb.patch': '23989c1998bd89c64b23e4fc101aa68748543c90f3c79bdedda38a5933a5ef44'}, ] builddependencies = [ From 2b3403424f2dabff10d6b32a7a2dfb4220f35ed3 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Tue, 30 Apr 2024 15:22:23 +0000 Subject: [PATCH 6/9] Bump to OpenMPI to 5.0.3, PMIx to 5.0.2 `OpenMPI-5.0.x_add_atomic_wmb.patch` is obsolete now --- ...-13.2.0.eb => OpenMPI-5.0.3-GCC-13.2.0.eb} | 12 +++---- .../OpenMPI-5.0.x_add_atomic_wmb.patch | 35 ------------------- ...13.2.0.eb => PMIx-5.0.2-GCCcore-13.2.0.eb} | 4 +-- 3 files changed, 6 insertions(+), 45 deletions(-) rename easybuild/easyconfigs/o/OpenMPI/{OpenMPI-5.0.2-GCC-13.2.0.eb => OpenMPI-5.0.3-GCC-13.2.0.eb} (76%) delete mode 100644 easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.x_add_atomic_wmb.patch rename easybuild/easyconfigs/p/PMIx/{PMIx-5.0.1-GCCcore-13.2.0.eb => PMIx-5.0.2-GCCcore-13.2.0.eb} (93%) diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.2.0.eb similarity index 76% rename from easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb rename to easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.2.0.eb index 956b36afc2e..dc7d83d6230 100644 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.2-GCC-13.2.0.eb +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.2.0.eb @@ -1,5 +1,5 @@ name = 'OpenMPI' -version = '5.0.2' +version = '5.0.3' homepage = 'https://www.open-mpi.org/' description = """The Open MPI Project is an open source MPI-3 implementation.""" @@ -8,15 +8,11 @@ toolchain = {'name': 'GCC', 'version': '13.2.0'} source_urls = ['https://www.open-mpi.org/software/ompi/v%(version_major_minor)s/downloads'] sources = [SOURCELOWER_TAR_BZ2] -patches = [ - ('OpenMPI-5.0.2_build-with-internal-cuda-header.patch', 1), - 'OpenMPI-5.0.x_add_atomic_wmb.patch' -] 
+patches = [('OpenMPI-5.0.2_build-with-internal-cuda-header.patch', 1)] checksums = [ - {'openmpi-5.0.2.tar.bz2': 'ee46ad8eeee2c3ff70772160bff877cbf38c330a0bc3b3ddc811648b3396698f'}, + {'openmpi-5.0.3.tar.bz2': '990582f206b3ab32e938aa31bbf07c639368e4405dca196fabe7f0f76eeda90b'}, {'OpenMPI-5.0.2_build-with-internal-cuda-header.patch': 'f52dc470543f35efef10d651dd159c771ae25f8f76a420d20d87abf4dc769ed7'}, - {'OpenMPI-5.0.x_add_atomic_wmb.patch': '23989c1998bd89c64b23e4fc101aa68748543c90f3c79bdedda38a5933a5ef44'}, ] builddependencies = [ @@ -30,7 +26,7 @@ dependencies = [ ('libevent', '2.1.12'), ('UCX', '1.15.0'), ('libfabric', '1.19.0'), - ('PMIx', '5.0.1'), + ('PMIx', '5.0.2'), ('UCC', '1.2.0'), ] diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.x_add_atomic_wmb.patch b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.x_add_atomic_wmb.patch deleted file mode 100644 index 044ec75cdb5..00000000000 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.x_add_atomic_wmb.patch +++ /dev/null @@ -1,35 +0,0 @@ -From 0ebea598a5023200bcae8a647478e2f297bc1a41 Mon Sep 17 00:00:00 2001 -From: Luke Robison -Date: Wed, 14 Feb 2024 21:14:29 +0000 -Subject: [PATCH] btl/smcuda: Add atomic_wmb() before sm_fifo_write - -This change fixes https://github.com/open-mpi/ompi/issues/12270 - -Testing on c7g instance type (arm64) confirms this change elminates -hangs and crashes that were previously observed in 1 in 30 runs of -IMB alltoall benchmark. Tested with over 300 runs and no failures. - -The write memory barrier prevents other CPUs from observing the fifo -get updated before they observe the updated contents of the header -itself. Without the barrier, uninitialized header contents caused -the crashes and invalid data. 
- -Signed-off-by: Luke Robison -(cherry picked from commit 71f378d28cb89dd80379dbad570849b297594cde) ---- - opal/mca/btl/smcuda/btl_smcuda_fifo.h | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/opal/mca/btl/smcuda/btl_smcuda_fifo.h b/opal/mca/btl/smcuda/btl_smcuda_fifo.h -index ca1257b5c56..56369dba9b3 100644 ---- a/opal/mca/btl/smcuda/btl_smcuda_fifo.h -+++ b/opal/mca/btl/smcuda/btl_smcuda_fifo.h -@@ -85,6 +85,8 @@ static void add_pending(struct mca_btl_base_endpoint_t *ep, void *data, bool res - #define MCA_BTL_SMCUDA_FIFO_WRITE(endpoint_peer, my_smp_rank, peer_smp_rank, hdr, resend, \ - retry_pending_sends, rc) \ - do { \ -+ /* memory barrier: ensure writes to the hdr have completed */ \ -+ opal_atomic_wmb(); \ - sm_fifo_t *fifo = &(mca_btl_smcuda_component.fifo[peer_smp_rank][FIFO_MAP(my_smp_rank)]); \ - \ - if (retry_pending_sends) { \ diff --git a/easybuild/easyconfigs/p/PMIx/PMIx-5.0.1-GCCcore-13.2.0.eb b/easybuild/easyconfigs/p/PMIx/PMIx-5.0.2-GCCcore-13.2.0.eb similarity index 93% rename from easybuild/easyconfigs/p/PMIx/PMIx-5.0.1-GCCcore-13.2.0.eb rename to easybuild/easyconfigs/p/PMIx/PMIx-5.0.2-GCCcore-13.2.0.eb index fc786b047a8..b4059ce4241 100644 --- a/easybuild/easyconfigs/p/PMIx/PMIx-5.0.1-GCCcore-13.2.0.eb +++ b/easybuild/easyconfigs/p/PMIx/PMIx-5.0.2-GCCcore-13.2.0.eb @@ -1,7 +1,7 @@ easyblock = 'ConfigureMake' name = 'PMIx' -version = '5.0.1' +version = '5.0.2' homepage = 'https://pmix.org/' description = """Process Management for Exascale Environments @@ -21,7 +21,7 @@ toolchainopts = {'pic': True} source_urls = ['https://github.com/openpmix/openpmix/releases/download/v%(version)s'] sources = ['%(namelower)s-%(version)s.tar.bz2'] -checksums = ['d4371792d4ba4c791e1010100b4bf9a65500ababaf5ff25d681f938527a67d4a'] +checksums = ['28227ff2ba925da2c3fece44502f23a91446017de0f5a58f5cea9370c514b83c'] builddependencies = [('binutils', '2.40')] From 22406435745d1b8ace0ddf3e690e3897a7f365fa Mon Sep 17 00:00:00 2001 From: Sebastian Achilles Date: 
Wed, 22 May 2024 20:42:29 +0200 Subject: [PATCH 7/9] bump PMIx and OpenMPI to GCC 13.3.0 --- ...GCC-13.2.0.eb => OpenMPI-5.0.3-GCC-13.3.0.eb} | 16 ++++++++-------- ...re-13.2.0.eb => PMIx-5.0.2-GCCcore-13.3.0.eb} | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) rename easybuild/easyconfigs/o/OpenMPI/{OpenMPI-5.0.3-GCC-13.2.0.eb => OpenMPI-5.0.3-GCC-13.3.0.eb} (84%) rename easybuild/easyconfigs/p/PMIx/{PMIx-5.0.2-GCCcore-13.2.0.eb => PMIx-5.0.2-GCCcore-13.3.0.eb} (90%) diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.2.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb similarity index 84% rename from easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.2.0.eb rename to easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb index dc7d83d6230..107540dad88 100644 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.2.0.eb +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb @@ -4,7 +4,7 @@ version = '5.0.3' homepage = 'https://www.open-mpi.org/' description = """The Open MPI Project is an open source MPI-3 implementation.""" -toolchain = {'name': 'GCC', 'version': '13.2.0'} +toolchain = {'name': 'GCC', 'version': '13.3.0'} source_urls = ['https://www.open-mpi.org/software/ompi/v%(version_major_minor)s/downloads'] sources = [SOURCELOWER_TAR_BZ2] @@ -16,18 +16,18 @@ checksums = [ ] builddependencies = [ - ('pkgconf', '2.0.3'), - ('Autotools', '20220317'), + ('pkgconf', '2.2.0'), + ('Autotools', '20231222'), ] dependencies = [ - ('zlib', '1.2.13'), - ('hwloc', '2.9.2'), + ('zlib', '1.3.1'), + ('hwloc', '2.10.0'), ('libevent', '2.1.12'), - ('UCX', '1.15.0'), - ('libfabric', '1.19.0'), + ('UCX', '1.16.0'), + ('libfabric', '1.21.0'), ('PMIx', '5.0.2'), - ('UCC', '1.2.0'), + ('UCC', '1.3.0'), ] # CUDA related patches and custom configure option can be removed if CUDA support isn't wanted. 
diff --git a/easybuild/easyconfigs/p/PMIx/PMIx-5.0.2-GCCcore-13.2.0.eb b/easybuild/easyconfigs/p/PMIx/PMIx-5.0.2-GCCcore-13.3.0.eb similarity index 90% rename from easybuild/easyconfigs/p/PMIx/PMIx-5.0.2-GCCcore-13.2.0.eb rename to easybuild/easyconfigs/p/PMIx/PMIx-5.0.2-GCCcore-13.3.0.eb index b4059ce4241..1d6208ff9f0 100644 --- a/easybuild/easyconfigs/p/PMIx/PMIx-5.0.2-GCCcore-13.2.0.eb +++ b/easybuild/easyconfigs/p/PMIx/PMIx-5.0.2-GCCcore-13.3.0.eb @@ -16,19 +16,19 @@ provide a reference implementation of the PMI-server that demonstrates the desired level of scalability. """ -toolchain = {'name': 'GCCcore', 'version': '13.2.0'} +toolchain = {'name': 'GCCcore', 'version': '13.3.0'} toolchainopts = {'pic': True} source_urls = ['https://github.com/openpmix/openpmix/releases/download/v%(version)s'] sources = ['%(namelower)s-%(version)s.tar.bz2'] checksums = ['28227ff2ba925da2c3fece44502f23a91446017de0f5a58f5cea9370c514b83c'] -builddependencies = [('binutils', '2.40')] +builddependencies = [('binutils', '2.42')] dependencies = [ ('libevent', '2.1.12'), - ('zlib', '1.2.13'), - ('hwloc', '2.9.2'), + ('zlib', '1.3.1'), + ('hwloc', '2.10.0'), ] configopts = ' --with-libevent=$EBROOTLIBEVENT --with-zlib=$EBROOTZLIB' From 0b42d077909be31feda31e07084ae56512abc4ce Mon Sep 17 00:00:00 2001 From: Sebastian Achilles Date: Tue, 28 May 2024 19:00:57 +0200 Subject: [PATCH 8/9] remove outdated comment about Slurm support and add --with-show-load-errors=no in OpenMPI-5.0.3-GCC-13.3.0.eb --- easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb index 107540dad88..81e9279e4bf 100644 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb @@ -32,12 +32,9 @@ dependencies = [ # CUDA related patches and custom configure 
option can be removed if CUDA support isn't wanted. preconfigopts = 'gcc -Iopal/mca/cuda/include -shared opal/mca/cuda/lib/cuda.c -o opal/mca/cuda/lib/libcuda.so && ' -configopts = '--with-cuda=%(start_dir)s/opal/mca/cuda ' +configopts = '--with-cuda=%(start_dir)s/opal/mca/cuda --with-show-load-errors=no ' # disable MPI1 compatibility for now, see what breaks... # configopts += '--enable-mpi1-compatibility ' -# to enable SLURM integration (site-specific) -# configopts += '--with-slurm --with-pmi=/usr/include/slurm --with-pmi-libdir=/usr' - moduleclass = 'mpi' From 2fb792e55dc308985e9d0c836fa660d64d304ead Mon Sep 17 00:00:00 2001 From: Sebastian Achilles Date: Tue, 28 May 2024 20:45:37 +0200 Subject: [PATCH 9/9] remove comment about MPI1 compatibility in OpenMPI-5.0.3-GCC-13.3.0.eb --- easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb | 3 --- 1 file changed, 3 deletions(-) diff --git a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb index 81e9279e4bf..7e11bfb3000 100644 --- a/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb +++ b/easybuild/easyconfigs/o/OpenMPI/OpenMPI-5.0.3-GCC-13.3.0.eb @@ -34,7 +34,4 @@ dependencies = [ preconfigopts = 'gcc -Iopal/mca/cuda/include -shared opal/mca/cuda/lib/cuda.c -o opal/mca/cuda/lib/libcuda.so && ' configopts = '--with-cuda=%(start_dir)s/opal/mca/cuda --with-show-load-errors=no ' -# disable MPI1 compatibility for now, see what breaks... -# configopts += '--enable-mpi1-compatibility ' - moduleclass = 'mpi'