From 454d7614b6e0edee23597abae8c62c4c02a91f32 Mon Sep 17 00:00:00 2001 From: Yanfei Guo Date: Thu, 9 Jun 2022 17:14:40 -0500 Subject: [PATCH 1/2] Add the missing SHMEM_CTX_INVALID --- include/shmem.h.in.tpl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/shmem.h.in.tpl b/include/shmem.h.in.tpl index 74d13f6..30fca72 100644 --- a/include/shmem.h.in.tpl +++ b/include/shmem.h.in.tpl @@ -91,6 +91,8 @@ typedef struct { #define SHMEM_TEAM_SHARED (shmem_team_t) 0x90001 #define SHMEM_TEAM_INVALID NULL +#define SHMEM_TEAM_NUM_CONTEXTS 0x091001L + /* SHMEM malloc hints */ #define SHMEM_MALLOC_ATOMICS_REMOTE 0x002001L #define SHMEM_MALLOC_SIGNAL_REMOTE 0x002002L @@ -99,6 +101,7 @@ typedef struct { #define SHMEM_CTX_SERIALIZED 0x001001L #define SHMEM_CTX_PRIVATE 0x001002L #define SHMEM_CTX_NOSTORE 0x001003L +#define SHMEM_CTX_INVALID ((shmem_ctx_t) -1L) typedef void* shmem_ctx_t; #define SHMEM_CTX_DEFAULT (shmem_ctx_t) 0x80000 From 4d140e8d1b568db0862c225c6e5d6830b75f6f3f Mon Sep 17 00:00:00 2001 From: Yanfei Guo Date: Mon, 6 Jun 2022 15:20:04 -0500 Subject: [PATCH 2/2] Add team-based collective and deprecate active-set-based ones --- autogen.sh | 116 +++++-- include/Makefile.mk | 10 +- include/shmem.h.in.tpl | 52 ++- include/shmem_coll_typed.h.tpl | 69 ++++ ...pl => shmem_reduce_bitws_aset_typed.h.tpl} | 3 +- include/shmem_reduce_bitws_team_typed.h.tpl | 43 +++ ...l => shmem_reduce_minmax_aset_typed.h.tpl} | 3 +- include/shmem_reduce_minmax_team_typed.h.tpl | 31 ++ ...
=> shmem_reduce_sumprod_aset_typed.h.tpl} | 3 +- include/shmem_reduce_sumprod_team_typed.h.tpl | 31 ++ maint/coll_typedef.txt | 25 ++ ...edef.txt => reduce_bitws_aset_typedef.txt} | 0 maint/reduce_bitws_team_typedef.txt | 15 + ...def.txt => reduce_minmax_aset_typedef.txt} | 0 maint/reduce_minmax_team_typedef.txt | 25 ++ ...ef.txt => reduce_sumprod_aset_typedef.txt} | 0 maint/reduce_sumprod_team_typedef.txt | 27 ++ src/include/oshmpi_impl.h | 36 +- src/internal/Makefile.mk | 1 + src/internal/coll_activeset_impl.h | 315 ++++++++++++++++++ src/internal/coll_impl.h | 276 +++------------ src/shmem/Makefile.mk | 22 +- src/shmem/coll.c | 87 ++--- src/shmem/coll_activeset.c | 82 +++++ src/shmem/coll_typed.c.tpl | 50 +++ ...ed.c.tpl => reduce_bitws_aset_typed.c.tpl} | 4 +- src/shmem/reduce_bitws_team_typed.c.tpl | 36 ++ ...d.c.tpl => reduce_minmax_aset_typed.c.tpl} | 4 +- src/shmem/reduce_minmax_team_typed.c.tpl | 28 ++ ....c.tpl => reduce_sumprod_aset_typed.c.tpl} | 4 +- src/shmem/reduce_sumprod_team_typed.c.tpl | 28 ++ 31 files changed, 1070 insertions(+), 356 deletions(-) create mode 100644 include/shmem_coll_typed.h.tpl rename include/{shmem_reduce_bitws_typed.h.tpl => shmem_reduce_bitws_aset_typed.h.tpl} (93%) create mode 100644 include/shmem_reduce_bitws_team_typed.h.tpl rename include/{shmem_reduce_minmax_typed.h.tpl => shmem_reduce_minmax_aset_typed.h.tpl} (91%) create mode 100644 include/shmem_reduce_minmax_team_typed.h.tpl rename include/{shmem_reduce_sumprod_typed.h.tpl => shmem_reduce_sumprod_aset_typed.h.tpl} (91%) create mode 100644 include/shmem_reduce_sumprod_team_typed.h.tpl create mode 100644 maint/coll_typedef.txt rename maint/{reduce_bitws_typedef.txt => reduce_bitws_aset_typedef.txt} (100%) create mode 100644 maint/reduce_bitws_team_typedef.txt rename maint/{reduce_maxmin_typedef.txt => reduce_minmax_aset_typedef.txt} (100%) create mode 100644 maint/reduce_minmax_team_typedef.txt rename maint/{reduce_sumprod_typedef.txt => 
reduce_sumprod_aset_typedef.txt} (100%) create mode 100644 maint/reduce_sumprod_team_typedef.txt create mode 100644 src/internal/coll_activeset_impl.h create mode 100644 src/shmem/coll_activeset.c create mode 100644 src/shmem/coll_typed.c.tpl rename src/shmem/{reduce_bitws_typed.c.tpl => reduce_bitws_aset_typed.c.tpl} (94%) create mode 100644 src/shmem/reduce_bitws_team_typed.c.tpl rename src/shmem/{reduce_minmax_typed.c.tpl => reduce_minmax_aset_typed.c.tpl} (92%) create mode 100644 src/shmem/reduce_minmax_team_typed.c.tpl rename src/shmem/{reduce_sumprod_typed.c.tpl => reduce_sumprod_aset_typed.c.tpl} (92%) create mode 100644 src/shmem/reduce_sumprod_team_typed.c.tpl diff --git a/autogen.sh b/autogen.sh index 09e875a..20564ba 100755 --- a/autogen.sh +++ b/autogen.sh @@ -140,21 +140,44 @@ insert_file_by_key "SHMEM_AMO_BITWS_TYPED_H start" ./include/shmem_amo_bitws_typ echo "-- inserted SHMEM_AMO_BITWS_TYPED_H in include/shmem.h.in" echo "" -echo "Generating Collective reduction typed APIs header file..." 
-./maint/build_typed_api.pl --typefile ./maint/reduce_maxmin_typedef.txt \ - --tplfile ./include/shmem_reduce_minmax_typed.h.tpl --outfile ./include/shmem_reduce_minmax_typed.h -insert_file_by_key "SHMEM_REDUCE_MINMAX_TYPED_H start" ./include/shmem_reduce_minmax_typed.h include/shmem.h.in -echo "-- inserted SHMEM_REDUCE_MINMAX_TYPED_H in include/shmem.h.in" - -./maint/build_typed_api.pl --typefile ./maint/reduce_sumprod_typedef.txt \ - --tplfile ./include/shmem_reduce_sumprod_typed.h.tpl --outfile ./include/shmem_reduce_sumprod_typed.h -insert_file_by_key "SHMEM_REDUCE_SUMPROD_TYPED_H start" ./include/shmem_reduce_sumprod_typed.h include/shmem.h.in -echo "-- inserted SHMEM_REDUCE_SUMPROD_TYPED_H in include/shmem.h.in" - -./maint/build_typed_api.pl --typefile ./maint/reduce_bitws_typedef.txt \ - --tplfile ./include/shmem_reduce_bitws_typed.h.tpl --outfile ./include/shmem_reduce_bitws_typed.h -insert_file_by_key "SHMEM_REDUCE_BITWS_TYPED_H start" ./include/shmem_reduce_bitws_typed.h include/shmem.h.in -echo "-- inserted SHMEM_REDUCE_BITWS_TYPED_H in include/shmem.h.in" +echo "Generating Collective typed APIs header file..." +./maint/build_typed_api.pl --typefile ./maint/coll_typedef.txt \ + --tplfile ./include/shmem_coll_typed.h.tpl --outfile ./include/shmem_coll_typed.h +insert_file_by_key "SHMEM_COLL_TYPED_H start" ./include/shmem_coll_typed.h include/shmem.h.in +echo "-- inserted SHMEM_COLL_TYPED_H in include/shmem.h.in" + +echo "Generating Collective reduction active-set-based typed APIs header file..." 
+./maint/build_typed_api.pl --typefile ./maint/reduce_minmax_aset_typedef.txt \ + --tplfile ./include/shmem_reduce_minmax_aset_typed.h.tpl --outfile ./include/shmem_reduce_minmax_aset_typed.h +insert_file_by_key "SHMEM_REDUCE_MINMAX_ASET_TYPED_H start" ./include/shmem_reduce_minmax_aset_typed.h include/shmem.h.in +echo "-- inserted SHMEM_REDUCE_MINMAX_ASET_TYPED_H in include/shmem.h.in" + +./maint/build_typed_api.pl --typefile ./maint/reduce_sumprod_aset_typedef.txt \ + --tplfile ./include/shmem_reduce_sumprod_aset_typed.h.tpl --outfile ./include/shmem_reduce_sumprod_aset_typed.h +insert_file_by_key "SHMEM_REDUCE_SUMPROD_ASET_TYPED_H start" ./include/shmem_reduce_sumprod_aset_typed.h include/shmem.h.in +echo "-- inserted SHMEM_REDUCE_SUMPROD_ASET_TYPED_H in include/shmem.h.in" + +./maint/build_typed_api.pl --typefile ./maint/reduce_bitws_aset_typedef.txt \ + --tplfile ./include/shmem_reduce_bitws_aset_typed.h.tpl --outfile ./include/shmem_reduce_bitws_aset_typed.h +insert_file_by_key "SHMEM_REDUCE_BITWS_ASET_TYPED_H start" ./include/shmem_reduce_bitws_aset_typed.h include/shmem.h.in +echo "-- inserted SHMEM_REDUCE_BITWS_ASET_TYPED_H in include/shmem.h.in" +echo "" + +echo "Generating Collective reduction team-based typed APIs header file..."
+./maint/build_typed_api.pl --typefile ./maint/reduce_minmax_team_typedef.txt \ + --tplfile ./include/shmem_reduce_minmax_team_typed.h.tpl --outfile ./include/shmem_reduce_minmax_team_typed.h +insert_file_by_key "SHMEM_REDUCE_MINMAX_TEAM_TYPED_H start" ./include/shmem_reduce_minmax_team_typed.h include/shmem.h.in +echo "-- inserted SHMEM_REDUCE_MINMAX_TEAM_TYPED_H in include/shmem.h.in" + +./maint/build_typed_api.pl --typefile ./maint/reduce_sumprod_team_typedef.txt \ + --tplfile ./include/shmem_reduce_sumprod_team_typed.h.tpl --outfile ./include/shmem_reduce_sumprod_team_typed.h +insert_file_by_key "SHMEM_REDUCE_SUMPROD_TEAM_TYPED_H start" ./include/shmem_reduce_sumprod_team_typed.h include/shmem.h.in +echo "-- inserted SHMEM_REDUCE_SUMPROD_TEAM_TYPED_H in include/shmem.h.in" + +./maint/build_typed_api.pl --typefile ./maint/reduce_bitws_team_typedef.txt \ + --tplfile ./include/shmem_reduce_bitws_team_typed.h.tpl --outfile ./include/shmem_reduce_bitws_team_typed.h +insert_file_by_key "SHMEM_REDUCE_BITWS_TEAM_TYPED_H start" ./include/shmem_reduce_bitws_team_typed.h include/shmem.h.in +echo "-- inserted SHMEM_REDUCE_BITWS_TEAM_TYPED_H in include/shmem.h.in" echo "" echo "Generating Signal APIs header file..." @@ -215,24 +238,51 @@ echo "-- ./src/shmem/amo_bitws_typed.c done" echo "-- ./src/shmem/amo_bitws_typed.c format cleaned" echo "" -echo "Generating Collective reduction typed APIs source files..." 
-./maint/build_typed_api.pl --typefile ./maint/reduce_maxmin_typedef.txt \ - --tplfile ./src/shmem/reduce_minmax_typed.c.tpl --outfile ./src/shmem/reduce_minmax_typed.c -echo "-- ./src/shmem/reduce_minmax_typed.c done" -./maint/code-cleanup.sh ./src/shmem/reduce_minmax_typed.c -echo "-- ./src/shmem/reduce_minmax_typed.c format cleaned" - -./maint/build_typed_api.pl --typefile ./maint/reduce_sumprod_typedef.txt \ - --tplfile ./src/shmem/reduce_sumprod_typed.c.tpl --outfile ./src/shmem/reduce_sumprod_typed.c -echo "-- ./src/shmem/reduce_sumprod_typed.c done" -./maint/code-cleanup.sh ./src/shmem/reduce_sumprod_typed.c -echo "-- ./src/shmem/reduce_sumprod_typed.c format cleaned" - -./maint/build_typed_api.pl --typefile ./maint/reduce_bitws_typedef.txt \ - --tplfile ./src/shmem/reduce_bitws_typed.c.tpl --outfile ./src/shmem/reduce_bitws_typed.c -echo "-- ./src/shmem/reduce_bitws_typed.c done" -./maint/code-cleanup.sh ./src/shmem/reduce_bitws_typed.c -echo "-- ./src/shmem/reduce_bitws_typed.c format cleaned" +echo "Generating Collective typed APIs source files..." +./maint/build_typed_api.pl --typefile ./maint/coll_typedef.txt \ + --tplfile ./src/shmem/coll_typed.c.tpl --outfile ./src/shmem/coll_typed.c +echo "-- ./src/shmem/coll_typed.c done" +./maint/code-cleanup.sh ./src/shmem/coll_typed.c +echo "-- ./src/shmem/coll_typed.c format cleaned" + +echo "Generating Collective reduction active-set-based typed APIs source files..." 
+./maint/build_typed_api.pl --typefile ./maint/reduce_minmax_aset_typedef.txt \ + --tplfile ./src/shmem/reduce_minmax_aset_typed.c.tpl --outfile ./src/shmem/reduce_minmax_aset_typed.c +echo "-- ./src/shmem/reduce_minmax_aset_typed.c done" +./maint/code-cleanup.sh ./src/shmem/reduce_minmax_aset_typed.c +echo "-- ./src/shmem/reduce_minmax_aset_typed.c format cleaned" + +./maint/build_typed_api.pl --typefile ./maint/reduce_sumprod_aset_typedef.txt \ + --tplfile ./src/shmem/reduce_sumprod_aset_typed.c.tpl --outfile ./src/shmem/reduce_sumprod_aset_typed.c +echo "-- ./src/shmem/reduce_sumprod_aset_typed.c done" +./maint/code-cleanup.sh ./src/shmem/reduce_sumprod_aset_typed.c +echo "-- ./src/shmem/reduce_sumprod_aset_typed.c format cleaned" + +./maint/build_typed_api.pl --typefile ./maint/reduce_bitws_aset_typedef.txt \ + --tplfile ./src/shmem/reduce_bitws_aset_typed.c.tpl --outfile ./src/shmem/reduce_bitws_aset_typed.c +echo "-- ./src/shmem/reduce_bitws_aset_typed.c done" +./maint/code-cleanup.sh ./src/shmem/reduce_bitws_aset_typed.c +echo "-- ./src/shmem/reduce_bitws_aset_typed.c format cleaned" +echo "" + +echo "Generating Collective reduction team-based typed APIs source files..." 
+./maint/build_typed_api.pl --typefile ./maint/reduce_minmax_team_typedef.txt \ + --tplfile ./src/shmem/reduce_minmax_team_typed.c.tpl --outfile ./src/shmem/reduce_minmax_team_typed.c +echo "-- ./src/shmem/reduce_minmax_team_typed.c done" +./maint/code-cleanup.sh ./src/shmem/reduce_minmax_team_typed.c +echo "-- ./src/shmem/reduce_minmax_team_typed.c format cleaned" + +./maint/build_typed_api.pl --typefile ./maint/reduce_sumprod_team_typedef.txt \ + --tplfile ./src/shmem/reduce_sumprod_team_typed.c.tpl --outfile ./src/shmem/reduce_sumprod_team_typed.c +echo "-- ./src/shmem/reduce_sumprod_team_typed.c done" +./maint/code-cleanup.sh ./src/shmem/reduce_sumprod_team_typed.c +echo "-- ./src/shmem/reduce_sumprod_team_typed.c format cleaned" + +./maint/build_typed_api.pl --typefile ./maint/reduce_bitws_team_typedef.txt \ + --tplfile ./src/shmem/reduce_bitws_team_typed.c.tpl --outfile ./src/shmem/reduce_bitws_team_typed.c +echo "-- ./src/shmem/reduce_bitws_team_typed.c done" +./maint/code-cleanup.sh ./src/shmem/reduce_bitws_team_typed.c +echo "-- ./src/shmem/reduce_bitws_team_typed.c format cleaned" echo "" echo "Generating Signaling APIs source files..." 
diff --git a/include/Makefile.mk b/include/Makefile.mk index 67b9fb9..7816623 100644 --- a/include/Makefile.mk +++ b/include/Makefile.mk @@ -10,6 +10,10 @@ EXTRA_DIST += include/shmem.h.in.tpl \ include/shmem_amo_std_typed.h.tpl \ include/shmem_amo_ext_typed.h.tpl \ include/shmem_amo_bitws_typed.h.tpl \ - include/shmem_reduce_minmax_typed.h.tpl \ - include/shmem_reduce_sumprod_typed.h.tpl \ - include/shmem_reduce_bitws_typed.h.tpl + include/shmem_coll_typed.h.tpl \ + include/shmem_reduce_minmax_aset_typed.h.tpl \ + include/shmem_reduce_sumprod_aset_typed.h.tpl \ + include/shmem_reduce_bitws_aset_typed.h.tpl \ + include/shmem_reduce_minmax_team_typed.h.tpl \ + include/shmem_reduce_sumprod_team_typed.h.tpl \ + include/shmem_reduce_bitws_team_typed.h.tpl diff --git a/include/shmem.h.in.tpl b/include/shmem.h.in.tpl index 30fca72..1df1ba8 100644 --- a/include/shmem.h.in.tpl +++ b/include/shmem.h.in.tpl @@ -32,6 +32,7 @@ extern "C" { #define OSHMPI_C11_ARG1_HELPER(second, ...) second #define OSHMPI_C11_ARG1(first, ...) 
OSHMPI_C11_ARG1_HELPER(__VA_ARGS__, extra) #define OSHMPI_C11_CTX_VAL(ctx) (ctx) +#define OSHMPI_C11_TEAM_VAL(team) (team) static inline void shmem_c11_type_ignore(void) {} #endif @@ -244,9 +245,19 @@ uint64_t shmem_signal_fetch(const uint64_t *sig_addr); /* -- Collectives -- */ void shmem_barrier_all(void); -void shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync); int shmem_team_sync(shmem_team_t team); void shmem_sync_all(void); +int shmem_broadcastmem(shmem_team_t team, void *dest, const void *source, size_t nelems, + int PE_root); +int shmem_collectmem(shmem_team_t team, void *dest, const void *source, size_t nelems); +int shmem_fcollectmem(shmem_team_t team, void *dest, const void *source, size_t nelems); +int shmem_alltoallmem(shmem_team_t team, void *dest, const void *source, size_t nelems); +int shmem_alltoallsmem(shmem_team_t team, void *dest, const void *source, ptrdiff_t dst, + ptrdiff_t sst, size_t nelems); + +/* (deprecated APIs) */ +void shmem_sync_aset(int PE_start, int logPE_stride, int PE_size, long *pSync); +void shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync); void shmem_broadcast32(void *dest, const void *source, size_t nelems, int PE_root, int PE_start, int logPE_stride, int PE_size, long *pSync); void shmem_broadcast64(void *dest, const void *source, size_t nelems, int PE_root, int PE_start, @@ -267,17 +278,40 @@ void shmem_alltoalls32(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t int PE_start, int logPE_stride, int PE_size, long *pSync); void shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems, int PE_start, int logPE_stride, int PE_size, long *pSync); -/* (deprecated APIs) */ -void shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync); -/* SHMEM_REDUCE_MINMAX_TYPED_H start */ -/* SHMEM_REDUCE_MINMAX_TYPED_H end */ +/* *INDENT-OFF* */ +#if OSHMPI_HAVE_C11 +#define shmem_sync(...) 
\ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: shmem_team_sync, \ + int: shmem_sync_aset, \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) +#else +#define shmem_sync(...) shmem_sync_aset(__VA_ARGS__) +#endif /* OSHMPI_HAVE_C11 */ +/* *INDENT-ON* */ + +/* SHMEM_COLL_TYPED_H start */ +/* SHMEM_COLL_TYPED_H end */ + +/* SHMEM_REDUCE_MINMAX_TEAM_TYPED_H start */ +/* SHMEM_REDUCE_MINMAX_TEAM_TYPED_H end */ + +/* SHMEM_REDUCE_SUMPROD_TEAM_TYPED_H start */ +/* SHMEM_REDUCE_SUMPROD_TEAM_TYPED_H end */ + +/* SHMEM_REDUCE_BITWS_TEAM_TYPED_H start */ +/* SHMEM_REDUCE_BITWS_TEAM_TYPED_H end */ + +/* SHMEM_REDUCE_MINMAX_ASET_TYPED_H start */ +/* SHMEM_REDUCE_MINMAX_ASET_TYPED_H end */ -/* SHMEM_REDUCE_SUMPROD_TYPED_H start */ -/* SHMEM_REDUCE_SUMPROD_TYPED_H end */ +/* SHMEM_REDUCE_SUMPROD_ASET_TYPED_H start */ +/* SHMEM_REDUCE_SUMPROD_ASET_TYPED_H end */ -/* SHMEM_REDUCE_BITWS_TYPED_H start */ -/* SHMEM_REDUCE_BITWS_TYPED_H end */ +/* SHMEM_REDUCE_BITWS_ASET_TYPED_H start */ +/* SHMEM_REDUCE_BITWS_ASET_TYPED_H end */ /* -- Point-To-Point Synchronization -- */ /* SHMEM_P2P_TYPED_H start */ diff --git a/include/shmem_coll_typed.h.tpl b/include/shmem_coll_typed.h.tpl new file mode 100644 index 0000000..10543e2 --- /dev/null +++ b/include/shmem_coll_typed.h.tpl @@ -0,0 +1,69 @@ +/* The following lines are automatically generated. DO NOT EDIT. 
*/ +/* TPL_BLOCK_START */ +int shmem_TYPENAME_broadcast(shmem_team_t team, TYPE * dest, const TYPE * source, size_t nelems, + int PE_root); +int shmem_TYPENAME_collect(shmem_team_t team, TYPE * dest, const TYPE * source, size_t nelems); +int shmem_TYPENAME_fcollect(shmem_team_t team, TYPE * dest, const TYPE * source, size_t nelems); +int shmem_TYPENAME_alltoall(shmem_team_t team, TYPE * dest, const TYPE * source, size_t nelems); +int shmem_TYPENAME_alltoalls(shmem_team_t team, TYPE * dest, const TYPE * source, ptrdiff_t dst, + ptrdiff_t sst, size_t nelems); +/* TPL_BLOCK_END */ + +/* *INDENT-OFF* */ +#if OSHMPI_HAVE_C11 +#define shmem_broadcast(...) \ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_broadcast, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) + +#define shmem_collect(...) \ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_collect, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) + +#define shmem_fcollect(...) \ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_fcollect, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) + +#define shmem_alltoall(...) 
\ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_alltoall, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) + +#define shmem_alltoalls(...) \ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_alltoalls, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) +#endif /* OSHMPI_HAVE_C11 */ +/* *INDENT-ON* */ diff --git a/include/shmem_reduce_bitws_typed.h.tpl b/include/shmem_reduce_bitws_aset_typed.h.tpl similarity index 93% rename from include/shmem_reduce_bitws_typed.h.tpl rename to include/shmem_reduce_bitws_aset_typed.h.tpl index 7d556b8..0d37452 100644 --- a/include/shmem_reduce_bitws_typed.h.tpl +++ b/include/shmem_reduce_bitws_aset_typed.h.tpl @@ -1,9 +1,10 @@ /* The following lines are automatically generated. DO NOT EDIT. */ /* TPL_BLOCK_START */ +/* deprecated APIs */ void shmem_TYPENAME_and_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * pWrk, long *pSync); void shmem_TYPENAME_or_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * pWrk, long *pSync); void shmem_TYPENAME_xor_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * pWrk, long *pSync); -/* TPL_BLOCK_END */ \ No newline at end of file +/* TPL_BLOCK_END */ diff --git a/include/shmem_reduce_bitws_team_typed.h.tpl b/include/shmem_reduce_bitws_team_typed.h.tpl new file mode 100644 index 0000000..dedbfd5 --- /dev/null +++ b/include/shmem_reduce_bitws_team_typed.h.tpl @@ -0,0 +1,43 @@ +/* The following lines are automatically generated. DO NOT EDIT.
*/ +/* TPL_BLOCK_START */ +int shmem_TYPENAME_and_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce); +int shmem_TYPENAME_or_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce); +int shmem_TYPENAME_xor_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce); +/* TPL_BLOCK_END */ + +/* *INDENT-OFF* */ +#if OSHMPI_HAVE_C11 +#define shmem_and_reduce(...) \ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_and_reduce, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) + +#define shmem_or_reduce(...) \ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_or_reduce, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) + +#define shmem_xor_reduce(...) \ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_xor_reduce, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) +#endif /* OSHMPI_HAVE_C11 */ +/* *INDENT-ON* */ diff --git a/include/shmem_reduce_minmax_typed.h.tpl b/include/shmem_reduce_minmax_aset_typed.h.tpl similarity index 91% rename from include/shmem_reduce_minmax_typed.h.tpl rename to include/shmem_reduce_minmax_aset_typed.h.tpl index 4170491..59ae5c8 100644 --- a/include/shmem_reduce_minmax_typed.h.tpl +++ b/include/shmem_reduce_minmax_aset_typed.h.tpl @@ -1,7 +1,8 @@ /* The following lines are automatically generated. DO NOT EDIT. 
*/ /* TPL_BLOCK_START */ +/* deprecated APIs */ void shmem_TYPENAME_min_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * pWrk, long *pSync); void shmem_TYPENAME_max_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * pWrk, long *pSync); -/* TPL_BLOCK_END */ \ No newline at end of file +/* TPL_BLOCK_END */ diff --git a/include/shmem_reduce_minmax_team_typed.h.tpl b/include/shmem_reduce_minmax_team_typed.h.tpl new file mode 100644 index 0000000..eb06485 --- /dev/null +++ b/include/shmem_reduce_minmax_team_typed.h.tpl @@ -0,0 +1,31 @@ +/* The following lines are automatically generated. DO NOT EDIT. */ +/* TPL_BLOCK_START */ +int shmem_TYPENAME_min_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce); +int shmem_TYPENAME_max_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce); +/* TPL_BLOCK_END */ + +/* *INDENT-OFF* */ +#if OSHMPI_HAVE_C11 +#define shmem_min_reduce(...) \ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_min_reduce, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) + +#define shmem_max_reduce(...) 
\ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_max_reduce, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) +#endif /* OSHMPI_HAVE_C11 */ +/* *INDENT-ON* */ diff --git a/include/shmem_reduce_sumprod_typed.h.tpl b/include/shmem_reduce_sumprod_aset_typed.h.tpl similarity index 91% rename from include/shmem_reduce_sumprod_typed.h.tpl rename to include/shmem_reduce_sumprod_aset_typed.h.tpl index bb63d20..9e28de0 100644 --- a/include/shmem_reduce_sumprod_typed.h.tpl +++ b/include/shmem_reduce_sumprod_aset_typed.h.tpl @@ -1,7 +1,8 @@ /* The following lines are automatically generated. DO NOT EDIT. */ /* TPL_BLOCK_START */ +/* deprecated APIs */ void shmem_TYPENAME_sum_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * pWrk, long *pSync); void shmem_TYPENAME_prod_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * pWrk, long *pSync); -/* TPL_BLOCK_END */ \ No newline at end of file +/* TPL_BLOCK_END */ diff --git a/include/shmem_reduce_sumprod_team_typed.h.tpl b/include/shmem_reduce_sumprod_team_typed.h.tpl new file mode 100644 index 0000000..064bd54 --- /dev/null +++ b/include/shmem_reduce_sumprod_team_typed.h.tpl @@ -0,0 +1,31 @@ +/* The following lines are automatically generated. DO NOT EDIT. */ +/* TPL_BLOCK_START */ +int shmem_TYPENAME_sum_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce); +int shmem_TYPENAME_prod_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce); +/* TPL_BLOCK_END */ + +/* *INDENT-OFF* */ +#if OSHMPI_HAVE_C11 +#define shmem_sum_reduce(...) 
\ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_sum_reduce, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) + +#define shmem_prod_reduce(...) \ + _Generic(OSHMPI_C11_TEAM_VAL(OSHMPI_C11_ARG0(__VA_ARGS__)), \ + shmem_team_t: _Generic((OSHMPI_C11_ARG1(__VA_ARGS__)), \ +/* TPL_C11_BLOCK_START */ + TYPE*: shmem_TYPENAME_prod_reduce, \ +/* TPL_C11_BLOCK_END */ + default: shmem_c11_type_ignore \ + ), \ + default: shmem_c11_type_ignore \ + )(__VA_ARGS__) +#endif /* OSHMPI_HAVE_C11 */ +/* *INDENT-ON* */ diff --git a/maint/coll_typedef.txt b/maint/coll_typedef.txt new file mode 100644 index 0000000..c3ca1dd --- /dev/null +++ b/maint/coll_typedef.txt @@ -0,0 +1,25 @@ +# TYPE, TYPENAME, MPITYPE, C11_INCLUDE +float, float, MPI_FLOAT, 1 +double, double, MPI_DOUBLE, 1 +long double, longdouble, MPI_LONG_DOUBLE, 1 +char, char, MPI_CHAR, 1 +signed char, schar, MPI_SIGNED_CHAR, 1 +short, short, MPI_SHORT, 1 +int, int, MPI_INT, 1 +long, long, MPI_LONG, 1 +long long, longlong, MPI_LONG_LONG, 1 +unsigned char, uchar, MPI_UNSIGNED_CHAR, 1 +unsigned short, ushort, MPI_UNSIGNED_SHORT, 1 +unsigned int, uint, MPI_UNSIGNED, 1 +unsigned long, ulong, MPI_UNSIGNED_LONG, 1 +unsigned long long, ulonglong, MPI_UNSIGNED_LONG_LONG, 1 +int8_t, int8, MPI_INT8_T, 0 +int16_t, int16, MPI_INT16_T, 0 +int32_t, int32, MPI_INT32_T, 0 +int64_t, int64, MPI_INT64_T, 0 +uint8_t, uint8, MPI_UINT8_T, 0 +uint16_t, uint16, MPI_UINT16_T, 0 +uint32_t, uint32, MPI_UINT32_T, 0 +uint64_t, uint64, MPI_UINT64_T, 0 +size_t, size, OSHMPI_MPI_SIZE_T, 0 +ptrdiff_t, ptrdiff, OSHMPI_MPI_PTRDIFF_T, 0 diff --git a/maint/reduce_bitws_typedef.txt b/maint/reduce_bitws_aset_typedef.txt similarity index 100% rename from maint/reduce_bitws_typedef.txt rename to maint/reduce_bitws_aset_typedef.txt diff --git
a/maint/reduce_bitws_team_typedef.txt b/maint/reduce_bitws_team_typedef.txt new file mode 100644 index 0000000..f7fbbdd --- /dev/null +++ b/maint/reduce_bitws_team_typedef.txt @@ -0,0 +1,15 @@ +# TYPE, TYPENAME, MPITYPE, C11_INCLUDE +unsigned char, uchar, MPI_UNSIGNED_CHAR, 1 +unsigned short, ushort, MPI_UNSIGNED_SHORT, 1 +unsigned int, uint, MPI_UNSIGNED, 1 +unsigned long, ulong, MPI_UNSIGNED_LONG, 1 +unsigned long long, ulonglong, MPI_UNSIGNED_LONG_LONG, 1 +int8_t, int8, MPI_INT8_T, 1 +int16_t, int16, MPI_INT16_T, 1 +int32_t, int32, MPI_INT32_T, 1 +int64_t, int64, MPI_INT64_T, 1 +uint8_t, uint8, MPI_UINT8_T, 0 +uint16_t, uint16, MPI_UINT16_T, 0 +uint32_t, uint32, MPI_UINT32_T, 0 +uint64_t, uint64, MPI_UINT64_T, 0 +size_t, size, OSHMPI_MPI_SIZE_T, 0 diff --git a/maint/reduce_maxmin_typedef.txt b/maint/reduce_minmax_aset_typedef.txt similarity index 100% rename from maint/reduce_maxmin_typedef.txt rename to maint/reduce_minmax_aset_typedef.txt diff --git a/maint/reduce_minmax_team_typedef.txt b/maint/reduce_minmax_team_typedef.txt new file mode 100644 index 0000000..c3ca1dd --- /dev/null +++ b/maint/reduce_minmax_team_typedef.txt @@ -0,0 +1,25 @@ +# TYPE, TYPENAME, MPITYPE, C11_INCLUDE +float, float, MPI_FLOAT, 1 +double, double, MPI_DOUBLE, 1 +long double, longdouble, MPI_LONG_DOUBLE, 1 +char, char, MPI_CHAR, 1 +signed char, schar, MPI_SIGNED_CHAR, 1 +short, short, MPI_SHORT, 1 +int, int, MPI_INT, 1 +long, long, MPI_LONG, 1 +long long, longlong, MPI_LONG_LONG, 1 +unsigned char, uchar, MPI_UNSIGNED_CHAR, 1 +unsigned short, ushort, MPI_UNSIGNED_SHORT, 1 +unsigned int, uint, MPI_UNSIGNED, 1 +unsigned long, ulong, MPI_UNSIGNED_LONG, 1 +unsigned long long, ulonglong, MPI_UNSIGNED_LONG_LONG, 1 +int8_t, int8, MPI_INT8_T, 0 +int16_t, int16, MPI_INT16_T, 0 +int32_t, int32, MPI_INT32_T, 0 +int64_t, int64, MPI_INT64_T, 0 +uint8_t, uint8, MPI_UINT8_T, 0 +uint16_t, uint16, MPI_UINT16_T, 0 +uint32_t, uint32, MPI_UINT32_T, 0 +uint64_t, uint64, MPI_UINT64_T, 0 +size_t, size,
OSHMPI_MPI_SIZE_T, 0 +ptrdiff_t, ptrdiff, OSHMPI_MPI_PTRDIFF_T, 0 diff --git a/maint/reduce_sumprod_typedef.txt b/maint/reduce_sumprod_aset_typedef.txt similarity index 100% rename from maint/reduce_sumprod_typedef.txt rename to maint/reduce_sumprod_aset_typedef.txt diff --git a/maint/reduce_sumprod_team_typedef.txt b/maint/reduce_sumprod_team_typedef.txt new file mode 100644 index 0000000..5025118 --- /dev/null +++ b/maint/reduce_sumprod_team_typedef.txt @@ -0,0 +1,27 @@ +# TYPE, TYPENAME, MPITYPE, C11_INCLUDE +float, float, MPI_FLOAT, 1 +double, double, MPI_DOUBLE, 1 +long double, longdouble, MPI_LONG_DOUBLE, 1 +char, char, MPI_CHAR, 1 +signed char, schar, MPI_SIGNED_CHAR, 1 +short, short, MPI_SHORT, 1 +int, int, MPI_INT, 1 +long, long, MPI_LONG, 1 +long long, longlong, MPI_LONG_LONG, 1 +unsigned char, uchar, MPI_UNSIGNED_CHAR, 1 +unsigned short, ushort, MPI_UNSIGNED_SHORT, 1 +unsigned int, uint, MPI_UNSIGNED, 1 +unsigned long, ulong, MPI_UNSIGNED_LONG, 1 +unsigned long long, ulonglong, MPI_UNSIGNED_LONG_LONG, 1 +float _Complex, complexf, MPI_C_FLOAT_COMPLEX, 1 +double _Complex, complexd, MPI_C_DOUBLE_COMPLEX, 1 +int8_t, int8, MPI_INT8_T, 0 +int16_t, int16, MPI_INT16_T, 0 +int32_t, int32, MPI_INT32_T, 0 +int64_t, int64, MPI_INT64_T, 0 +uint8_t, uint8, MPI_UINT8_T, 0 +uint16_t, uint16, MPI_UINT16_T, 0 +uint32_t, uint32, MPI_UINT32_T, 0 +uint64_t, uint64, MPI_UINT64_T, 0 +size_t, size, OSHMPI_MPI_SIZE_T, 0 +ptrdiff_t, ptrdiff, OSHMPI_MPI_PTRDIFF_T, 0 diff --git a/src/include/oshmpi_impl.h b/src/include/oshmpi_impl.h index 8ad614b..e8ecd61 100644 --- a/src/include/oshmpi_impl.h +++ b/src/include/oshmpi_impl.h @@ -29,6 +29,7 @@ * so the capacity must be at least this large */ #define OSHMPI_DLMALLOC_MIN_MSPACE_SIZE (128 * sizeof(size_t)) +#define OSHMPI_MPI_COLL_BYTE_T MPI_BYTE #define OSHMPI_MPI_COLL32_T MPI_UINT32_T #define OSHMPI_MPI_COLL64_T MPI_UINT64_T @@ -345,6 +346,17 @@ extern OSHMPI_env_t OSHMPI_env; #define OSHMPI_TEAM_HANDLE_TO_OBJ(handle) ((OSHMPI_team_t
*) (handle)) #define OSHMPI_TEAM_OBJ_TO_HANDLE(obj) ((shmem_team_t) (obj)) +#define OSHMPI_TEAM_GET_OBJ(team, obj) \ +do { \ + if ((team) == SHMEM_TEAM_WORLD) { \ + (obj) = OSHMPI_global.team_world; \ + } else if ((team) == SHMEM_TEAM_SHARED) { \ + (obj) = OSHMPI_global.team_shared; \ + } else { \ + (obj) = OSHMPI_TEAM_HANDLE_TO_OBJ((team)); \ + } \ +} while (0) + /* SHMEM internal routines. */ void OSHMPI_initialize_thread(int required, int *provided); @@ -450,8 +462,29 @@ void OSHMPI_rma_am_iget_pkt_cb(int origin_rank, OSHMPI_am_pkt_t * pkt); void OSHMPI_coll_initialize(void); void OSHMPI_coll_finalize(void); OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_barrier_all(void); -OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_barrier(int PE_start, int logPE_stride, int PE_size); OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_sync_all(void); +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_sync_team(OSHMPI_team_t * team); +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_broadcast_team(OSHMPI_team_t * team, void *dest, + const void *source, size_t nelems, + MPI_Datatype mpi_type, int PE_root); +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_collect_team(OSHMPI_team_t * team, void *dest, + const void *source, size_t nelems, + MPI_Datatype mpi_type); +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_fcollect_team(OSHMPI_team_t * team, void *dest, + const void *source, size_t nelems, + MPI_Datatype mpi_type); +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_alltoall_team(OSHMPI_team_t * team, void *dest, + const void *source, size_t nelems, + MPI_Datatype mpi_type); +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_alltoalls_team(OSHMPI_team_t * team, void *dest, + const void *source, ptrdiff_t dst, + ptrdiff_t sst, size_t nelems, + MPI_Datatype mpi_type); +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_allreduce_team(OSHMPI_team_t * team, void *dest, + const void *source, int count, + MPI_Datatype mpi_type, MPI_Op op); +/* for deprecated active-set-based collectives */ +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_barrier(int PE_start, int
logPE_stride, int PE_size); OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_sync(int PE_start, int logPE_stride, int PE_size); OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_broadcast(void *dest, const void *source, size_t nelems, MPI_Datatype mpi_type, int PE_root, int PE_start, @@ -809,6 +842,7 @@ OSHMPI_STATIC_INLINE_PREFIX size_t OSHMPI_get_mspace_sz(size_t bufsz) #include "strided_impl.h" #include "coll_impl.h" +#include "coll_activeset_impl.h" #include "rma_impl.h" #include "amo_impl.h" #include "am_impl.h" diff --git a/src/internal/Makefile.mk b/src/internal/Makefile.mk index 542029f..d40a697 100644 --- a/src/internal/Makefile.mk +++ b/src/internal/Makefile.mk @@ -5,6 +5,7 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/internal noinst_HEADERS += src/internal/coll_impl.h \ + src/internal/coll_activeset_impl.h \ src/internal/rma_impl.h \ src/internal/amo_impl.h \ src/internal/am_impl.h \ diff --git a/src/internal/coll_activeset_impl.h b/src/internal/coll_activeset_impl.h new file mode 100644 index 0000000..36a9079 --- /dev/null +++ b/src/internal/coll_activeset_impl.h @@ -0,0 +1,315 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * (C) 2022 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ +#ifndef INTERNAL_COLL_ACTIVESET_IMPL_H +#define INTERNAL_COLL_ACTIVESET_IMPL_H + +typedef struct OSHMPI_comm_cache_obj { + int pe_start; + int pe_stride; + int pe_size; + MPI_Comm comm; + MPI_Group group; /* Cached in case we need to translate root rank. */ + struct OSHMPI_comm_cache_obj *next; +} OSHMPI_comm_cache_obj_t; + +typedef struct OSHMPI_comm_cache { + OSHMPI_comm_cache_obj_t *head; /* List of cached communicator objects */ + int nobjs; + OSHMPIU_thread_cs_t cs; +} OSHMPI_comm_cache_t; + +extern OSHMPI_comm_cache_t OSHMPI_coll_comm_cache; + +/* Cache a newly created comm. + * Note that we have to cache all comms to ensure it is cached on all involved pes. 
+ * However, we expect that the amount of different active sets will be small.*/ +OSHMPI_STATIC_INLINE_PREFIX void coll_set_comm_cache(int PE_start, int logPE_stride, int PE_size, + MPI_Comm comm, MPI_Group group) +{ + OSHMPI_comm_cache_obj_t *cobj = NULL; + + cobj = OSHMPIU_malloc(sizeof(OSHMPI_comm_cache_obj_t)); + OSHMPI_ASSERT(cobj); + + /* Set new comm */ + cobj->pe_start = PE_start; + cobj->pe_stride = logPE_stride; + cobj->pe_size = PE_size; + cobj->comm = comm; + cobj->group = group; + + OSHMPI_THREAD_ENTER_CS(&OSHMPI_coll_comm_cache.cs); + /* Insert in head, O(1) */ + LL_PREPEND(OSHMPI_coll_comm_cache.head, cobj); + OSHMPI_coll_comm_cache.nobjs++; + OSHMPI_THREAD_EXIT_CS(&OSHMPI_coll_comm_cache.cs); +} + +/* Find if cached comm already exists. */ +OSHMPI_STATIC_INLINE_PREFIX int coll_find_comm_cache(int PE_start, int logPE_stride, int PE_size, + MPI_Comm * comm, MPI_Group * group) +{ + int found = 0; + OSHMPI_comm_cache_obj_t *cobj = NULL; + + OSHMPI_THREAD_ENTER_CS(&OSHMPI_coll_comm_cache.cs); + cobj = OSHMPI_coll_comm_cache.head; + LL_FOREACH(OSHMPI_coll_comm_cache.head, cobj) { + if (cobj->pe_start == PE_start && cobj->pe_stride == logPE_stride + && cobj->pe_size == PE_size) { + found = 1; + *comm = cobj->comm; + *group = cobj->group; + break; + } + } + OSHMPI_THREAD_EXIT_CS(&OSHMPI_coll_comm_cache.cs); + return found; +} + +OSHMPI_STATIC_INLINE_PREFIX void coll_acquire_comm(int PE_start, int logPE_stride, int PE_size, + MPI_Comm * comm) +{ + MPI_Group strided_group = MPI_GROUP_NULL; + + /* Fast path: comm_world */ + if (PE_start == 0 && logPE_stride == 0 && PE_size == OSHMPI_global.team_world_n_pes) { + *comm = OSHMPI_global.team_world_comm; + OSHMPI_DBGMSG("active_set[%d,%d,%d]=>comm_world 0x%lx returned.\n", + PE_start, logPE_stride, PE_size, (unsigned long) *comm); + return; + } + + /* Fast path: return a cached comm if found */ + if (coll_find_comm_cache(PE_start, logPE_stride, PE_size, comm, &strided_group)) { + 
OSHMPI_DBGMSG("active_set[%d,%d,%d]=>cached comm 0x%lx returned.\n", + PE_start, logPE_stride, PE_size, (unsigned long) *comm); + return; + } + + /* Slow path: create a new communicator and cache it */ + + /* List of processes in the group that will be created. */ + int *pe_list = NULL; + pe_list = (int *) OSHMPIU_malloc(PE_size * sizeof(int)); + OSHMPI_ASSERT(pe_list != NULL); + + /* Implement 2^pe_logs with bitshift. */ + const int pe_stride = 1 << logPE_stride; + for (int i = 0; i < PE_size; i++) + pe_list[i] = PE_start + i * pe_stride; + + OSHMPI_CALLMPI(MPI_Group_incl + (OSHMPI_global.team_world_group, PE_size, pe_list, &strided_group)); + /* Only collective on the strided_group. */ + OSHMPI_CALLMPI(MPI_Comm_create_group + (OSHMPI_global.team_world_comm, strided_group, PE_start /* tag */ , comm)); + OSHMPIU_free(pe_list); + + coll_set_comm_cache(PE_start, logPE_stride, PE_size, *comm, strided_group); + OSHMPI_DBGMSG("new active_set[%d,%d,%d]=>comm 0x%lx group 0x%lx created and cached.\n", + PE_start, logPE_stride, PE_size, (unsigned long) *comm, + (unsigned long) strided_group); +} + +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_barrier(int PE_start, int logPE_stride, int PE_size) +{ + MPI_Comm comm = MPI_COMM_NULL; + + /* Ensure completion of all outstanding Put, AMO, and nonblocking Put */ +#ifdef OSHMPI_ENABLE_DYNAMIC_WIN + OSHMPI_CALLMPI(MPI_Win_flush_all(OSHMPI_global.symm_ictx.win)); +#else + OSHMPI_CALLMPI(MPI_Win_flush_all(OSHMPI_global.symm_heap_ictx.win)); + OSHMPI_CALLMPI(MPI_Win_flush_all(OSHMPI_global.symm_data_ictx.win)); +#endif + + /* Ensure AM completion (e.g., AM AMOs) */ + OSHMPI_am_flush(SHMEM_CTX_DEFAULT, PE_start, logPE_stride, PE_size); + + /* Ensure completion of memory store */ +#ifdef OSHMPI_ENABLE_DYNAMIC_WIN + OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_ictx.win)); +#else + OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_heap_ictx.win)); + OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_data_ictx.win)); +#endif + + 
coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); + OSHMPI_am_progress_mpi_barrier(comm); +} + +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_sync(int PE_start, int logPE_stride, int PE_size) +{ + MPI_Comm comm = MPI_COMM_NULL; + + /* Ensure completion of previously issued memory store */ +#ifdef OSHMPI_ENABLE_DYNAMIC_WIN + OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_ictx.win)); +#else + OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_heap_ictx.win)); + OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_data_ictx.win)); +#endif + + coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); + OSHMPI_am_progress_mpi_barrier(comm); +} + +/* Return 1 if root is included in the active set, otherwise 0. */ +OSHMPI_STATIC_INLINE_PREFIX int coll_check_root_in_active_set(int PE_root, + int PE_start, int logPE_stride, + int PE_size) +{ + int i, included = 0; + const int pe_stride = 1 << logPE_stride; /* Implement 2^pe_logs with bitshift. */ + for (i = 0; i < PE_size; i++) { + if (PE_root == PE_start + i * pe_stride) { + included = 1; + break; + } + } + return included; +} + +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_broadcast(void *dest, const void *source, size_t nelems, + MPI_Datatype mpi_type, int PE_root, int PE_start, + int logPE_stride, int PE_size) +{ + MPI_Comm comm = MPI_COMM_NULL; + + coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); + + /* Special path: directly use MPI_Bcast if root is included in active set */ + if (coll_check_root_in_active_set(PE_root, PE_start, logPE_stride, PE_size)) { + OSHMPI_am_progress_mpi_bcast(PE_root == + OSHMPI_global.team_world_my_pe ? 
(void *) source : dest, + nelems, mpi_type, PE_root, comm); + } else { + OSHMPI_ictx_t *ictx = NULL; + OSHMPI_sobj_attr_t *sobj_attr = NULL; + MPI_Aint target_disp = -1; + + /* Generic path: every PE in active set gets data from root + * FIXME: the semantics ensures dest is updated only on local PE at return, + * thus we assume barrier is unneeded.*/ + OSHMPI_sobj_query_attr_ictx(SHMEM_CTX_DEFAULT, source, PE_root, &sobj_attr, &ictx); + OSHMPI_ASSERT(sobj_attr && ictx); + OSHMPI_sobj_trans_vaddr_to_disp(sobj_attr, source, PE_root, + OSHMPI_ICTX_DISP_MODE(ictx), &target_disp); + OSHMPI_ASSERT(target_disp >= 0); + + OSHMPI_CALLMPI(MPI_Get + (dest, nelems, mpi_type, PE_root, target_disp, nelems, mpi_type, ictx->win)); + OSHMPI_CALLMPI(MPI_Win_flush_local(PE_root, ictx->win)); + } +} + +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_collect(void *dest, const void *source, size_t nelems, + MPI_Datatype mpi_type, int PE_start, + int logPE_stride, int PE_size) +{ + MPI_Comm comm = MPI_COMM_NULL; + int *rcounts, *rdispls; + unsigned int same_nelems = 0; + + coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); + + /* collect allows each PE to have different nelems. */ + rcounts = OSHMPIU_malloc(PE_size * sizeof(int)); + OSHMPI_ASSERT(rcounts); + + rdispls = OSHMPIU_malloc(PE_size * sizeof(int)); + OSHMPI_ASSERT(rdispls); + + OSHMPI_am_progress_mpi_allgather(&nelems, 1, MPI_INT, rcounts, 1, MPI_INT, comm); + + rdispls[0] = 0; + same_nelems = (nelems == rcounts[0]) ? 
1 : 0; + for (int i = 1; i < PE_size; i++) { + rdispls[i] = rdispls[i - 1] + rcounts[i - 1]; + same_nelems &= (nelems == rcounts[i]); + } + + if (same_nelems) /* call faster allgather if same nelems on all PEs */ + OSHMPI_am_progress_mpi_allgather(source, nelems, mpi_type, dest, nelems, mpi_type, comm); + else + OSHMPI_am_progress_mpi_allgatherv(source, nelems, mpi_type, dest, rcounts, rdispls, + mpi_type, comm); + + OSHMPIU_free(rdispls); + OSHMPIU_free(rcounts); +} + +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_fcollect(void *dest, const void *source, size_t nelems, + MPI_Datatype mpi_type, int PE_start, + int logPE_stride, int PE_size) +{ + MPI_Comm comm = MPI_COMM_NULL; + + coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); + + OSHMPI_am_progress_mpi_allgather(source, nelems, mpi_type, dest, nelems, mpi_type, comm); +} + +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_alltoall(void *dest, const void *source, size_t nelems, + MPI_Datatype mpi_type, int PE_start, + int logPE_stride, int PE_size) +{ + MPI_Comm comm = MPI_COMM_NULL; + + coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); + + OSHMPI_am_progress_mpi_alltoall(source, nelems, mpi_type, dest, nelems, mpi_type, comm); +} + +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_alltoalls(void *dest, const void *source, ptrdiff_t dst, + ptrdiff_t sst, size_t nelems, + MPI_Datatype mpi_type, int PE_start, + int logPE_stride, int PE_size) +{ + MPI_Comm comm = MPI_COMM_NULL; + MPI_Datatype sdtype = MPI_DATATYPE_NULL, rdtype = MPI_DATATYPE_NULL; + size_t scount, rcount; + + /* Values of dst, sst, nelems must be equal on all PEs. When dst=sst=1, same as alltoall. + * TODO: not sure if alltoall with ddt is faster or alltoallv is faster */ + + /* Create derived datatypes if strided > 1, otherwise directly use basic datatype; + * when dst == sst, reuse send datatype. 
*/ + OSHMPI_create_strided_dtype(nelems, sst, mpi_type, nelems * sst /* required extent */ , + &scount, &sdtype); + if (dst == sst) { + rdtype = sdtype; + rcount = scount; + } else + OSHMPI_create_strided_dtype(nelems, dst, mpi_type, nelems * dst /* required extent */ , + &rcount, &rdtype); + + /* TODO: check non-int inputs exceeds int limit */ + + coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); + + OSHMPI_am_progress_mpi_alltoall(source, (int) scount, sdtype, dest, (int) rcount, rdtype, comm); + + OSHMPI_free_strided_dtype(mpi_type, &sdtype); + if (dst != sst) + OSHMPI_free_strided_dtype(mpi_type, &rdtype); +} + +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_allreduce(void *dest, const void *source, int count, + MPI_Datatype mpi_type, MPI_Op op, int PE_start, + int logPE_stride, int PE_size) +{ + MPI_Comm comm = MPI_COMM_NULL; + + coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); + + /* source and dest may be the same array, but may not be overlapping. */ + OSHMPI_am_progress_mpi_allreduce((source == dest) ? MPI_IN_PLACE : source, + dest, count, mpi_type, op, comm); +} + +#endif /* INTERNAL_COLL_ACTIVESET_IMPL_H */ diff --git a/src/internal/coll_impl.h b/src/internal/coll_impl.h index 3b36bc9..c06cf05 100644 --- a/src/internal/coll_impl.h +++ b/src/internal/coll_impl.h @@ -6,117 +6,9 @@ #ifndef INTERNAL_COLL_IMPL_H #define INTERNAL_COLL_IMPL_H -typedef struct OSHMPI_comm_cache_obj { - int pe_start; - int pe_stride; - int pe_size; - MPI_Comm comm; - MPI_Group group; /* Cached in case we need to translate root rank. */ - struct OSHMPI_comm_cache_obj *next; -} OSHMPI_comm_cache_obj_t; - -typedef struct OSHMPI_comm_cache { - OSHMPI_comm_cache_obj_t *head; /* List of cached communicator objects */ - int nobjs; - OSHMPIU_thread_cs_t cs; -} OSHMPI_comm_cache_t; - -extern OSHMPI_comm_cache_t OSHMPI_coll_comm_cache; - -/* Cache a newly created comm. - * Note that we have to cache all comms to ensure it is cached on all involved pes. 
- * However, we expect that the amount of different active sets will be small.*/ -OSHMPI_STATIC_INLINE_PREFIX void coll_set_comm_cache(int PE_start, int logPE_stride, int PE_size, - MPI_Comm comm, MPI_Group group) -{ - OSHMPI_comm_cache_obj_t *cobj = NULL; - - cobj = OSHMPIU_malloc(sizeof(OSHMPI_comm_cache_obj_t)); - OSHMPI_ASSERT(cobj); - - /* Set new comm */ - cobj->pe_start = PE_start; - cobj->pe_stride = logPE_stride; - cobj->pe_size = PE_size; - cobj->comm = comm; - cobj->group = group; - - OSHMPI_THREAD_ENTER_CS(&OSHMPI_coll_comm_cache.cs); - /* Insert in head, O(1) */ - LL_PREPEND(OSHMPI_coll_comm_cache.head, cobj); - OSHMPI_coll_comm_cache.nobjs++; - OSHMPI_THREAD_EXIT_CS(&OSHMPI_coll_comm_cache.cs); -} - -/* Find if cached comm already exists. */ -OSHMPI_STATIC_INLINE_PREFIX int coll_find_comm_cache(int PE_start, int logPE_stride, int PE_size, - MPI_Comm * comm, MPI_Group * group) -{ - int found = 0; - OSHMPI_comm_cache_obj_t *cobj = NULL; - - OSHMPI_THREAD_ENTER_CS(&OSHMPI_coll_comm_cache.cs); - cobj = OSHMPI_coll_comm_cache.head; - LL_FOREACH(OSHMPI_coll_comm_cache.head, cobj) { - if (cobj->pe_start == PE_start && cobj->pe_stride == logPE_stride - && cobj->pe_size == PE_size) { - found = 1; - *comm = cobj->comm; - *group = cobj->group; - break; - } - } - OSHMPI_THREAD_EXIT_CS(&OSHMPI_coll_comm_cache.cs); - return found; -} - -OSHMPI_STATIC_INLINE_PREFIX void coll_acquire_comm(int PE_start, int logPE_stride, int PE_size, - MPI_Comm * comm) -{ - MPI_Group strided_group = MPI_GROUP_NULL; - - /* Fast path: comm_world */ - if (PE_start == 0 && logPE_stride == 0 && PE_size == OSHMPI_global.team_world_n_pes) { - *comm = OSHMPI_global.team_world_comm; - OSHMPI_DBGMSG("active_set[%d,%d,%d]=>comm_world 0x%lx returned.\n", - PE_start, logPE_stride, PE_size, (unsigned long) *comm); - return; - } - - /* Fast path: return a cached comm if found */ - if (coll_find_comm_cache(PE_start, logPE_stride, PE_size, comm, &strided_group)) { - 
OSHMPI_DBGMSG("active_set[%d,%d,%d]=>cached comm 0x%lx returned.\n", - PE_start, logPE_stride, PE_size, (unsigned long) *comm); - return; - } - - /* Slow path: create a new communicator and cache it */ - - /* List of processes in the group that will be created. */ - int *pe_list = NULL; - pe_list = (int *) OSHMPIU_malloc(PE_size * sizeof(int)); - OSHMPI_ASSERT(pe_list != NULL); - - /* Implement 2^pe_logs with bitshift. */ - const int pe_stride = 1 << logPE_stride; - for (int i = 0; i < PE_size; i++) - pe_list[i] = PE_start + i * pe_stride; - - OSHMPI_CALLMPI(MPI_Group_incl - (OSHMPI_global.team_world_group, PE_size, pe_list, &strided_group)); - /* Only collective on the strided_group. */ - OSHMPI_CALLMPI(MPI_Comm_create_group - (OSHMPI_global.team_world_comm, strided_group, PE_start /* tag */ , comm)); - OSHMPIU_free(pe_list); - - coll_set_comm_cache(PE_start, logPE_stride, PE_size, *comm, strided_group); - OSHMPI_DBGMSG("new active_set[%d,%d,%d]=>comm 0x%lx group 0x%lx created and cached.\n", - PE_start, logPE_stride, PE_size, (unsigned long) *comm, - (unsigned long) strided_group); -} - /* Block until all PEs arrive at the barrier and all local updates * and remote memory updates on the default context are completed. 
*/ +#include "oshmpi_util.h" OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_barrier_all(void) { /* Ensure completion of all outstanding Put, AMO, and nonblocking Put */ @@ -139,33 +31,6 @@ OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_barrier_all(void) OSHMPI_am_progress_mpi_barrier(OSHMPI_global.team_world_comm); } -OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_barrier(int PE_start, int logPE_stride, int PE_size) -{ - MPI_Comm comm = MPI_COMM_NULL; - - /* Ensure completion of all outstanding Put, AMO, and nonblocking Put */ -#ifdef OSHMPI_ENABLE_DYNAMIC_WIN - OSHMPI_CALLMPI(MPI_Win_flush_all(OSHMPI_global.symm_ictx.win)); -#else - OSHMPI_CALLMPI(MPI_Win_flush_all(OSHMPI_global.symm_heap_ictx.win)); - OSHMPI_CALLMPI(MPI_Win_flush_all(OSHMPI_global.symm_data_ictx.win)); -#endif - - /* Ensure AM completion (e.g., AM AMOs) */ - OSHMPI_am_flush(SHMEM_CTX_DEFAULT, PE_start, logPE_stride, PE_size); - - /* Ensure completion of memory store */ -#ifdef OSHMPI_ENABLE_DYNAMIC_WIN - OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_ictx.win)); -#else - OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_heap_ictx.win)); - OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_data_ictx.win)); -#endif - - coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); - OSHMPI_am_progress_mpi_barrier(comm); -} - OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_sync_all(void) { /* Ensure completion of previously issued memory store */ @@ -178,135 +43,74 @@ OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_sync_all(void) OSHMPI_am_progress_mpi_barrier(OSHMPI_global.team_world_comm); } -OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_sync(int PE_start, int logPE_stride, int PE_size) -{ - MPI_Comm comm = MPI_COMM_NULL; - - /* Ensure completion of previously issued memory store */ -#ifdef OSHMPI_ENABLE_DYNAMIC_WIN - OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_ictx.win)); -#else - OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_heap_ictx.win)); - OSHMPI_CALLMPI(MPI_Win_sync(OSHMPI_global.symm_data_ictx.win)); -#endif - - 
coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); - OSHMPI_am_progress_mpi_barrier(comm); -} - -/* Return 1 if root is included in the active set, otherwise 0. */ -OSHMPI_STATIC_INLINE_PREFIX int coll_check_root_in_active_set(int PE_root, - int PE_start, int logPE_stride, - int PE_size) +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_sync_team(OSHMPI_team_t * team) { - int i, included = 0; - const int pe_stride = 1 << logPE_stride; /* Implement 2^pe_logs with bitshift. */ - for (i = 0; i < PE_size; i++) { - if (PE_root == PE_start + i * pe_stride) { - included = 1; - break; - } - } - return included; + OSHMPI_am_progress_mpi_barrier(team->comm); } -OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_broadcast(void *dest, const void *source, size_t nelems, - MPI_Datatype mpi_type, int PE_root, int PE_start, - int logPE_stride, int PE_size) +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_broadcast_team(OSHMPI_team_t * team, void *dest, + const void *source, size_t nelems, + MPI_Datatype mpi_type, int PE_root) { - MPI_Comm comm = MPI_COMM_NULL; - - coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); - - /* Special path: directly use MPI_Bcast if root is included in active set */ - if (coll_check_root_in_active_set(PE_root, PE_start, logPE_stride, PE_size)) { - OSHMPI_am_progress_mpi_bcast(PE_root == - OSHMPI_global.team_world_my_pe ? 
(void *) source : dest, - nelems, mpi_type, PE_root, comm); - } else { - OSHMPI_ictx_t *ictx = NULL; - OSHMPI_sobj_attr_t *sobj_attr = NULL; - MPI_Aint target_disp = -1; - - /* Generic path: every PE in active set gets data from root - * FIXME: the semantics ensures dest is updated only on local PE at return, - * thus we assume barrier is unneeded.*/ - OSHMPI_sobj_query_attr_ictx(SHMEM_CTX_DEFAULT, source, PE_root, &sobj_attr, &ictx); - OSHMPI_ASSERT(sobj_attr && ictx); - OSHMPI_sobj_trans_vaddr_to_disp(sobj_attr, source, PE_root, - OSHMPI_ICTX_DISP_MODE(ictx), &target_disp); - OSHMPI_ASSERT(target_disp >= 0); - - OSHMPI_CALLMPI(MPI_Get - (dest, nelems, mpi_type, PE_root, target_disp, nelems, mpi_type, ictx->win)); - OSHMPI_CALLMPI(MPI_Win_flush_local(PE_root, ictx->win)); - } + OSHMPI_am_progress_mpi_bcast(PE_root == OSHMPI_global.team_world_my_pe ? (void *) source : dest, + nelems, mpi_type, PE_root, team->comm); } -OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_collect(void *dest, const void *source, size_t nelems, - MPI_Datatype mpi_type, int PE_start, - int logPE_stride, int PE_size) +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_collect_team(OSHMPI_team_t * team, void *dest, + const void *source, size_t nelems, + MPI_Datatype mpi_type) { - MPI_Comm comm = MPI_COMM_NULL; int *rcounts, *rdispls; + int comm_size = 0; unsigned int same_nelems = 0; - coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); - /* collect allows each PE to have different nelems. 
*/ - rcounts = OSHMPIU_malloc(PE_size * sizeof(int)); + OSHMPI_CALLMPI(MPI_Comm_size(team->comm, &comm_size)); + rcounts = OSHMPIU_malloc(comm_size * sizeof(int)); OSHMPI_ASSERT(rcounts); - rdispls = OSHMPIU_malloc(PE_size * sizeof(int)); + rdispls = OSHMPIU_malloc(comm_size * sizeof(int)); OSHMPI_ASSERT(rdispls); - OSHMPI_am_progress_mpi_allgather(&nelems, 1, MPI_INT, rcounts, 1, MPI_INT, comm); + OSHMPI_am_progress_mpi_allgather(&nelems, 1, MPI_INT, rcounts, 1, MPI_INT, team->comm); rdispls[0] = 0; same_nelems = (nelems == rcounts[0]) ? 1 : 0; - for (int i = 1; i < PE_size; i++) { + for (int i = 1; i < comm_size; i++) { rdispls[i] = rdispls[i - 1] + rcounts[i - 1]; same_nelems &= (nelems == rcounts[i]); } if (same_nelems) /* call faster allgather if same nelems on all PEs */ - OSHMPI_am_progress_mpi_allgather(source, nelems, mpi_type, dest, nelems, mpi_type, comm); + OSHMPI_am_progress_mpi_allgather(source, nelems, mpi_type, dest, nelems, mpi_type, + team->comm); else OSHMPI_am_progress_mpi_allgatherv(source, nelems, mpi_type, dest, rcounts, rdispls, - mpi_type, comm); + mpi_type, team->comm); OSHMPIU_free(rdispls); OSHMPIU_free(rcounts); } -OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_fcollect(void *dest, const void *source, size_t nelems, - MPI_Datatype mpi_type, int PE_start, - int logPE_stride, int PE_size) +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_fcollect_team(OSHMPI_team_t * team, void *dest, + const void *source, size_t nelems, + MPI_Datatype mpi_type) { - MPI_Comm comm = MPI_COMM_NULL; - - coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); - - OSHMPI_am_progress_mpi_allgather(source, nelems, mpi_type, dest, nelems, mpi_type, comm); + OSHMPI_am_progress_mpi_allgather(source, nelems, mpi_type, dest, nelems, mpi_type, team->comm); } -OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_alltoall(void *dest, const void *source, size_t nelems, - MPI_Datatype mpi_type, int PE_start, - int logPE_stride, int PE_size) +OSHMPI_STATIC_INLINE_PREFIX void 
OSHMPI_alltoall_team(OSHMPI_team_t * team, void *dest, + const void *source, size_t nelems, + MPI_Datatype mpi_type) { - MPI_Comm comm = MPI_COMM_NULL; - - coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); - - OSHMPI_am_progress_mpi_alltoall(source, nelems, mpi_type, dest, nelems, mpi_type, comm); + OSHMPI_am_progress_mpi_alltoall(source, nelems, mpi_type, dest, nelems, mpi_type, team->comm); } -OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_alltoalls(void *dest, const void *source, ptrdiff_t dst, - ptrdiff_t sst, size_t nelems, - MPI_Datatype mpi_type, int PE_start, - int logPE_stride, int PE_size) +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_alltoalls_team(OSHMPI_team_t * team, void *dest, + const void *source, ptrdiff_t dst, + ptrdiff_t sst, size_t nelems, + MPI_Datatype mpi_type) { - MPI_Comm comm = MPI_COMM_NULL; MPI_Datatype sdtype = MPI_DATATYPE_NULL, rdtype = MPI_DATATYPE_NULL; size_t scount, rcount; @@ -326,26 +130,22 @@ OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_alltoalls(void *dest, const void *source /* TODO: check non-int inputs exceeds int limit */ - coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); - OSHMPI_am_progress_mpi_alltoall(source, (int) scount, sdtype, dest, (int) rcount, rdtype, comm); + OSHMPI_am_progress_mpi_alltoall(source, (int) scount, sdtype, dest, (int) rcount, rdtype, + team->comm); OSHMPI_free_strided_dtype(mpi_type, &sdtype); if (dst != sst) OSHMPI_free_strided_dtype(mpi_type, &rdtype); } -OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_allreduce(void *dest, const void *source, int count, - MPI_Datatype mpi_type, MPI_Op op, int PE_start, - int logPE_stride, int PE_size) +OSHMPI_STATIC_INLINE_PREFIX void OSHMPI_allreduce_team(OSHMPI_team_t * team, void *dest, + const void *source, int count, + MPI_Datatype mpi_type, MPI_Op op) { - MPI_Comm comm = MPI_COMM_NULL; - - coll_acquire_comm(PE_start, logPE_stride, PE_size, &comm); - /* source and dest may be the same array, but may not be overlapping. 
*/ OSHMPI_am_progress_mpi_allreduce((source == dest) ? MPI_IN_PLACE : source, - dest, count, mpi_type, op, comm); + dest, count, mpi_type, op, team->comm); } #endif /* INTERNAL_COLL_IMPL_H */ diff --git a/src/shmem/Makefile.mk b/src/shmem/Makefile.mk index aebe267..db3828d 100644 --- a/src/shmem/Makefile.mk +++ b/src/shmem/Makefile.mk @@ -16,9 +16,14 @@ liboshmpi_la_SOURCES += src/shmem/setup.c \ src/shmem/signal_typed.c \ src/shmem/signal_sized.c \ src/shmem/coll.c \ - src/shmem/reduce_minmax_typed.c \ - src/shmem/reduce_sumprod_typed.c \ - src/shmem/reduce_bitws_typed.c \ + src/shmem/coll_activeset.c \ + src/shmem/coll_typed.c \ + src/shmem/reduce_minmax_aset_typed.c \ + src/shmem/reduce_sumprod_aset_typed.c \ + src/shmem/reduce_bitws_aset_typed.c \ + src/shmem/reduce_minmax_team_typed.c \ + src/shmem/reduce_sumprod_team_typed.c \ + src/shmem/reduce_bitws_team_typed.c \ src/shmem/p2p.c \ src/shmem/p2p_typed.c \ src/shmem/order.c \ @@ -32,7 +37,12 @@ EXTRA_DIST += src/shmem/rma_typed.c.tpl \ src/shmem/amo_bitws_typed.c.tpl \ src/shmem/signal_typed.c.tpl \ src/shmem/signal_sized.c.tpl \ - src/shmem/reduce_minmax_typed.c.tpl \ - src/shmem/reduce_sumprod_typed.c.tpl \ - src/shmem/reduce_bitws_typed.c.tpl \ + src/shmem/coll_typed.c.tpl \ + src/shmem/coll_activeset_typed.c.tpl \ + src/shmem/reduce_minmax_aset_typed.c.tpl \ + src/shmem/reduce_sumprod_aset_typed.c.tpl \ + src/shmem/reduce_bitws_aset_typed.c.tpl \ + src/shmem/reduce_minmax_team_typed.c.tpl \ + src/shmem/reduce_sumprod_team_typed.c.tpl \ + src/shmem/reduce_bitws_team_typed.c.tpl \ src/shmem/p2p_typed.c.tpl diff --git a/src/shmem/coll.c b/src/shmem/coll.c index 0645138..d276bef 100644 --- a/src/shmem/coll.c +++ b/src/shmem/coll.c @@ -11,11 +11,6 @@ void shmem_barrier_all(void) OSHMPI_barrier_all(); } -void shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync) -{ - OSHMPI_barrier(PE_start, logPE_stride, PE_size); -} - int shmem_team_sync(shmem_team_t team) { OSHMPI_ASSERT(0); @@ -27,72 
+22,44 @@ void shmem_sync_all(void) OSHMPI_sync_all(); } -void shmem_sync(int PE_start, int logPE_stride, int PE_size, long *pSync) -{ - /* Deprecated API */ - OSHMPI_sync(PE_start, logPE_stride, PE_size); -} - -void shmem_broadcast32(void *dest, const void *source, size_t nelems, int PE_root, int PE_start, - int logPE_stride, int PE_size, long *pSync) -{ - OSHMPI_broadcast(dest, source, nelems, OSHMPI_MPI_COLL32_T, PE_root, PE_start, logPE_stride, - PE_size); -} - -void shmem_broadcast64(void *dest, const void *source, size_t nelems, int PE_root, int PE_start, - int logPE_stride, int PE_size, long *pSync) -{ - OSHMPI_broadcast(dest, source, nelems, OSHMPI_MPI_COLL64_T, PE_root, PE_start, logPE_stride, - PE_size); -} - -void shmem_collect32(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, - int PE_size, long *pSync) -{ - OSHMPI_collect(dest, source, nelems, OSHMPI_MPI_COLL32_T, PE_start, logPE_stride, PE_size); -} - -void shmem_collect64(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, - int PE_size, long *pSync) -{ - OSHMPI_collect(dest, source, nelems, OSHMPI_MPI_COLL64_T, PE_start, logPE_stride, PE_size); -} - -void shmem_fcollect32(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, - int PE_size, long *pSync) -{ - OSHMPI_fcollect(dest, source, nelems, OSHMPI_MPI_COLL32_T, PE_start, logPE_stride, PE_size); -} - -void shmem_fcollect64(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, - int PE_size, long *pSync) +int shmem_broadcastmem(shmem_team_t team, void *dest, const void *source, size_t nelems, + int PE_root) { - OSHMPI_fcollect(dest, source, nelems, OSHMPI_MPI_COLL64_T, PE_start, logPE_stride, PE_size); + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_broadcast_team(team_obj, dest, source, nelems, OSHMPI_MPI_COLL_BYTE_T, PE_root); + return SHMEM_SUCCESS; } -void shmem_alltoall32(void *dest, const void *source, size_t 
nelems, int PE_start, int logPE_stride, - int PE_size, long *pSync) +int shmem_collectmem(shmem_team_t team, void *dest, const void *source, size_t nelems) { - OSHMPI_alltoall(dest, source, nelems, OSHMPI_MPI_COLL32_T, PE_start, logPE_stride, PE_size); + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_collect_team(team_obj, dest, source, nelems, OSHMPI_MPI_COLL_BYTE_T); + return SHMEM_SUCCESS; } -void shmem_alltoall64(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, - int PE_size, long *pSync) +int shmem_fcollectmem(shmem_team_t team, void *dest, const void *source, size_t nelems) { - OSHMPI_alltoall(dest, source, nelems, OSHMPI_MPI_COLL64_T, PE_start, logPE_stride, PE_size); + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_fcollect_team(team_obj, dest, source, nelems, OSHMPI_MPI_COLL_BYTE_T); + return SHMEM_SUCCESS; } -void shmem_alltoalls32(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems, - int PE_start, int logPE_stride, int PE_size, long *pSync) +int shmem_alltoallmem(shmem_team_t team, void *dest, const void *source, size_t nelems) { - OSHMPI_alltoalls(dest, source, dst, sst, nelems, OSHMPI_MPI_COLL32_T, PE_start, logPE_stride, - PE_size); + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_alltoall_team(team_obj, dest, source, nelems, OSHMPI_MPI_COLL_BYTE_T); + return SHMEM_SUCCESS; } -void shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems, - int PE_start, int logPE_stride, int PE_size, long *pSync) +int shmem_alltoallsmem(shmem_team_t team, void *dest, const void *source, ptrdiff_t dst, + ptrdiff_t sst, size_t nelems) { - OSHMPI_alltoalls(dest, source, dst, sst, nelems, OSHMPI_MPI_COLL64_T, PE_start, logPE_stride, - PE_size); + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_alltoalls_team(team_obj, dest, source, dst, sst, nelems, OSHMPI_MPI_COLL_BYTE_T); + 
return SHMEM_SUCCESS; } diff --git a/src/shmem/coll_activeset.c b/src/shmem/coll_activeset.c new file mode 100644 index 0000000..d73ff2e --- /dev/null +++ b/src/shmem/coll_activeset.c @@ -0,0 +1,82 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * (C) 2022 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + */ +#include <shmem.h> +#include "oshmpi_impl.h" + +void shmem_barrier(int PE_start, int logPE_stride, int PE_size, long *pSync) +{ + OSHMPI_barrier(PE_start, logPE_stride, PE_size); +} + +void shmem_sync_aset(int PE_start, int logPE_stride, int PE_size, long *pSync) +{ + /* Deprecated API */ + OSHMPI_sync(PE_start, logPE_stride, PE_size); +} + +void shmem_broadcast32(void *dest, const void *source, size_t nelems, int PE_root, int PE_start, + int logPE_stride, int PE_size, long *pSync) +{ + OSHMPI_broadcast(dest, source, nelems, OSHMPI_MPI_COLL32_T, PE_root, PE_start, logPE_stride, + PE_size); +} + +void shmem_broadcast64(void *dest, const void *source, size_t nelems, int PE_root, int PE_start, + int logPE_stride, int PE_size, long *pSync) +{ + OSHMPI_broadcast(dest, source, nelems, OSHMPI_MPI_COLL64_T, PE_root, PE_start, logPE_stride, + PE_size); +} + +void shmem_collect32(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, + int PE_size, long *pSync) +{ + OSHMPI_collect(dest, source, nelems, OSHMPI_MPI_COLL32_T, PE_start, logPE_stride, PE_size); +} + +void shmem_collect64(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, + int PE_size, long *pSync) +{ + OSHMPI_collect(dest, source, nelems, OSHMPI_MPI_COLL64_T, PE_start, logPE_stride, PE_size); +} + +void shmem_fcollect32(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, + int PE_size, long *pSync) +{ + OSHMPI_fcollect(dest, source, nelems, OSHMPI_MPI_COLL32_T, PE_start, logPE_stride, PE_size); +} + +void shmem_fcollect64(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, + int
PE_size, long *pSync) +{ + OSHMPI_fcollect(dest, source, nelems, OSHMPI_MPI_COLL64_T, PE_start, logPE_stride, PE_size); +} + +void shmem_alltoall32(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, + int PE_size, long *pSync) +{ + OSHMPI_alltoall(dest, source, nelems, OSHMPI_MPI_COLL32_T, PE_start, logPE_stride, PE_size); +} + +void shmem_alltoall64(void *dest, const void *source, size_t nelems, int PE_start, int logPE_stride, + int PE_size, long *pSync) +{ + OSHMPI_alltoall(dest, source, nelems, OSHMPI_MPI_COLL64_T, PE_start, logPE_stride, PE_size); +} + +void shmem_alltoalls32(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems, + int PE_start, int logPE_stride, int PE_size, long *pSync) +{ + OSHMPI_alltoalls(dest, source, dst, sst, nelems, OSHMPI_MPI_COLL32_T, PE_start, logPE_stride, + PE_size); +} + +void shmem_alltoalls64(void *dest, const void *source, ptrdiff_t dst, ptrdiff_t sst, size_t nelems, + int PE_start, int logPE_stride, int PE_size, long *pSync) +{ + OSHMPI_alltoalls(dest, source, dst, sst, nelems, OSHMPI_MPI_COLL64_T, PE_start, logPE_stride, + PE_size); +} diff --git a/src/shmem/coll_typed.c.tpl b/src/shmem/coll_typed.c.tpl new file mode 100644 index 0000000..f514b5e --- /dev/null +++ b/src/shmem/coll_typed.c.tpl @@ -0,0 +1,50 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * (C) 2022 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. 
+ */ +#include +#include "oshmpi_impl.h" + +/* TPL_BLOCK_START */ + +int shmem_TYPENAME_broadcast(shmem_team_t team, TYPE * dest, const TYPE * source, size_t nelems, + int PE_root) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_broadcast_team(team_obj, dest, source, nelems, MPI_TYPE, PE_root); + return SHMEM_SUCCESS; +} +int shmem_TYPENAME_collect(shmem_team_t team, TYPE * dest, const TYPE * source, size_t nelems) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_collect_team(team_obj, dest, source, nelems, MPI_TYPE); + return SHMEM_SUCCESS; +} +int shmem_TYPENAME_fcollect(shmem_team_t team, TYPE * dest, const TYPE * source, size_t nelems) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_fcollect_team(team_obj, dest, source, nelems, MPI_TYPE); + return SHMEM_SUCCESS; +} + +int shmem_TYPENAME_alltoall(shmem_team_t team, TYPE * dest, const TYPE * source, size_t nelems) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_alltoall_team(team_obj, dest, source, nelems, MPI_TYPE); + return SHMEM_SUCCESS; +} + +int shmem_TYPENAME_alltoalls(shmem_team_t team, TYPE * dest, const TYPE * source, ptrdiff_t dst, + ptrdiff_t sst, size_t nelems) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_alltoalls_team(team_obj, dest, source, dst, sst, nelems, MPI_TYPE); + return SHMEM_SUCCESS; +} +/* TPL_BLOCK_END */ diff --git a/src/shmem/reduce_bitws_typed.c.tpl b/src/shmem/reduce_bitws_aset_typed.c.tpl similarity index 94% rename from src/shmem/reduce_bitws_typed.c.tpl rename to src/shmem/reduce_bitws_aset_typed.c.tpl index 9c621d9..932f322 100644 --- a/src/shmem/reduce_bitws_typed.c.tpl +++ b/src/shmem/reduce_bitws_aset_typed.c.tpl @@ -10,6 +10,7 @@ #include "oshmpi_impl.h" /* TPL_BLOCK_START */ +/* deprecated APIs */ void shmem_TYPENAME_and_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * 
pWrk, long *pSync) { @@ -27,4 +28,5 @@ void shmem_TYPENAME_xor_to_all(TYPE * dest, const TYPE * source, int nreduce, in { OSHMPI_allreduce(dest, source, nreduce, MPI_TYPE, MPI_BXOR, PE_start, logPE_stride, PE_size); } -/* TPL_BLOCK_END */ \ No newline at end of file +/* end of deprecation */ +/* TPL_BLOCK_END */ diff --git a/src/shmem/reduce_bitws_team_typed.c.tpl b/src/shmem/reduce_bitws_team_typed.c.tpl new file mode 100644 index 0000000..4285f71 --- /dev/null +++ b/src/shmem/reduce_bitws_team_typed.c.tpl @@ -0,0 +1,36 @@ +/* -*- Mode: C{} c-basic-offset:4 {} -*- */ +/* + * (C) 2022 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * This file is automatically generated. DO NOT EDIT. + */ + +#include +#include "oshmpi_impl.h" +/* TPL_BLOCK_START */ + +int shmem_TYPENAME_and_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_allreduce_team(team_obj, dest, source, nreduce, MPI_TYPE, MPI_BAND); + return SHMEM_SUCCESS; +} + +int shmem_TYPENAME_or_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_allreduce_team(team_obj, dest, source, nreduce, MPI_TYPE, MPI_BOR); + return SHMEM_SUCCESS; +} + +int shmem_TYPENAME_xor_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_allreduce_team(team_obj, dest, source, nreduce, MPI_TYPE, MPI_BXOR); + return SHMEM_SUCCESS; +} +/* TPL_BLOCK_END */ diff --git a/src/shmem/reduce_minmax_typed.c.tpl b/src/shmem/reduce_minmax_aset_typed.c.tpl similarity index 92% rename from src/shmem/reduce_minmax_typed.c.tpl rename to src/shmem/reduce_minmax_aset_typed.c.tpl index 10e3417..c7a2306 100644 --- a/src/shmem/reduce_minmax_typed.c.tpl +++ b/src/shmem/reduce_minmax_aset_typed.c.tpl @@ -10,6 +10,7 @@ #include 
"oshmpi_impl.h" /* TPL_BLOCK_START */ +/* deprecated APIs */ void shmem_TYPENAME_min_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * pWrk, long *pSync) { @@ -21,4 +22,5 @@ void shmem_TYPENAME_max_to_all(TYPE * dest, const TYPE * source, int nreduce, in { OSHMPI_allreduce(dest, source, nreduce, MPI_TYPE, MPI_MAX, PE_start, logPE_stride, PE_size); } -/* TPL_BLOCK_END */ \ No newline at end of file +/* end of deprecation */ +/* TPL_BLOCK_END */ diff --git a/src/shmem/reduce_minmax_team_typed.c.tpl b/src/shmem/reduce_minmax_team_typed.c.tpl new file mode 100644 index 0000000..39e7d5a --- /dev/null +++ b/src/shmem/reduce_minmax_team_typed.c.tpl @@ -0,0 +1,28 @@ +/* -*- Mode: C{} c-basic-offset:4 {} -*- */ +/* + * (C) 2022 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * This file is automatically generated. DO NOT EDIT. + */ + +#include +#include "oshmpi_impl.h" +/* TPL_BLOCK_START */ + +int shmem_TYPENAME_min_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_allreduce_team(team_obj, dest, source, nreduce, MPI_TYPE, MPI_MIN); + return SHMEM_SUCCESS; +} + +int shmem_TYPENAME_max_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_allreduce_team(team_obj, dest, source, nreduce, MPI_TYPE, MPI_MAX); + return SHMEM_SUCCESS; +} +/* TPL_BLOCK_END */ diff --git a/src/shmem/reduce_sumprod_typed.c.tpl b/src/shmem/reduce_sumprod_aset_typed.c.tpl similarity index 92% rename from src/shmem/reduce_sumprod_typed.c.tpl rename to src/shmem/reduce_sumprod_aset_typed.c.tpl index 8c07ca2..47324fd 100644 --- a/src/shmem/reduce_sumprod_typed.c.tpl +++ b/src/shmem/reduce_sumprod_aset_typed.c.tpl @@ -10,6 +10,7 @@ #include "oshmpi_impl.h" /* TPL_BLOCK_START */ +/* deprecated APIs */ void
shmem_TYPENAME_sum_to_all(TYPE * dest, const TYPE * source, int nreduce, int PE_start, int logPE_stride, int PE_size, TYPE * pWrk, long *pSync) { @@ -21,4 +22,5 @@ void shmem_TYPENAME_prod_to_all(TYPE * dest, const TYPE * source, int nreduce, i { OSHMPI_allreduce(dest, source, nreduce, MPI_TYPE, MPI_PROD, PE_start, logPE_stride, PE_size); } -/* TPL_BLOCK_END */ \ No newline at end of file +/* end of deprecation */ +/* TPL_BLOCK_END */ diff --git a/src/shmem/reduce_sumprod_team_typed.c.tpl b/src/shmem/reduce_sumprod_team_typed.c.tpl new file mode 100644 index 0000000..7f37648 --- /dev/null +++ b/src/shmem/reduce_sumprod_team_typed.c.tpl @@ -0,0 +1,28 @@ +/* -*- Mode: C{} c-basic-offset:4 {} -*- */ +/* + * (C) 2022 by Argonne National Laboratory. + * See COPYRIGHT in top-level directory. + * + * This file is automatically generated. DO NOT EDIT. + */ + +#include +#include "oshmpi_impl.h" +/* TPL_BLOCK_START */ + +int shmem_TYPENAME_sum_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_allreduce_team(team_obj, dest, source, nreduce, MPI_TYPE, MPI_SUM); + return SHMEM_SUCCESS; +} + +int shmem_TYPENAME_prod_reduce(shmem_team_t team, TYPE * dest, const TYPE * source, int nreduce) +{ + OSHMPI_team_t *team_obj; + OSHMPI_TEAM_GET_OBJ(team, team_obj); + OSHMPI_allreduce_team(team_obj, dest, source, nreduce, MPI_TYPE, MPI_PROD); + return SHMEM_SUCCESS; +} +/* TPL_BLOCK_END */