From c938e003ccc136e79dccd93f548ff295f2c69068 Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 8 Mar 2022 16:09:13 -0600 Subject: [PATCH 01/12] New comm_t object --- include/comm.hpp | 567 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 567 insertions(+) create mode 100644 include/comm.hpp diff --git a/include/comm.hpp b/include/comm.hpp new file mode 100644 index 0000000..7781da0 --- /dev/null +++ b/include/comm.hpp @@ -0,0 +1,567 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef LIBP_COMM_HPP +#define LIBP_COMM_HPP + +#include +#include "core.hpp" + +namespace libp { + +#define MAX_PROCESSOR_NAME MPI_MAX_PROCESSOR_NAME + +/*Generic data type*/ +template +struct mpiType { + static MPI_Datatype getMpiType() { + MPI_Datatype type; + MPI_Type_contiguous(sizeof(T), MPI_CHAR, &type); + MPI_Type_commit(&type); + return type; + } + static void freeMpiType(MPI_Datatype type) { + MPI_Type_free(&type); + } + static constexpr bool isMpiType() { return false; } +}; + +/*Pre-defined MPI datatypes*/ +#define TYPE(T, MPI_T) \ +template<> struct mpiType { \ + static MPI_Datatype getMpiType() { return MPI_T; } \ + static void freeMpiType(MPI_Datatype type) { } \ + static constexpr bool isMpiType() { return true; } \ +} + +TYPE(char, MPI_CHAR); +TYPE(int, MPI_INT); +TYPE(long long int, MPI_LONG_LONG_INT); +TYPE(float, MPI_FLOAT); +TYPE(double, MPI_DOUBLE); +#undef TYPE + +/*Communicator class*/ +class comm_t { + + private: + std::shared_ptr comm_ptr; + int _rank=0; + int _size=0; + + public: + comm_t() = default; + comm_t(const comm_t &c) = default; + comm_t& operator = (const comm_t &c)=default; + + /*Static MPI_Init and MPI_Finalize*/ + static void Init(int &argc, char** &argv); + static void Finalize(); + + /*Static handle to MPI_COMM_WORLD*/ + static comm_t world(); + + /*MPI_Comm_dup and MPI_Comm_delete*/ + comm_t Dup() const; + comm_t Split(const int color, const int key) const; + void Free(); + + /*Rank and size getters*/ + const int rank() const; + const int size() const; + + /*MPI_Comm getter*/ + MPI_Comm comm() const; + + using request_t = MPI_Request; + + /*Predefined ops*/ + using op_t = MPI_Op; + static constexpr op_t Max = MPI_MAX; + static constexpr op_t Min = MPI_MIN; + static constexpr op_t Sum = MPI_SUM; + static constexpr op_t Prod = MPI_PROD; + static constexpr op_t And = MPI_LAND; + static constexpr op_t Or = MPI_LOR; + static constexpr op_t Xor = MPI_LXOR; + + /*libp::memory send*/ + template class mem, typename T> + void Send(mem m, + const int dest, + const int count=-1, + const int tag=0) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + MPI_Send(m.ptr(), cnt, type, dest, tag, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory recv*/ + template class mem, typename T> + void Recv(mem m, + const int source, + const int count=-1, + const int tag=0) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + MPI_Recv(m.ptr(), cnt, type, source, tag, comm()); + mpiType::freeMpiType(type); + } + + /*scalar send*/ + template + void Send(T& val, + const int dest, + const int tag=0) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Send(&val, 1, type, dest, tag, comm()); + mpiType::freeMpiType(type); + } + + /*scalar recv*/ + template + void Recv(T& val, + const int source, + const int tag=0) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Recv(&val, 1, type, source, tag, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory non-blocking send*/ + template class mem, typename T> + void Isend(mem m, + const int dest, + const int count, + const int tag, + request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Isend(m.ptr(), count, type, dest, tag, comm(), &request); + mpiType::freeMpiType(type); + } + + /*libp::memory non-blocking recv*/ + template class mem, typename T> + void Irecv(mem m, + const int source, + const int count, + const int tag, + request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Irecv(m.ptr(), count, type, source, tag, comm(), &request); + mpiType::freeMpiType(type); + } + + /*scalar non-blocking send*/ + template + void Isend(T& val, + const int dest, + const int tag, + request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Isend(&val, 1, type, dest, tag, comm(), &request); + mpiType::freeMpiType(type); + } + + /*scalar non-blocking recv*/ + template + void Irecv(T& val, + const int source, + const int tag, + request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Irecv(&val, 1, type, source, tag, comm(), &request); + mpiType::freeMpiType(type); + } + + /*libp::memory broadcast*/ + template class mem, typename T> + void Bcast(mem m, + const int root, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + MPI_Bcast(m.ptr(), cnt, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*scalar broadcast*/ + template + void Bcast(T& val, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Bcast(&val, 1, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory reduce*/ + template class mem, typename T> + void Reduce(const mem snd, + mem rcv, + const int root, + const op_t op = Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(snd.length()) : count; + MPI_Reduce(snd.ptr(), rcv.ptr(), cnt, type, op, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory in-place reduce*/ + template class mem, typename T> + void Reduce(mem m, + const int root, + const op_t op = Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + if (_rank==root) { + MPI_Reduce(MPI_IN_PLACE, m.ptr(), cnt, type, op, root, comm()); + } else { + MPI_Reduce(m.ptr(), nullptr, cnt, type, op, root, comm()); + } + mpiType::freeMpiType(type); + } + + /*scalar reduce*/ + template + void Reduce(const T& snd, + T& rcv, + const int root, + const op_t op = Sum) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Reduce(&snd, &rcv, 1, type, op, root, comm()); + mpiType::freeMpiType(type); + } + template + void Reduce(T& val, + const int root, + const op_t op = Sum) const { + T rcv=val; + Reduce(val, rcv, root, op); + if (rank()==root) val=rcv; + } + + /*libp::memory allreduce*/ + template class mem, typename T> + void Allreduce(const mem snd, + mem rcv, + const op_t op = Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(snd.length()) : count; + MPI_Allreduce(snd.ptr(), rcv.ptr(), cnt, type, op, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory in-place allreduce*/ + template class mem, typename T> + void Allreduce(mem m, + const op_t op = Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + MPI_Allreduce(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm()); + mpiType::freeMpiType(type); + } + + /*scalar allreduce*/ + template + void Allreduce(const T& snd, + T& rcv, + const op_t op = Sum) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Allreduce(&snd, &rcv, 1, type, op, comm()); + mpiType::freeMpiType(type); + } + template + void Allreduce(T& val, + const op_t op = Sum) const { + T rcv=val; + Allreduce(val, rcv, op); + val = rcv; + } + + /*libp::memory non-blocking allreduce*/ + template class mem, typename T> + void Iallreduce(const mem snd, + mem rcv, + const op_t op, + const int count, + request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Iallreduce(snd.ptr(), rcv.ptr(), count, type, op, comm(), &request); + mpiType::freeMpiType(type); + } + + /*libp::memory non-blocking in-place allreduce*/ + template class mem, typename T> + void Iallreduce(mem m, + const int root, + const op_t op, + const int count, + request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Iallreduce(MPI_IN_PLACE, m.ptr(), count, type, op, comm(), &request); + mpiType::freeMpiType(type); + } + + /*scalar non-blocking allreduce*/ + template class mem, typename T> + void Iallreduce(const T& snd, + T& rcv, + const op_t op, + request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Iallreduce(&snd, &rcv, 1, type, op, comm(), &request); + mpiType::freeMpiType(type); + } + /*scalar non-blocking in-place allreduce*/ + template class mem, typename T> + void Iallreduce(T& val, + const op_t op, + request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Iallreduce(MPI_IN_PLACE, &val, 1, type, op, comm(), &request); + mpiType::freeMpiType(type); + } + + /*libp::memory scan*/ + template class mem, typename T> + void Scan(const mem snd, + mem rcv, + const op_t op = Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(snd.length()) : count; + MPI_Scan(snd.ptr(), rcv.ptr(), cnt, type, op, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory in-place scan*/ + template class mem, typename T> + void Scan(mem m, + const op_t op = Sum, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(m.length()) : count; + MPI_Scan(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm()); + mpiType::freeMpiType(type); + } + + /*scalar scan*/ + template + void Scan(const T& snd, + T& rcv, + const op_t op = Sum) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Scan(&snd, &rcv, 1, type, op, comm()); + mpiType::freeMpiType(type); + } + template + void Scan(T& snd, + const op_t op = Sum) const { + T rcv=snd; + Scan(snd, rcv, op); + snd = rcv; + } + + /*libp::memory gather*/ + template class mem, typename T> + void Gather(const mem snd, + mem rcv, + const int root, + const int sendCount=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (sendCount==-1) ? static_cast(snd.length()) : sendCount; + MPI_Gather(snd.ptr(), cnt, type, + rcv.ptr(), cnt, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory gatherv*/ + template class mem, typename T> + void Gatherv(const mem snd, + const int sendcount, + mem rcv, + const memory recvCounts, + const memory recvOffsets, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Gatherv(snd.ptr(), sendcount, type, + rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type, + root, comm()); + mpiType::freeMpiType(type); + } + + /*scalar gather*/ + template class mem, typename T> + void Gather(const T& snd, + mem rcv, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Gather(&snd, 1, type, + rcv.ptr(), 1, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory scatter*/ + template class mem, typename T> + void Scatter(const mem snd, + mem rcv, + const int root, + const int count=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (count==-1) ? static_cast(rcv.length()) : count; + MPI_Scatter(snd.ptr(), cnt, type, + rcv.ptr(), cnt, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory scatterv*/ + template class mem, typename T> + void Scatterv(const mem snd, + const memory sendCounts, + const memory sendOffsets, + mem rcv, + const int recvcount, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Scatterv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type, + rcv.ptr(), recvcount, type, + root, comm()); + mpiType::freeMpiType(type); + } + + /*scalar scatter*/ + template class mem, typename T> + void Scatter(T& rcv, + const mem snd, + const int root) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Scatter(snd.ptr, 1, type, + &rcv, 1, type, root, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory allgather*/ + template class mem, typename T> + void Allgather(const mem snd, + mem rcv, + const int sendCount=-1) const { + MPI_Datatype type = mpiType::getMpiType(); + const int cnt = (sendCount==-1) ? static_cast(snd.length()) : sendCount; + MPI_Allgather(snd.ptr(), cnt, type, + rcv.ptr(), cnt, type, comm()); + mpiType::freeMpiType(type); + } + template class mem, typename T> + void Allgather(mem m, + const int cnt) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Allgather(MPI_IN_PLACE, cnt, type, + m.ptr(), cnt, type, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory allgatherv*/ + template class mem, typename T> + void Allgatherv(const mem snd, + const int sendcount, + mem rcv, + const memory recvCounts, + const memory recvOffsets) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Allgatherv(snd.ptr(), sendcount, type, + rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type, + comm()); + mpiType::freeMpiType(type); + } + + /*scalar allgather*/ + template class mem, typename T> + void Allgather(const T& snd, + mem rcv) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Allgather(&snd, 1, type, + rcv.ptr(), 1, type, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory alltoall*/ + template class mem, typename T> + void Alltoall(const mem snd, + mem rcv, + const int cnt=1) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Alltoall(snd.ptr(), cnt, type, + rcv.ptr(), cnt, type, comm()); + mpiType::freeMpiType(type); + } + + /*libp::memory alltoallv*/ + template class mem, typename T> + void Alltoallv(const mem snd, + const memory sendCounts, + const memory sendOffsets, + mem rcv, + const memory recvCounts, + const memory recvOffsets) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Alltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type, + rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type, + comm()); + mpiType::freeMpiType(type); + } + + template class mem, typename T> + void Ialltoallv(const mem snd, + const memory sendCounts, + const memory sendOffsets, + mem rcv, + const memory recvCounts, + const memory recvOffsets, + request_t &request) const { + MPI_Datatype type = mpiType::getMpiType(); + MPI_Ialltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type, + rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type, + comm(), &request); + mpiType::freeMpiType(type); + } + + void Wait(request_t &request) const; + void Waitall(const int count, memory &requests) const; + void Barrier() const; + + static void GetProcessorName(char* name, int &namelen) { + MPI_Get_processor_name(name,&namelen); + } +}; + +} //namespace libp + +#endif From 395bb934a4a048f596bdb2d67c6b6755640f4097 Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 8 Mar 2022 16:09:36 -0600 Subject: [PATCH 02/12] New comm_t object --- libs/core/comm.cpp | 110 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 libs/core/comm.cpp diff --git a/libs/core/comm.cpp b/libs/core/comm.cpp new file mode 100644 index 0000000..02b33dc --- /dev/null +++ b/libs/core/comm.cpp @@ -0,0 +1,110 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "comm.hpp" + +namespace libp { + +/*Static MPI_Init and MPI_Finalize*/ +void comm_t::Init(int &argc, char** &argv) { MPI_Init(&argc, &argv); } +void comm_t::Finalize() { MPI_Finalize(); } + +/*Static handle to MPI_COMM_WORLD*/ +comm_t comm_t::world() { + comm_t c; + c.comm_ptr = std::make_shared(); + *(c.comm_ptr) = MPI_COMM_WORLD; + MPI_Comm_rank(c.comm(), &(c._rank)); + MPI_Comm_size(c.comm(), &(c._size)); + return c; +} + +/*MPI_Comm_dup and free*/ +comm_t comm_t::Dup() const { + comm_t c; + /*Make a new comm shared_ptr, which will call MPI_Comm_free when destroyed*/ + c.comm_ptr = std::shared_ptr(new MPI_Comm, + [](MPI_Comm *comm) { + if (*comm != MPI_COMM_NULL) + MPI_Comm_free(comm); + delete comm; + }); + MPI_Comm_dup(comm(), c.comm_ptr.get()); + MPI_Comm_rank(c.comm(), &(c._rank)); + MPI_Comm_size(c.comm(), &(c._size)); + return c; +} +void comm_t::Free() { + comm_ptr = nullptr; + _rank=0; + _size=0; +} +/*Split*/ +comm_t comm_t::Split(const int color, const int key) const { + comm_t c; + /*Make a new comm shared_ptr, which will call MPI_Comm_free when destroyed*/ + c.comm_ptr = std::shared_ptr(new MPI_Comm, + [](MPI_Comm *comm) { + if (*comm != MPI_COMM_NULL) + MPI_Comm_free(comm); + delete comm; + }); + + MPI_Comm_split(comm(), color, key, c.comm_ptr.get()); + MPI_Comm_rank(c.comm(), &(c._rank)); + MPI_Comm_size(c.comm(), &(c._size)); + return c; +} + +/*Rank and size getters*/ +const int comm_t::rank() const { + return _rank; +} +const int comm_t::size() const { + return _size; +} + +MPI_Comm comm_t::comm() const { + if (comm_ptr == nullptr) { + return MPI_COMM_NULL; + } else { + return *comm_ptr; + } +} + +void comm_t::Wait(request_t &request) const { + MPI_Wait(&request, MPI_STATUS_IGNORE); +} + +void comm_t::Waitall(const int count, memory &requests) const { + MPI_Waitall(count, requests.ptr(), MPI_STATUSES_IGNORE); +} + +void comm_t::Barrier() const { + MPI_Barrier(comm()); +} + +} //namespace libp From 79783f9f534a761ed0098fbdf3bafcfabcb6f05a Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 8 Mar 2022 16:09:58 -0600 Subject: [PATCH 03/12] New timer wrapper --- include/timer.hpp | 56 ++++++++++++++++++++++++++++++++++++++++++ libs/core/timer.cpp | 60 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 include/timer.hpp create mode 100644 libs/core/timer.cpp diff --git a/include/timer.hpp b/include/timer.hpp new file mode 100644 index 0000000..15a7067 --- /dev/null +++ b/include/timer.hpp @@ -0,0 +1,56 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef LIBP_TIMER_HPP +#define LIBP_TIMER_HPP + +#include "core.hpp" +#include "comm.hpp" +#include "platform.hpp" +#include + +namespace libp { + +using timePoint_t = std::chrono::time_point; + +/* Host time*/ +timePoint_t Time(); + +/* Host time after global sync*/ +timePoint_t GlobalTime(comm_t comm); + +/* Host time after platform sync*/ +timePoint_t PlatformTime(platform_t &platform); + +/* Host time after platform sync*/ +timePoint_t GlobalPlatformTime(platform_t &platform); + +/*Time between time points, in seconds*/ +double ElapsedTime(const timePoint_t start, const timePoint_t end); + +} //namespace libp + +#endif diff --git a/libs/core/timer.cpp b/libs/core/timer.cpp new file mode 100644 index 0000000..cb9cfaf --- /dev/null +++ b/libs/core/timer.cpp @@ -0,0 +1,60 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "timer.hpp" + +namespace libp { + +/* Host time*/ +timePoint_t Time() { + return std::chrono::high_resolution_clock::now(); +} + +/* Host time after global sync*/ +timePoint_t GlobalTime(comm_t comm) { + comm.Barrier(); + return Time(); +} + +/* Host time after platform sync*/ +timePoint_t PlatformTime(platform_t &platform) { + platform.finish(); + return Time(); +} + +/* Host time after platform sync*/ +timePoint_t GlobalPlatformTime(platform_t &platform) { + platform.finish(); + platform.comm.Barrier(); + return Time(); +} + +/*Time between time points, in seconds*/ +double ElapsedTime(const timePoint_t start, const timePoint_t end) { + return std::chrono::duration_cast(end-start).count()/(1.0e6); +} + +} //namespace libp From b7af4f78c49bddfa144cca84faed6fc1c42ee1f0 Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 8 Mar 2022 16:10:23 -0600 Subject: [PATCH 04/12] Better exception handling --- include/utils.hpp | 104 +++++++++++++++++++++++++++------------- libs/core/exception.cpp | 90 ++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 32 deletions(-) create mode 100644 libs/core/exception.cpp diff --git a/include/utils.hpp b/include/utils.hpp index bd4c645..4fdc85b 100644 --- a/include/utils.hpp +++ b/include/utils.hpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -27,48 +27,88 @@ SOFTWARE. #ifndef UTILS_HPP #define UTILS_HPP -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "types.h" namespace libp { +using properties_t = occa::json; +using device_t = occa::device; +using kernel_t = occa::kernel; +using stream_t = occa::stream; + //error codes -#define HIPBONE_SUCCESS 0 -#define HIPBONE_ERROR -1 +#define LIBP_SUCCESS 0 +#define LIBP_ERROR -1 #ifndef __PRETTY_FUNCTION__ # define __PRETTY_FUNCTION__ __FUNCTION__ #endif -#define HIPBONE_ABORT2(filename, function, line, message) \ - { \ - std::string banner = "---[ Error ]"; \ - std::cerr << '\n' \ - << std::string(74, '=') << '\n' \ - << banner << std::string(74 - banner.size(), '-') << '\n' \ - << " File : " << filename << '\n' \ - << " Line : " << line << '\n' \ - << " Function : " << function << '\n' \ - << " Message : " << message << '\n' \ - << std::string(74, '=') << '\n'; \ - MPI_Abort(MPI_COMM_WORLD,HIPBONE_ERROR); \ - } -#define HIPBONE_ABORT(message) HIPBONE_ABORT2(__FILE__, __PRETTY_FUNCTION__, __LINE__, message) - -#define HIPBONE_WARNING(message) \ - { \ - std::string banner = "---[ Warning ]"; \ - std::cerr << '\n' \ - << std::string(74, '=') << '\n' \ - << banner << std::string(74 - banner.size(), '-') << '\n' \ - << " " << message << '\n' \ - << std::string(74, '=') << '\n'; \ - } - -#define mymax(a,b) (((a)>(b))?(a):(b)) -#define mymin(a,b) (((a)<(b))?(a):(b)) +#define LIBP_TEMPLATE_CHECK(checkFunction, expr, filename, function, line, message) \ + do { \ + const bool isErr = (bool) (expr); \ + if (isErr) { \ + std::stringstream _check_ss; \ + _check_ss << message; \ + checkFunction(filename, function, line, _check_ss.str()); \ + } \ + } while (false) + +#define LIBP_ABORT3(expr, filename, function, line, message) LIBP_TEMPLATE_CHECK(libp::abort, expr, filename, function, line, message) +#define LIBP_ABORT2(expr, filename, function, line, message) LIBP_ABORT3(expr, filename, function, line, message) +#define LIBP_ABORT(message, expr) LIBP_ABORT2(expr, __FILE__, __PRETTY_FUNCTION__, __LINE__, message) + +#define LIBP_WARNING3(expr, filename, function, line, message) LIBP_TEMPLATE_CHECK(libp::warn, expr, filename, function, line, message) +#define LIBP_WARNING2(expr, filename, function, line, message) LIBP_WARNING3(expr, filename, function, line, message) +#define LIBP_WARNING(message, expr) LIBP_WARNING2(expr, __FILE__, __PRETTY_FUNCTION__, __LINE__, message) + +#define LIBP_FORCE_ABORT(message) LIBP_ABORT(message, true) +#define LIBP_FORCE_WARNING(message) LIBP_WARNING(message, true) + +class exception : public std::exception { + public: + const std::string header; + const std::string filename; + const std::string function; + const std::string message; + const int line; + + std::string exceptionMessage; + + exception(const std::string &header_, + const std::string &filename_, + const std::string &function_, + const int line_, + const std::string &message_); + ~exception() throw(); + + const char* what() const throw(); + std::string toString() const; + std::string location() const; +}; + +std::ostream& operator << (std::ostream& out, + const exception &exc); + +void abort(const std::string &filename, + const std::string &function, + const int line, + const std::string &message); + +void warn(const std::string &filename, + const std::string &function, + const int line, + const std::string &message); } //namespace libp diff --git a/libs/core/exception.cpp b/libs/core/exception.cpp new file mode 100644 index 0000000..5fd60d8 --- /dev/null +++ b/libs/core/exception.cpp @@ -0,0 +1,90 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017-2021 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "utils.hpp" + +namespace libp { + + +exception::exception(const std::string &header_, + const std::string &filename_, + const std::string &function_, + const int line_, + const std::string &message_) : + header(header_), + filename(filename_), + function(function_), + message(message_), + line(line_), + exceptionMessage(toString()) {} + +exception::~exception() throw() {} + +const char* exception::what() const throw() { + return exceptionMessage.c_str(); +} + +std::string exception::toString() const { + std::stringstream ss; + std::string banner = "---[ " + header + " ]"; + ss << '\n' + << banner << std::string(80 - banner.size(), '-') << '\n' + << location() + << " Message : " << message << '\n' + << std::string(80, '=') << '\n'; + return ss.str(); +} + +std::string exception::location() const { + std::stringstream ss; + ss << " File : " << filename << '\n' + << " Line : " << line << '\n' + << " Function : " << function << '\n'; + return ss.str(); +} + +std::ostream& operator << (std::ostream& out, + const exception &exc) { + out << exc.toString() << std::flush; + return out; +} + +void abort(const std::string &filename, + const std::string &function, + const int line, + const std::string &message) { + throw exception("Error", filename, function, line, message); +} + +void warn(const std::string &filename, + const std::string &function, + const int line, + const std::string &message) { + exception exp("Warning", filename, function, line, message); + std::cout << exp; +} + +} //namespace libp From a3b2cf1b23ea211c07d59a450d82e2bc4134c406 Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 8 Mar 2022 16:11:03 -0600 Subject: [PATCH 05/12] Latest ogs lib from libp --- include/ogs.hpp | 190 +++++++----- include/ogs/ogsBase.hpp | 25 +- include/ogs/ogsExchange.hpp | 308 +++++++++++++------- include/ogs/ogsOperator.hpp | 125 ++++---- include/ogs/ogsUtils.hpp | 26 +- libs/ogs/ogs.cpp | 528 +++++++++++++++++++++++++--------- libs/ogs/ogsAllToAll.cpp | 210 +++++++------- libs/ogs/ogsAuto.cpp | 145 +++++++--- libs/ogs/ogsCrystalRouter.cpp | 295 +++++++++++-------- libs/ogs/ogsHalo.cpp | 363 ++++++++++++++++------- libs/ogs/ogsOperator.cpp | 456 ++++++++++++++--------------- libs/ogs/ogsPairwise.cpp | 254 ++++++++-------- libs/ogs/ogsSetup.cpp | 208 +++++++------- libs/ogs/ogsUtils.cpp | 61 +--- 14 files changed, 1918 insertions(+), 1276 deletions(-) mode change 100644 => 100755 include/ogs.hpp diff --git a/include/ogs.hpp b/include/ogs.hpp old mode 100644 new mode 100755 index e97010a..cddd8e6 --- a/include/ogs.hpp +++ b/include/ogs.hpp @@ -29,9 +29,9 @@ SOFTWARE. The code - MPI_Comm comm; - dlong N; - hlong id[N]; // the hlong and dlong types are defined in "types.h" + comm_t comm; + dlong N; + memory id(N); // the hlong and dlong types are defined in "types.h" bool verbose; bool unique; ogs_t ogs(platform); @@ -57,9 +57,9 @@ SOFTWARE. A basic gatherScatter operation is, e.g., - occa::memory o_v; + deviceMemory o_v; ... - ogs.GatherScatter(o_v, 1, ogs::Double, ogs::Add, ogs::Sym); + ogs.GatherScatter(o_v, 1, ogs::Add, ogs::Sym); This gs call has the effect, @@ -74,14 +74,11 @@ SOFTWARE. Summation on doubles is not the only operation and datatype supported. Support includes the operations ogs::Add, ogs::Mul, ogs::Max, ogs::Min - and datatypes - ogs::Dfloat, ogs::Double, ogs::Float, ogs::Int32, ogs::Int64, ogs::Dlong, ogs::Hlong. - (The int32 and int64 types are the normal C++ types, whereas dfloat, dlong, and hlong - are defined in "types.h"). + and datatypes: float, double, int, long long int. For the nonsymmetric behavior, the "Transpose" parameter is important: - ogs.GatherScatter(o_v, 1, ogs::Double, ogs::Add, ogs::[NoTrans/Trans/Sym]); + ogs.GatherScatter(o_v, 1, ogs::Add, ogs::[NoTrans/Trans/Sym]); When transpose == ogs::NoTrans, any "flagged" (p,i) pairs (id[i] negative on p) do not participate in the sum, but *do* still receive the sum on output. @@ -102,7 +99,7 @@ SOFTWARE. consistent way. When all groups of (p,i) pairs have a single "unflagged" pair in this mannor, an additional nonsymmetric operation is available: - ogs.Gather(o_Gv, o_v, 1, ogs::Double, ogs::Add, ogs::Trans); + ogs.Gather(o_Gv, o_v, 1, ogs::Add, ogs::Trans); this has the effect of "assembling" the vector o_Gv. That is @@ -115,7 +112,7 @@ SOFTWARE. The inverse of this operation is - ogs.Scatter(o_v, o_Gv, 1, ogs::Double, ogs::Add, ogs::Trans); + ogs.Scatter(o_v, o_Gv, 1, ogs::Add, ogs::Trans); which has the effect of scattering in the assembled entries in o_Gv back to the orginal ordering. When Transpose == ogs::Trans, "flagged" (p,i) pairs (id[i] @@ -124,7 +121,7 @@ SOFTWARE. For operating on contiguously packed vectors, the K parameter is used, e.g., - ogs.GatherScatter(o_v, 3, ogs::Double, ogs::Add, ogs::Sym); + ogs.GatherScatter(o_v, 3, ogs::Add, ogs::Sym); which is like "GatherScatter" operating on the datatype double[3], with summation here being vector summation. Number of messages sent @@ -132,9 +129,9 @@ SOFTWARE. Asynchronous versions of the various GatherScatter functions are provided by - ogs.GatherScatterStart(o_v, k, ogs::Double, ogs::Add, ogs::Sym); + ogs.GatherScatterStart(o_v, k, ogs::Add, ogs::Sym); ... - ogs.GatherScatterFinish(o_v, k, ogs::Double, ogs::Add, ogs::Sym); + ogs.GatherScatterFinish(o_v, k, ogs::Add, ogs::Sym); MPI communication is not initiated in GatherScatterStart, rather some initial message packing and host<->device transfers are queued. The user can then queue @@ -154,14 +151,14 @@ SOFTWARE. halo_t halo(platofrm); halo.Setup(N, ids, comm, ogs::Auto, verbose); - halo.Exchange(o_v, k, ogs::Double); + halo.Exchange(o_v, k); which has the effect of filling all "flagged" pairs (p,i) on all processes with the corresponding value from the unique "unflagged" pair in S_j. An additional untility operation available in the halo_t object is - halo.Combine(o_v, k, ogs::Double); + halo.Combine(o_v, k); which has the effect of summing the entries in S_j and writing the result to the sole "unflagged" pair in S_j. @@ -222,94 +219,125 @@ class ogs_t : public ogsBase_t { ~ogs_t()=default; void Setup(const dlong _N, - hlong *ids, - MPI_Comm _comm, + memory ids, + comm_t _comm, const Kind _kind, const Method method, const bool _unique, const bool verbose, platform_t& _platform); - void SetupGlobalToLocalMapping(dlong *GlobalToLocal); + void SetupGlobalToLocalMapping(memory GlobalToLocal); - // host versions - void GatherScatter(void* v, + // Synchronous host versions + template + void GatherScatter(memory v, const int k, - const Type type, const Op op, const Transpose trans); + // Asynchronous host buffer versions + template + void GatherScatterStart (memory v, + const int k, + const Op op, + const Transpose trans); + template + void GatherScatterFinish(memory v, + const int k, + const Op op, + const Transpose trans); // Synchronous device buffer versions - void GatherScatter(occa::memory& o_v, + template + void GatherScatter(deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans); // Asynchronous device buffer versions - void GatherScatterStart (occa::memory& o_v, + template + void GatherScatterStart (deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans); - void GatherScatterFinish(occa::memory& o_v, + template + void GatherScatterFinish(deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans); - // host versions - void Gather(void* gv, - const void* v, + // Synchronous host versions + template + void Gather(memory gv, + const memory v, const int k, - const Type type, const Op op, const Transpose trans); + // Asynchronous host buffer versions + template + void GatherStart (memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans); + template + void GatherFinish(memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans); // Synchronous device buffer versions - void Gather(occa::memory& o_gv, - occa::memory& o_v, + template + void Gather(deviceMemory o_gv, + deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans); // Asynchronous device buffer versions - void GatherStart (occa::memory& o_gv, - occa::memory& o_v, + template + void GatherStart (deviceMemory o_gv, + deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans); - void GatherFinish(occa::memory& o_gv, - occa::memory& o_v, + template + void GatherFinish(deviceMemory o_gv, + deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans); - // host versions - void Scatter(void* v, - const void* gv, + // Synchronous host versions + template + void Scatter(memory v, + const memory gv, const int k, - const Type type, - const Op op, const Transpose trans); + // Asynchronous host buffer versions + template + void ScatterStart (memory v, + const memory gv, + const int k, + const Transpose trans); + template + void ScatterFinish(memory v, + memory gv, + const int k, + const Transpose trans); // Synchronous device buffer versions - void Scatter(occa::memory& o_v, - occa::memory& o_gv, + template + void Scatter(deviceMemory o_v, + deviceMemory o_gv, const int k, - const Type type, - const Op op, const Transpose trans); // Asynchronous device buffer versions - void ScatterStart (occa::memory& o_v, - occa::memory& o_gv, + template + void ScatterStart (deviceMemory o_v, + deviceMemory o_gv, const int k, - const Type type, - const Op op, const Transpose trans); - void ScatterFinish(occa::memory& o_v, - occa::memory& o_gv, + template + void ScatterFinish(deviceMemory o_v, + deviceMemory o_gv, const int k, - const Type type, - const Op op, const Transpose trans); friend class halo_t; @@ -325,29 +353,47 @@ class halo_t : public ogsBase_t { dlong Nhalo=0; void Setup(const dlong _N, - hlong *ids, - MPI_Comm _comm, + memory ids, + comm_t _comm, const Method method, const bool verbose, platform_t& _platform); void SetupFromGather(ogs_t& ogs); - // Host version - void Exchange(void *v, const int k, const Type type); + // Synchronous Host version + template + void Exchange(memory v, const int k); + // Asynchronous host version + template + void ExchangeStart (memory v, const int k); + template + void ExchangeFinish(memory v, const int k); // Synchronous device buffer version - void Exchange(occa::memory &o_v, const int k, const Type type); + template + void Exchange(deviceMemory o_v, const int k); // Asynchronous device buffer version - void ExchangeStart (occa::memory &o_v, const int k, const Type type); - void ExchangeFinish(occa::memory &o_v, const int k, const Type type); - - // Host version - void Combine(void *v, const int k, const Type type); + template + void ExchangeStart (deviceMemory o_v, const int k); + template + void ExchangeFinish(deviceMemory o_v, const int k); + + // Synchronous Host version + template + void Combine(memory v, const int k); + // Asynchronous host version + template + void CombineStart (memory v, const int k); + template + void CombineFinish(memory v, const int k); // Synchronous device buffer version - void Combine(occa::memory &o_v, const int k, const Type type); + template + void Combine(deviceMemory o_v, const int k); // Asynchronous device buffer version - void CombineStart (occa::memory &o_v, const int k, const Type type); - void CombineFinish(occa::memory &o_v, const int k, const Type type); + template + void CombineStart (deviceMemory o_v, const int k); + template + void CombineFinish(deviceMemory o_v, const int k); }; } //namespace ogs diff --git a/include/ogs/ogsBase.hpp b/include/ogs/ogsBase.hpp index 9c42be4..4962578 100644 --- a/include/ogs/ogsBase.hpp +++ b/include/ogs/ogsBase.hpp @@ -45,7 +45,7 @@ class halo_t; class ogsBase_t { public: platform_t platform; - MPI_Comm comm; + comm_t comm; dlong N=0; dlong Ngather=0; // total number of local positive gather nodes @@ -61,13 +61,14 @@ class ogsBase_t { bool unique=false; bool gather_defined=false; + static stream_t dataStream; + ogsBase_t()=default; - ogsBase_t(platform_t& _platform); virtual ~ogsBase_t()=default; virtual void Setup(const dlong _N, - hlong *ids, - MPI_Comm _comm, + memory ids, + comm_t _comm, const Kind _kind, const Method method, const bool _unique, @@ -84,22 +85,22 @@ class ogsBase_t { private: void FindSharedNodes(const dlong Nids, - libp::memory &nodes, + memory &nodes, const int verbose); void ConstructSharedNodes(const dlong Nids, - libp::memory &nodes, + memory &nodes, dlong &Nshared, - libp::memory &sharedNodes); + memory &sharedNodes); - void LocalSignedSetup(const dlong Nids, libp::memory &nodes); - void LocalUnsignedSetup(const dlong Nids, libp::memory &nodes); - void LocalHaloSetup(const dlong Nids, libp::memory &nodes); + void LocalSignedSetup(const dlong Nids, memory &nodes); + void LocalUnsignedSetup(const dlong Nids, memory &nodes); + void LocalHaloSetup(const dlong Nids, memory &nodes); ogsExchange_t* AutoSetup(dlong Nshared, - libp::memory &sharedNodes, + memory &sharedNodes, ogsOperator_t& gatherHalo, - MPI_Comm _comm, + comm_t _comm, platform_t &_platform, const int verbose); }; diff --git a/include/ogs/ogsExchange.hpp b/include/ogs/ogsExchange.hpp index f5bd9c6..4f80d7c 100644 --- a/include/ogs/ogsExchange.hpp +++ b/include/ogs/ogsExchange.hpp @@ -38,16 +38,16 @@ namespace ogs { class ogsExchange_t { public: platform_t platform; - MPI_Comm comm; + comm_t comm; int rank, size; dlong Nhalo, NhaloP; - char* haloBuf; - occa::memory o_haloBuf, h_haloBuf; + pinnedMemory h_workspace, h_sendspace; + deviceMemory o_workspace, o_sendspace; - static occa::stream dataStream; - static occa::kernel extractKernel[4]; + stream_t dataStream; + static kernel_t extractKernel[4]; #ifdef GPU_AWARE_MPI bool gpu_aware=true; @@ -55,27 +55,33 @@ class ogsExchange_t { bool gpu_aware=false; #endif - ogsExchange_t(platform_t &_platform, MPI_Comm _comm): + ogsExchange_t(platform_t &_platform, comm_t _comm, + stream_t _datastream): platform(_platform), - comm(_comm) { - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); - - if (!dataStream.isInitialized()) - dataStream = platform.device.createStream(); + comm(_comm), + dataStream(_datastream) { + rank = comm.rank(); + size = comm.size(); } virtual ~ogsExchange_t() {} - virtual void Start(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host=false)=0; - virtual void Finish(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host=false)=0; + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans)=0; + + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans)=0; virtual void AllocBuffer(size_t Nbytes)=0; @@ -87,46 +93,76 @@ class ogsAllToAll_t: public ogsExchange_t { private: dlong NsendN=0, NsendT=0; - libp::memory sendIdsN, sendIdsT; - occa::memory o_sendIdsN, o_sendIdsT; + memory sendIdsN, sendIdsT; + deviceMemory o_sendIdsN, o_sendIdsT; ogsOperator_t postmpi; - char* sendBuf; - occa::memory o_sendBuf; - occa::memory h_sendBuf; + memory mpiSendCountsN; + memory mpiSendCountsT; + memory mpiRecvCountsN; + memory mpiRecvCountsT; + memory mpiSendOffsetsN; + memory mpiSendOffsetsT; + memory mpiRecvOffsetsN; + memory mpiRecvOffsetsT; - libp::memory mpiSendCountsN; - libp::memory mpiSendCountsT; - libp::memory mpiRecvCountsN; - libp::memory mpiRecvCountsT; - libp::memory mpiSendOffsetsN; - libp::memory mpiSendOffsetsT; - libp::memory mpiRecvOffsetsN; - libp::memory mpiRecvOffsetsT; + memory sendCounts; + memory recvCounts; + memory sendOffsets; + memory recvOffsets; - libp::memory sendCounts; - libp::memory recvCounts; - libp::memory sendOffsets; - libp::memory recvOffsets; + comm_t::request_t request; public: ogsAllToAll_t(dlong Nshared, - libp::memory &sharedNodes, + memory &sharedNodes, ogsOperator_t &gatherHalo, - MPI_Comm _comm, + stream_t _dataStream, + comm_t _comm, platform_t &_platform); - virtual void Start(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host=false); - virtual void Finish(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host=false); + template + void Start(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + + template + void Start(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); virtual void AllocBuffer(size_t Nbytes); @@ -137,49 +173,76 @@ class ogsPairwise_t: public ogsExchange_t { private: dlong NsendN=0, NsendT=0; - libp::memory sendIdsN, sendIdsT; - occa::memory o_sendIdsN, o_sendIdsT; + memory sendIdsN, sendIdsT; + deviceMemory o_sendIdsN, o_sendIdsT; ogsOperator_t postmpi; - char* sendBuf; - occa::memory o_sendBuf; - occa::memory h_sendBuf; - int NranksSendN=0, NranksRecvN=0; int NranksSendT=0, NranksRecvT=0; - libp::memory sendRanksN; - libp::memory sendRanksT; - libp::memory recvRanksN; - libp::memory recvRanksT; - libp::memory sendCountsN; - libp::memory sendCountsT; - libp::memory recvCountsN; - libp::memory recvCountsT; - libp::memory sendOffsetsN; - libp::memory sendOffsetsT; - libp::memory recvOffsetsN; - libp::memory recvOffsetsT; - libp::memory requests; - libp::memory statuses; + memory sendRanksN; + memory sendRanksT; + memory recvRanksN; + memory recvRanksT; + memory sendCountsN; + memory sendCountsT; + memory recvCountsN; + memory recvCountsT; + memory sendOffsetsN; + memory sendOffsetsT; + memory recvOffsetsN; + memory recvOffsetsT; + memory requests; public: ogsPairwise_t(dlong Nshared, - libp::memory &sharedNodes, + memory &sharedNodes, ogsOperator_t &gatherHalo, - MPI_Comm _comm, + stream_t _dataStream, + comm_t _comm, platform_t &_platform); - virtual void Start(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host=false); - virtual void Finish(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host=false); + template + void Start(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + + template + void Start(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); virtual void AllocBuffer(size_t Nbytes); }; @@ -195,48 +258,73 @@ class ogsCrystalRouter_t: public ogsExchange_t { int Nsend, Nrecv0, Nrecv1; dlong recvOffset; - libp::memory sendIds; - occa::memory o_sendIds; + memory sendIds; + deviceMemory o_sendIds; ogsOperator_t gather; }; - int buf_id=0; - occa::memory o_buf[2]; - occa::memory h_buf[2]; - char* buf[2]; + int buf_id=0, hbuf_id=0; + pinnedMemory h_work[2]; + deviceMemory o_work[2]; - MPI_Request request[3]; - MPI_Status status[3]; + memory request; int Nlevels=0; - libp::memory levelsN; - libp::memory levelsT; + memory levelsN; + memory levelsT; int NsendMax=0, NrecvMax=0; - char* sendBuf; - char* recvBuf; - occa::memory o_sendBuf; - occa::memory h_sendBuf; - occa::memory o_recvBuf; public: ogsCrystalRouter_t(dlong Nshared, - libp::memory &sharedNodes, + memory &sharedNodes, ogsOperator_t &gatherHalo, - MPI_Comm _comm, + stream_t _dataStream, + comm_t _comm, platform_t &_platform); - virtual void Start(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host=false); - virtual void Finish(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host=false); + template + void Start(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(pinnedMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(pinnedMemory &buf,const int k,const Op op,const Transpose trans); + + template + void Start(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + template + void Finish(deviceMemory &buf, + const int k, + const Op op, + const Transpose trans); + + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Start(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); + virtual void Finish(deviceMemory &buf,const int k,const Op op,const Transpose trans); virtual void AllocBuffer(size_t Nbytes); }; diff --git a/include/ogs/ogsOperator.hpp b/include/ogs/ogsOperator.hpp index 601b5e1..bd33c6f 100644 --- a/include/ogs/ogsOperator.hpp +++ b/include/ogs/ogsOperator.hpp @@ -46,22 +46,22 @@ class ogsOperator_t { dlong nnzN=0; dlong nnzT=0; - libp::memory rowStartsN; - libp::memory rowStartsT; - libp::memory colIdsN; - libp::memory colIdsT; + memory rowStartsN; + memory rowStartsT; + memory colIdsN; + memory colIdsT; - occa::memory o_rowStartsN; - occa::memory o_rowStartsT; - occa::memory o_colIdsN; - occa::memory o_colIdsT; + deviceMemory o_rowStartsN; + deviceMemory o_rowStartsT; + deviceMemory o_colIdsN; + deviceMemory o_colIdsT; dlong NrowBlocksN=0; dlong NrowBlocksT=0; - libp::memory blockRowStartsN; - libp::memory blockRowStartsT; - occa::memory o_blockRowStartsN; - occa::memory o_blockRowStartsT; + memory blockRowStartsN; + memory blockRowStartsT; + deviceMemory o_blockRowStartsN; + deviceMemory o_blockRowStartsT; Kind kind; @@ -74,55 +74,49 @@ class ogsOperator_t { void setupRowBlocks(); //Apply Z operator - void Gather(occa::memory& o_gv, - occa::memory& o_v, - const int k, - const Type type, - const Op op, - const Transpose trans); - void Gather(void* gv, - const void* v, - const int k, - const Type type, - const Op op, - const Transpose trans); + template class U, + template class V, + typename T> + void Gather(U gv, const V v, + const int k, const Op op, const Transpose trans); + + template + void Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); //Apply Z^T transpose operator - void Scatter(occa::memory& o_v, - occa::memory& o_gv, - const int k, - const Type type, - const Op op, - const Transpose trans); - void Scatter(void* v, - const void* gv, - const int k, - const Type type, - const Op op, - const Transpose trans); + template class U, + template class V, + typename T> + void Scatter(U v, const V gv, + const int k, const Transpose trans); + + template + void Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); //Apply Z^T*Z operator - void GatherScatter(occa::memory& o_v, - const int k, - const Type type, - const Op op, - const Transpose trans); - void GatherScatter(void* v, - const int k, - const Type type, - const Op op, - const Transpose trans); + template class U, + typename T> + void GatherScatter(U v, const int k, + const Op op, const Transpose trans); + + template + void GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); private: - template class Op> - void Gather(T* gv, const T* v, - const int K, const Transpose trans); - template - void Scatter(T* v, const T* gv, - const int K, const Transpose trans); - template class Op> - void GatherScatter(T* v, + template class U, + template class V, + template class Op, + typename T> + void Gather(U gv, const V v, const int K, const Transpose trans); + template class U, + template class Op, + typename T> + void GatherScatter(U v, const int K, + const Transpose trans); //NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU static constexpr int blockSize = 256; @@ -130,26 +124,21 @@ class ogsOperator_t { //4 types - Float, Double, Int32, Int64 //4 ops - Add, Mul, Max, Min - static occa::kernel gatherScatterKernel[4][4]; - static occa::kernel gatherKernel[4][4]; - static occa::kernel scatterKernel[4]; + static kernel_t gatherScatterKernel[4][4]; + static kernel_t gatherKernel[4][4]; + static kernel_t scatterKernel[4]; friend void InitializeKernels(platform_t& platform, const Type type, const Op op); }; -template -void extract(const dlong N, - const int K, - const dlong *ids, - const T *q, - T *gatherq); - +template class U, + template class V, + typename T> void extract(const dlong N, const int K, - const Type type, - const dlong *ids, - const void *q, - void *gatherq); + const memory ids, + const U q, + V gatherq); } //namespace ogs diff --git a/include/ogs/ogsUtils.hpp b/include/ogs/ogsUtils.hpp index aba29f9..b25d649 100644 --- a/include/ogs/ogsUtils.hpp +++ b/include/ogs/ogsUtils.hpp @@ -33,11 +33,6 @@ namespace libp { namespace ogs { -extern MPI_Datatype MPI_PARALLELNODE_T; - -void InitMPIType(); -void DestroyMPIType(); - struct parallelNode_t{ dlong localId; // local node id @@ -51,13 +46,28 @@ struct parallelNode_t{ }; -size_t Sizeof(const Type type); -MPI_Datatype MPI_Type(const Type type); +template +struct ogsType { + static constexpr Type get(); +}; + +template<> struct ogsType { + static constexpr Type get() { return Float; } +}; +template<> struct ogsType { + static constexpr Type get() { return Double; } +}; +template<> struct ogsType { + static constexpr Type get() { return Int32; } +}; +template<> struct ogsType { + static constexpr Type get() { return Int64; } +}; //permute an array A, according to the ordering returned by P // i.e. for all n, A[P(n)] <- A[n] template -void permute(const dlong N, libp::memory A, Order P) { +void permute(const dlong N, memory A, Order P) { for(dlong n=0;n +void ogs_t::GatherScatter(deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans){ - GatherScatterStart (o_v, k, type, op, trans); - GatherScatterFinish(o_v, k, type, op, trans); + GatherScatterStart (o_v, k, op, trans); + GatherScatterFinish(o_v, k, op, trans); } -void ogs_t::GatherScatterStart(occa::memory& o_v, +template +void ogs_t::GatherScatterStart(deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans){ - exchange->AllocBuffer(k*Sizeof(type)); + exchange->AllocBuffer(k*sizeof(T)); + + deviceMemory o_haloBuf = exchange->o_workspace; //collect halo buffer - gatherHalo->Gather(exchange->o_haloBuf, o_v, - k, type, op, trans); + gatherHalo->Gather(o_haloBuf, o_v, k, op, trans); - //prepare MPI exchange - exchange->Start(k, type, op, trans); + if (exchange->gpu_aware) { + //prepare MPI exchange + exchange->Start(o_haloBuf, k, op, trans); + } else { + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + pinnedMemory haloBuf = exchange->h_workspace; + + //if not using gpu-aware mpi move the halo buffer to the host + const dlong Nhalo = (trans == NoTrans) ? NhaloP : NhaloT; + + //wait for o_haloBuf to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_haloBuf, Nhalo*k, + 0, "async: true"); + device.setStream(currentStream); + } } -void ogs_t::GatherScatterFinish(occa::memory& o_v, +template +void ogs_t::GatherScatterFinish(deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans){ //queue local gs operation - gatherLocal->GatherScatter(o_v, k, type, op, trans); + gatherLocal->GatherScatter(o_v, k, op, trans); - //finish MPI exchange - exchange->Finish(k, type, op, trans); + deviceMemory o_haloBuf = exchange->o_workspace; + + if (exchange->gpu_aware) { + //finish MPI exchange + exchange->Finish(o_haloBuf, k, op, trans); + } else { + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, op, trans); + exchange->Finish(haloBuf, k, op, trans); + + // copy recv back to device + const dlong Nhalo = (trans == Trans) ? NhaloP : NhaloT; + haloBuf.copyTo(o_haloBuf, Nhalo*k, + 0, "async: true"); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } //write exchanged halo buffer back to vector - gatherHalo->Scatter(o_v, exchange->o_haloBuf, - k, type, op, trans); + gatherHalo->Scatter(o_v, o_haloBuf, k, trans); } -void ogs_t::GatherScatter(void* v, +template +void ogs_t::GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(deviceMemory v, const int k, + const Op op, const Transpose trans); + +/******************************** + * Host GatherScatter + ********************************/ +template +void ogs_t::GatherScatter(memory v, const int k, - const Type type, const Op op, const Transpose trans){ - exchange->AllocBuffer(k*Sizeof(type)); + GatherScatterStart (v, k, op, trans); + GatherScatterFinish(v, k, op, trans); +} + +template +void ogs_t::GatherScatterStart(memory v, + const int k, + const Op op, + const Transpose trans){ + exchange->AllocBuffer(k*sizeof(T)); + + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; //collect halo buffer - gatherHalo->Gather(exchange->haloBuf, v, - k, type, op, trans); + gatherHalo->Gather(haloBuf, v, k, op, trans); //prepare MPI exchange - exchange->Start(k, type, op, trans, true); + exchange->Start(haloBuf, k, op, trans); +} + +template +void ogs_t::GatherScatterFinish(memory v, + const int k, + const Op op, + const Transpose trans){ + + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; //queue local gs operation - gatherLocal->GatherScatter(v, k, type, op, trans); + gatherLocal->GatherScatter(v, k, op, trans); //finish MPI exchange - exchange->Finish(k, type, op, trans, true); + exchange->Finish(haloBuf, k, op, trans); //write exchanged halo buffer back to vector - gatherHalo->Scatter(v, exchange->haloBuf, - k, type, op, trans); + gatherHalo->Scatter(v, haloBuf, k, trans); } +template +void ogs_t::GatherScatter(memory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(memory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(memory v, const int k, + const Op op, const Transpose trans); +template +void ogs_t::GatherScatter(memory v, const int k, + const Op op, const Transpose trans); + /******************************** - * Scalar Gather + * Device Gather ********************************/ -void ogs_t::Gather(occa::memory& o_gv, - occa::memory& o_v, +template +void ogs_t::Gather(deviceMemory o_gv, + deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans){ - GatherStart (o_gv, o_v, k, type, op, trans); - GatherFinish(o_gv, o_v, k, type, op, trans); + GatherStart (o_gv, o_v, k, op, trans); + GatherFinish(o_gv, o_v, k, op, trans); } -void ogs_t::GatherStart(occa::memory& o_gv, - occa::memory& o_v, +template +void ogs_t::GatherStart(deviceMemory o_gv, + deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans){ AssertGatherDefined(); + deviceMemory o_haloBuf = exchange->o_workspace; + if (trans==Trans) { //if trans!=ogs::Trans theres no comms required - exchange->AllocBuffer(k*Sizeof(type)); + exchange->AllocBuffer(k*sizeof(T)); //collect halo buffer - gatherHalo->Gather(exchange->o_haloBuf, o_v, - k, type, op, Trans); - - //prepare MPI exchange - exchange->Start(k, type, op, Trans); + gatherHalo->Gather(o_haloBuf, o_v, k, op, Trans); + + if (exchange->gpu_aware) { + //prepare MPI exchange + exchange->Start(o_haloBuf, k, op, Trans); + } else { + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //if not using gpu-aware mpi move the halo buffer to the host + pinnedMemory haloBuf = exchange->h_workspace; + + //wait for o_haloBuf to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_haloBuf, NhaloT*k, + 0, "async: true"); + device.setStream(currentStream); + } } else { //gather halo - occa::memory o_gvHalo = o_gv + k*NlocalT*Sizeof(type); - gatherHalo->Gather(o_gvHalo, o_v, - k, type, op, trans); + gatherHalo->Gather(o_gv + k*NlocalT, o_v, k, op, trans); } } -void ogs_t::GatherFinish(occa::memory& o_gv, - occa::memory& o_v, +template +void ogs_t::GatherFinish(deviceMemory o_gv, + deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans){ AssertGatherDefined(); + deviceMemory o_haloBuf = exchange->o_workspace; + //queue local g operation - gatherLocal->Gather(o_gv, o_v, - k, type, op, trans); + gatherLocal->Gather(o_gv, o_v, k, op, trans); if (trans==Trans) { //if trans!=ogs::Trans theres no comms required - //finish MPI exchange - exchange->Finish(k, type, op, Trans); - - //put the result at the end of o_gv - if (NhaloP) - exchange->o_haloBuf.copyTo(o_gv + k*NlocalT*Sizeof(type), - k*NhaloP*Sizeof(type), 0, 0, "async: true"); + if (exchange->gpu_aware) { + //finish MPI exchange + exchange->Finish(o_haloBuf, k, op, Trans); + + //put the result at the end of o_gv + o_haloBuf.copyTo(o_gv + k*NlocalT, + k*NhaloP, 0, "async: true"); + } else { + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, op, trans); + exchange->Finish(haloBuf, k, op, trans); + + // copy recv back to device + //put the result at the end of o_gv + haloBuf.copyTo(o_gv + k*NlocalT, k*NhaloP, + 0, "async: true"); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } } } +template +void ogs_t::Gather(deviceMemory v, const deviceMemory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(deviceMemory v, const deviceMemory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(deviceMemory v, const deviceMemory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(deviceMemory v, const deviceMemory gv, + const int k, const Op op, const Transpose trans); + +/******************************** + * Host Gather + ********************************/ + //host versions -void ogs_t::Gather(void* gv, - const void* v, +template +void ogs_t::Gather(memory gv, + const memory v, const int k, - const Type type, const Op op, const Transpose trans){ + GatherStart (gv, v, k, op, trans); + GatherFinish(gv, v, k, op, trans); +} + +template +void ogs_t::GatherStart(memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans){ AssertGatherDefined(); if (trans==Trans) { //if trans!=ogs::Trans theres no comms required - exchange->AllocBuffer(k*Sizeof(type)); + exchange->AllocBuffer(k*sizeof(T)); + + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; //collect halo buffer - gatherHalo->Gather(exchange->haloBuf, v, - k, type, op, Trans); + gatherHalo->Gather(haloBuf, v, k, op, Trans); //prepare MPI exchange - exchange->Start(k, type, op, Trans, true); + exchange->Start(haloBuf, k, op, Trans); + } else { + //gather halo + gatherHalo->Gather(gv + k*NlocalT, v, k, op, trans); + } +} + +template +void ogs_t::GatherFinish(memory gv, + const memory v, + const int k, + const Op op, + const Transpose trans){ + AssertGatherDefined(); - //local gather - gatherLocal->Gather(gv, v, k, type, op, Trans); + //queue local g operation + gatherLocal->Gather(gv, v, k, op, trans); + + if (trans==Trans) { //if trans!=ogs::Trans theres no comms required + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; //finish MPI exchange - exchange->Finish(k, type, op, Trans, true); + exchange->Finish(haloBuf, k, op, Trans); //put the result at the end of o_gv - std::memcpy(static_cast(gv) + k*NlocalT*Sizeof(type), - exchange->haloBuf, - k*NhaloP*Sizeof(type)); - } else { - //local gather - gatherLocal->Gather(gv, v, k, type, op, trans); - - //gather halo - gatherHalo->Gather(static_cast(gv) + k*NlocalT*Sizeof(type), - v, k, type, op, trans); + haloBuf.copyTo(gv+k*NlocalT, k*NhaloP); } } +template +void ogs_t::Gather(memory v, const memory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(memory v, const memory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(memory v, const memory gv, + const int k, const Op op, const Transpose trans); +template +void ogs_t::Gather(memory v, const memory gv, + const int k, const Op op, const Transpose trans); /******************************** - * Scalar Scatter + * Device Scatter ********************************/ -void ogs_t::Scatter(occa::memory& o_v, - occa::memory& o_gv, +template +void ogs_t::Scatter(deviceMemory o_v, + deviceMemory o_gv, const int k, - const Type type, - const Op op, const Transpose trans){ - ScatterStart (o_v, o_gv, k, type, op, trans); - ScatterFinish(o_v, o_gv, k, type, op, trans); + ScatterStart (o_v, o_gv, k, trans); + ScatterFinish(o_v, o_gv, k, trans); } -void ogs_t::ScatterStart(occa::memory& o_v, - occa::memory& o_gv, +template +void ogs_t::ScatterStart(deviceMemory o_v, + deviceMemory o_gv, const int k, - const Type type, - const Op op, const Transpose trans){ AssertGatherDefined(); + deviceMemory o_haloBuf = exchange->o_workspace; + if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required - exchange->AllocBuffer(k*Sizeof(type)); + exchange->AllocBuffer(k*sizeof(T)); - //collect halo buffer - if (NhaloP) - exchange->o_haloBuf.copyFrom(o_gv + k*NlocalT*Sizeof(type), - k*NhaloP*Sizeof(type), 0, 0, "async: true"); + device_t &device = platform.device; - //prepare MPI exchange - exchange->Start(k, type, op, NoTrans); + if (exchange->gpu_aware) { + //collect halo buffer + o_haloBuf.copyFrom(o_gv + k*NlocalT, + k*NhaloP, 0, "async: true"); + + //wait for o_haloBuf to be ready + device.finish(); + + //prepare MPI exchange + exchange->Start(o_haloBuf, k, Add, NoTrans); + } else { + //get current stream + stream_t currentStream = device.getStream(); + + //if not using gpu-aware mpi move the halo buffer to the host + pinnedMemory haloBuf = exchange->h_workspace; + + //wait for o_gv to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_gv + k*NlocalT, NhaloP*k, + 0, "async: true"); + device.setStream(currentStream); + } } } - -void ogs_t::ScatterFinish(occa::memory& o_v, - occa::memory& o_gv, +template +void ogs_t::ScatterFinish(deviceMemory o_v, + deviceMemory o_gv, const int k, - const Type type, - const Op op, const Transpose trans){ AssertGatherDefined(); + deviceMemory o_haloBuf = exchange->o_workspace; + //queue local s operation - gatherLocal->Scatter(o_v, o_gv, k, type, op, trans); + gatherLocal->Scatter(o_v, o_gv, k, trans); if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required - //finish MPI exchange (and put the result at the end of o_gv) - exchange->Finish(k, type, op, NoTrans); + if (exchange->gpu_aware) { + //finish MPI exchange + exchange->Finish(o_haloBuf, k, Add, NoTrans); + } else { + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, Add, NoTrans); + exchange->Finish(haloBuf, k, Add, NoTrans); + + // copy recv back to device + haloBuf.copyTo(o_haloBuf, NhaloT*k, + 0, "async: true"); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } //scatter halo buffer - gatherHalo->Scatter(o_v, exchange->o_haloBuf, - k, type, op, NoTrans); + gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans); } else { //scatter halo - occa::memory o_gvHalo = o_gv + k*NlocalT*Sizeof(type); - gatherHalo->Scatter(o_v, o_gvHalo, k, type, op, trans); + gatherHalo->Scatter(o_v, o_gv + k*NlocalT, k, trans); } } +template +void ogs_t::Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(deviceMemory v, const deviceMemory gv, + const int k, const Transpose trans); + +/******************************** + * Host Scatter + ********************************/ + //host versions -void ogs_t::Scatter(void* v, - const void* gv, +template +void ogs_t::Scatter(memory v, + const memory gv, const int k, - const Type type, - const Op op, const Transpose trans){ + ScatterStart (v, gv, k, trans); + ScatterFinish(v, gv, k, trans); +} + +template +void ogs_t::ScatterStart(memory v, + const memory gv, + const int k, + const Transpose trans){ AssertGatherDefined(); if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required - exchange->AllocBuffer(k*Sizeof(type)); + exchange->AllocBuffer(k*sizeof(T)); + + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; //collect halo buffer - std::memcpy(exchange->haloBuf, - static_cast(gv) + k*NlocalT*Sizeof(type), - k*NhaloP*Sizeof(type)); + haloBuf.copyFrom(gv + k*NlocalT, k*NhaloP); //prepare MPI exchange - exchange->Start(k, type, op, NoTrans, true); + exchange->Start(haloBuf, k, Add, NoTrans); + } +} - //local scatter - gatherLocal->Scatter(v, gv, k, type, op, NoTrans); +template +void ogs_t::ScatterFinish(memory v, + const memory gv, + const int k, + const Transpose trans){ + AssertGatherDefined(); + + //queue local s operation + gatherLocal->Scatter(v, gv, k, trans); + + if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required + /*Cast workspace to type T*/ + pinnedMemory haloBuf = exchange->h_workspace; //finish MPI exchange (and put the result at the end of o_gv) - exchange->Finish(k, type, op, NoTrans, true); + exchange->Finish(haloBuf, k, Add, NoTrans); //scatter halo buffer - gatherHalo->Scatter(v, exchange->haloBuf, - k, type, op, NoTrans); + gatherHalo->Scatter(v, haloBuf, k, NoTrans); } else { - //local scatter - gatherLocal->Scatter(v, gv, k, type, op, trans); - //scatter halo - gatherHalo->Scatter(v, - static_cast(gv) - + k*NlocalT*Sizeof(type), - k, type, op, trans); + gatherHalo->Scatter(v, gv + k*NlocalT, k, trans); } } +template +void ogs_t::Scatter(memory v, const memory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(memory v, const memory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(memory v, const memory gv, + const int k, const Transpose trans); +template +void ogs_t::Scatter(memory v, const memory gv, + const int k, const Transpose trans); } //namespace ogs } //namespace libp diff --git a/libs/ogs/ogsAllToAll.cpp b/libs/ogs/ogsAllToAll.cpp index 7f92fa5..6ca2713 100644 --- a/libs/ogs/ogsAllToAll.cpp +++ b/libs/ogs/ogsAllToAll.cpp @@ -39,80 +39,100 @@ namespace libp { namespace ogs { -void ogsAllToAll_t::Start(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host){ - - occa::device &device = platform.device; +/********************************** +* Host exchange +***********************************/ +template +inline void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ - //get current stream - occa::stream currentStream = device.getStream(); + pinnedMemory sendBuf = h_sendspace; - const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT; - const dlong N = (trans == NoTrans) ? NhaloP : Nhalo; + // extract the send buffer + if (trans == NoTrans) + extract(NsendN, k, sendIdsN, buf, sendBuf); + else + extract(NsendT, k, sendIdsT, buf, sendBuf); - if (Nsend) { - if (gpu_aware && !host) { - //if using gpu-aware mpi and exchanging device buffers, - // assemble the send buffer on device - if (trans == NoTrans) { - extractKernel[type](NsendN, k, o_sendIdsN, o_haloBuf, o_sendBuf); - } else { - extractKernel[type](NsendT, k, o_sendIdsT, o_haloBuf, o_sendBuf); - } - //if not overlapping, wait for kernel to finish on default stream - device.finish(); - } else if (!host) { - //if not using gpu-aware mpi and exchanging device buffers, - // move the halo buffer to the host - device.setStream(dataStream); - const size_t Nbytes = k*Sizeof(type); - o_haloBuf.copyTo(haloBuf, N*Nbytes, 0, "async: true"); - device.setStream(currentStream); + if (trans==NoTrans) { + for (int r=0;r +inline void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ -void ogsAllToAll_t::Finish(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host){ + comm.Wait(request); - const size_t Nbytes = k*Sizeof(type); - occa::device &device = platform.device; + //if we recvieved anything via MPI, gather the recv buffer and scatter + // it back to to original vector + dlong Nrecv = recvOffsets[size]; + if (Nrecv) { + // gather the recieved nodes + postmpi.Gather(buf, buf, k, op, trans); + } +} - //get current stream - occa::stream currentStream = device.getStream(); +void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + +/********************************** +* GPU-aware exchange +***********************************/ +template +void ogsAllToAll_t::Start(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT; - if (Nsend && !gpu_aware && !host) { - //synchronize data stream to ensure the host buffer is on the host - device.setStream(dataStream); + if (Nsend) { + deviceMemory o_sendBuf = o_sendspace; + + // assemble the send buffer on device + if (trans == NoTrans) { + extractKernel[ogsType::get()](NsendN, k, o_sendIdsN, o_buf, o_sendBuf); + } else { + extractKernel[ogsType::get()](NsendT, k, o_sendIdsT, o_buf, o_sendBuf); + } + //wait for kernel to finish on default stream + device_t &device = platform.device; device.finish(); - device.setStream(currentStream); } +} - //if the halo data is on the host, extract the send buffer - if (host || !gpu_aware) { - if (trans == NoTrans) - extract(NsendN, k, type, sendIdsN.ptr(), haloBuf, sendBuf); - else - extract(NsendT, k, type, sendIdsT.ptr(), haloBuf, sendBuf); - } +template +void ogsAllToAll_t::Finish(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ - char *sendPtr, *recvPtr; - if (gpu_aware && !host) { //device pointer - sendPtr = static_cast(o_sendBuf.ptr()); - recvPtr = static_cast(o_haloBuf.ptr()) + Nhalo*Nbytes; - } else { //host pointer - sendPtr = sendBuf; - recvPtr = haloBuf + Nhalo*Nbytes; - } + deviceMemory o_sendBuf = o_sendspace; if (trans==NoTrans) { for (int r=0;r &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsAllToAll_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsAllToAll_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + ogsAllToAll_t::ogsAllToAll_t(dlong Nshared, - libp::memory &sharedNodes, + memory &sharedNodes, ogsOperator_t& gatherHalo, - MPI_Comm _comm, + stream_t _dataStream, + comm_t _comm, platform_t &_platform): - ogsExchange_t(_platform,_comm) { + ogsExchange_t(_platform,_comm, _dataStream) { Nhalo = gatherHalo.NrowsT; NhaloP = gatherHalo.NrowsN; @@ -195,10 +209,8 @@ ogsAllToAll_t::ogsAllToAll_t(dlong Nshared, } //shared counts - MPI_Alltoall(mpiSendCountsT.ptr(), 1, MPI_INT, - mpiRecvCountsT.ptr(), 1, MPI_INT, comm); - MPI_Alltoall(mpiSendCountsN.ptr(), 1, MPI_INT, - mpiRecvCountsN.ptr(), 1, MPI_INT, comm); + comm.Alltoall(mpiSendCountsT, mpiRecvCountsT); + comm.Alltoall(mpiSendCountsN, mpiRecvCountsN); //cumulative sum mpiSendOffsetsN[0] = 0; @@ -234,13 +246,11 @@ ogsAllToAll_t::ogsAllToAll_t(dlong Nshared, //send the node lists so we know what we'll receive dlong Nrecv = mpiRecvOffsetsT[size]; - libp::memory recvNodes(Nrecv); + memory recvNodes(Nrecv); //Send list of nodes to each rank - MPI_Alltoallv(sharedNodes.ptr(), mpiSendCountsT.ptr(), mpiSendOffsetsT.ptr(), MPI_PARALLELNODE_T, - recvNodes.ptr(), mpiRecvCountsT.ptr(), mpiRecvOffsetsT.ptr(), MPI_PARALLELNODE_T, - comm); - MPI_Barrier(comm); + comm.Alltoallv(sharedNodes, mpiSendCountsT, mpiSendOffsetsT, + recvNodes, mpiRecvCountsT, mpiRecvOffsetsT); //make ops for gathering halo nodes after an MPI_Allgatherv postmpi.platform = platform; @@ -252,10 +262,10 @@ ogsAllToAll_t::ogsAllToAll_t(dlong Nshared, postmpi.rowStartsT.malloc(Nhalo+1); //make array of counters - libp::memory haloGatherTCounts(Nhalo); - libp::memory haloGatherNCounts(Nhalo); + memory haloGatherTCounts(Nhalo); + memory haloGatherNCounts(Nhalo); - //count the data that will already be in haloBuf + //count the data that will already be in h_haloBuf.ptr() for (dlong n=0;n(platform.hostMalloc(postmpi.nnzT*Nbytes, nullptr, h_haloBuf)); - o_haloBuf = platform.malloc(postmpi.nnzT*Nbytes); + if (o_workspace.size() < postmpi.nnzT*Nbytes) { + h_workspace = platform.hostMalloc(postmpi.nnzT*Nbytes); + o_workspace = platform.malloc(postmpi.nnzT*Nbytes); } - if (o_sendBuf.size() < NsendT*Nbytes) { - if (o_sendBuf.size()) o_sendBuf.free(); - sendBuf = static_cast(platform.hostMalloc(NsendT*Nbytes, nullptr, h_sendBuf)); - o_sendBuf = platform.malloc(NsendT*Nbytes); + if (o_sendspace.size() < NsendT*Nbytes) { + h_sendspace = platform.hostMalloc(NsendT*Nbytes); + o_sendspace = platform.malloc(NsendT*Nbytes); } } diff --git a/libs/ogs/ogsAuto.cpp b/libs/ogs/ogsAuto.cpp index 8469574..7ec99e5 100644 --- a/libs/ogs/ogsAuto.cpp +++ b/libs/ogs/ogsAuto.cpp @@ -28,41 +28,111 @@ SOFTWARE. #include "ogs/ogsUtils.hpp" #include "ogs/ogsOperator.hpp" #include "ogs/ogsExchange.hpp" +#include "timer.hpp" namespace libp { namespace ogs { -static void ExchangeTest(ogsExchange_t* exchange, double time[3], bool host=false) { +static void DeviceExchangeTest(ogsExchange_t* exchange, double time[3]) { const int Ncold = 10; const int Nhot = 10; - double start, end; double localTime, sumTime, minTime, maxTime; - int rank, size; - MPI_Comm_rank(exchange->comm, &rank); - MPI_Comm_size(exchange->comm, &size); + comm_t& comm = exchange->comm; + int size = comm.size(); + + pinnedMemory buf = exchange->h_workspace; + deviceMemory o_buf = exchange->o_workspace; + + device_t &device = exchange->platform.device; + + //dry run + for (int n=0;ngpu_aware) { + /*GPU-aware exchange*/ + exchange->Start (o_buf, 1, Add, Sym); + exchange->Finish(o_buf, 1, Add, Sym); + } else { + //if not using gpu-aware mpi move the halo buffer to the host + o_buf.copyTo(buf, exchange->Nhalo, + 0, "async: true"); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (buf, 1, Add, Sym); + exchange->Finish(buf, 1, Add, Sym); + + // copy recv back to device + o_buf.copyFrom(buf, exchange->Nhalo, + 0, "async: true"); + device.finish(); //wait for transfer to finish + } + } + + //hot runs + timePoint_t start = Time(); + for (int n=0;ngpu_aware) { + /*GPU-aware exchange*/ + exchange->Start (o_buf, 1, Add, Sym); + exchange->Finish(o_buf, 1, Add, Sym); + } else { + //if not using gpu-aware mpi move the halo buffer to the host + o_buf.copyTo(buf, exchange->Nhalo, + 0, "async: true"); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (buf, 1, Add, Sym); + exchange->Finish(buf, 1, Add, Sym); + + // copy recv back to device + o_buf.copyFrom(buf, exchange->Nhalo, + 0, "async: true"); + device.finish(); //wait for transfer to finish + } + } + timePoint_t end = Time(); + + localTime = ElapsedTime(start,end)/Nhot; + comm.Allreduce(localTime, sumTime, comm_t::Sum); + comm.Allreduce(localTime, maxTime, comm_t::Max); + comm.Allreduce(localTime, minTime, comm_t::Min); + + time[0] = sumTime/size; //avg + time[1] = minTime; //min + time[2] = maxTime; //max +} + +static void HostExchangeTest(ogsExchange_t* exchange, double time[3]) { + const int Ncold = 10; + const int Nhot = 10; + double localTime, sumTime, minTime, maxTime; + + comm_t& comm = exchange->comm; + int size = comm.size(); + + pinnedMemory buf = exchange->h_workspace; //dry run for (int n=0;nStart(1, Dfloat, Add, Sym, host); - exchange->Finish(1, Dfloat, Add, Sym, host); + exchange->Start (buf, 1, Add, Sym); + exchange->Finish(buf, 1, Add, Sym); } //hot runs - if (!host) exchange->platform.device.finish(); - start = MPI_Wtime(); + timePoint_t start = Time(); for (int n=0;nStart(1, Dfloat, Add, Sym, host); - exchange->Finish(1, Dfloat, Add, Sym, host); + exchange->Start (buf, 1, Add, Sym); + exchange->Finish(buf, 1, Add, Sym); } - if (!host) exchange->platform.device.finish(); - end = MPI_Wtime(); + timePoint_t end = Time(); - localTime = (end-start)/Nhot; - MPI_Allreduce(&localTime, &sumTime, 1, MPI_DOUBLE, MPI_SUM, exchange->comm); - MPI_Allreduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, exchange->comm); - MPI_Allreduce(&localTime, &minTime, 1, MPI_DOUBLE, MPI_MIN, exchange->comm); + localTime = ElapsedTime(start,end)/Nhot; + comm.Allreduce(localTime, sumTime, comm_t::Sum); + comm.Allreduce(localTime, maxTime, comm_t::Max); + comm.Allreduce(localTime, minTime, comm_t::Min); time[0] = sumTime/size; //avg time[1] = minTime; //min @@ -70,18 +140,19 @@ static void ExchangeTest(ogsExchange_t* exchange, double time[3], bool host=fals } ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, - libp::memory &sharedNodes, + memory &sharedNodes, ogsOperator_t& _gatherHalo, - MPI_Comm _comm, + comm_t _comm, platform_t &_platform, const int verbose) { int rank, size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + rank = comm.rank(); + size = comm.size(); if (size==1) return new ogsPairwise_t(Nshared, sharedNodes, - _gatherHalo, comm, platform); + _gatherHalo, dataStream, + comm, platform); ogsExchange_t* bestExchange; Method method; @@ -102,13 +173,14 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, * Pairwise ********************************/ ogsExchange_t* pairwise = new ogsPairwise_t(Nshared, sharedNodes, - _gatherHalo, comm, platform); + _gatherHalo, dataStream, + comm, platform); //standard copy to host - exchange - copy back to device pairwise->gpu_aware=false; double pairwiseTime[3]; - ExchangeTest(pairwise, pairwiseTime, false); + DeviceExchangeTest(pairwise, pairwiseTime); double pairwiseAvg = pairwiseTime[0]; #ifdef GPU_AWARE_MPI @@ -116,7 +188,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, pairwise->gpu_aware=true; double pairwiseGATime[3]; - ExchangeTest(pairwise, pairwiseGATime, false); + DeviceExchangeTest(pairwise, pairwiseGATime); if (pairwiseGATime[0] < pairwiseAvg) pairwiseAvg = pairwiseGATime[0]; @@ -127,7 +199,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, //test exchange from host memory (just for reporting) double pairwiseHostTime[3]; - ExchangeTest(pairwise, pairwiseHostTime, true); + HostExchangeTest(pairwise, pairwiseHostTime); bestExchange = pairwise; method = Pairwise; @@ -150,13 +222,13 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, * All-to-All ********************************/ ogsExchange_t* alltoall = new ogsAllToAll_t(Nshared, sharedNodes, - _gatherHalo, comm, platform); - + _gatherHalo, dataStream, + comm, platform); //standard copy to host - exchange - copy back to device alltoall->gpu_aware=false; double alltoallTime[3]; - ExchangeTest(alltoall, alltoallTime, false); + DeviceExchangeTest(alltoall, alltoallTime); double alltoallAvg = alltoallTime[0]; #ifdef GPU_AWARE_MPI @@ -164,7 +236,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, alltoall->gpu_aware=true; double alltoallGATime[3]; - ExchangeTest(alltoall, alltoallGATime, false); + DeviceExchangeTest(alltoall, alltoallGATime); if (alltoallGATime[0] < alltoallAvg) alltoallAvg = alltoallGATime[0]; @@ -175,7 +247,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, //test exchange from host memory (just for reporting) double alltoallHostTime[3]; - ExchangeTest(alltoall, alltoallHostTime, true); + HostExchangeTest(alltoall, alltoallHostTime); if (alltoallAvg < bestTime) { delete bestExchange; @@ -202,14 +274,15 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, /******************************** * Crystal Router ********************************/ - ogsExchange_t* crystal = new ogsAllToAll_t(Nshared, sharedNodes, - _gatherHalo, comm, platform); + ogsExchange_t* crystal = new ogsCrystalRouter_t(Nshared, sharedNodes, + _gatherHalo, dataStream, + comm, platform); //standard copy to host - exchange - copy back to device crystal->gpu_aware=false; double crystalTime[3]; - ExchangeTest(crystal, crystalTime, false); + DeviceExchangeTest(crystal, crystalTime); double crystalAvg = crystalTime[0]; #ifdef GPU_AWARE_MPI @@ -217,7 +290,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, crystal->gpu_aware=true; double crystalGATime[3]; - ExchangeTest(crystal, crystalGATime, false); + DeviceExchangeTest(crystal, crystalGATime); if (crystalGATime[0] < crystalAvg) crystalAvg = crystalGATime[0]; @@ -228,7 +301,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared, //test exchange from host memory (just for reporting) double crystalHostTime[3]; - ExchangeTest(crystal, crystalHostTime, true); + HostExchangeTest(crystal, crystalHostTime); if (crystalAvg < bestTime) { delete bestExchange; diff --git a/libs/ogs/ogsCrystalRouter.cpp b/libs/ogs/ogsCrystalRouter.cpp index 661c95c..fa54b1b 100644 --- a/libs/ogs/ogsCrystalRouter.cpp +++ b/libs/ogs/ogsCrystalRouter.cpp @@ -39,129 +39,176 @@ namespace libp { namespace ogs { -void ogsCrystalRouter_t::Start(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host){ +/********************************** +* Host exchange +***********************************/ +template +inline void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){} - occa::device &device = platform.device; +template +inline void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ - //get current stream - occa::stream currentStream = device.getStream(); - - const dlong N = (trans == NoTrans) ? NhaloP : Nhalo; - - if (N) { - if (!gpu_aware && !host) { - //if not using gpu-aware mpi and exchanging device buffers, - // move the halo buffer to the host - device.setStream(dataStream); - const size_t Nbytes = k*Sizeof(type); - o_haloBuf.copyTo(haloBuf, N*Nbytes, 0, "async: true"); - device.setStream(currentStream); + + memory levels; + if (trans==NoTrans) { + levels = levelsN; + } else { + levels = levelsT; + } + + pinnedMemory sendBuf = h_sendspace; + + // To start, buf = h_workspace = h_work[(hbuf_id+0)%2]; + // sendBuf = h_sendspace; + for (int l=0;l recvBuf = h_workspace; + + //post recvs + if (levels[l].Nmsg>0) { + comm.Irecv(recvBuf + levels[l].recvOffset*k, + levels[l].partner, + k*levels[l].Nrecv0, + levels[l].partner, + request[1]); } + if (levels[l].Nmsg==2) { + comm.Irecv(recvBuf + levels[l].recvOffset*k + levels[l].Nrecv0*k, + rank-1, + k*levels[l].Nrecv1, + rank-1, + request[2]); + } + + //assemble send buffer + extract(levels[l].Nsend, k, levels[l].sendIds, buf, sendBuf); + + //post send + comm.Isend(sendBuf, + levels[l].partner, + k*levels[l].Nsend, + rank, + request[0]); + + comm.Waitall(levels[l].Nmsg+1, request); + + //rotate buffers + h_workspace = h_work[(hbuf_id+1)%2]; + hbuf_id = (hbuf_id+1)%2; + + recvBuf = buf; + buf = h_workspace; + + //Gather the recv'd values into the haloBuffer + levels[l].gather.Gather(buf, recvBuf, k, op, Trans); } } +void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + +/********************************** +* GPU-aware exchange +***********************************/ +template +inline void ogsCrystalRouter_t::Start(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ +} -void ogsCrystalRouter_t::Finish(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host){ +template +inline void ogsCrystalRouter_t::Finish(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ - const size_t Nbytes = k*Sizeof(type); - occa::device &device = platform.device; + device_t &device = platform.device; //get current stream - occa::stream currentStream = device.getStream(); + stream_t currentStream = device.getStream(); //the intermediate kernels are always overlapped with the default stream device.setStream(dataStream); - dlong N = (trans == NoTrans) ? NhaloP : Nhalo; - - if (N && !gpu_aware && !host) { - //synchronize data stream to ensure the host buffer is on the host - device.finish(); - } - - libp::memory levels; - if (trans==NoTrans) + memory levels; + if (trans==NoTrans) { levels = levelsN; - else + } else { levels = levelsT; + } - for (int l=0;l o_sendBuf = o_sendspace; - char *sendPtr, *recvPtr; - if (gpu_aware && !host) { //device pointer - sendPtr = (char*)o_sendBuf.ptr(); - recvPtr = (char*)o_haloBuf.ptr() + levels[l].recvOffset*Nbytes; - } else { //host pointer - sendPtr = sendBuf; - recvPtr = haloBuf + levels[l].recvOffset*Nbytes; - } + // To start, o_buf = o_workspace = o_work[(buf_id+0)%2]; + // o_sendBuf = o_sendspace + for (int l=0;l o_recvBuf = o_workspace; //post recvs if (levels[l].Nmsg>0) { - MPI_Irecv(recvPtr, k*levels[l].Nrecv0, MPI_Type(type), - levels[l].partner, levels[l].partner, comm, request+1); + comm.Irecv(o_recvBuf + levels[l].recvOffset*k, + levels[l].partner, + k*levels[l].Nrecv0, + levels[l].partner, + request[1]); } if (levels[l].Nmsg==2) { - MPI_Irecv(recvPtr+levels[l].Nrecv0*Nbytes, - k*levels[l].Nrecv1, MPI_Type(type), - rank-1, rank-1, comm, request+2); + comm.Irecv(o_recvBuf + levels[l].recvOffset*k + levels[l].Nrecv0*k, + rank-1, + k*levels[l].Nrecv1, + rank-1, + request[2]); } //assemble send buffer - if (gpu_aware) { - if (levels[l].Nsend) { - extractKernel[type](levels[l].Nsend, k, - levels[l].o_sendIds, - o_haloBuf, o_sendBuf); - device.finish(); - } - } else { - extract(levels[l].Nsend, k, type, - levels[l].sendIds.ptr(), haloBuf, sendBuf); + if (levels[l].Nsend) { + extractKernel[ogsType::get()](levels[l].Nsend, k, + levels[l].o_sendIds, + o_buf, o_sendBuf); + device.finish(); } //post send - MPI_Isend(sendPtr, k*levels[l].Nsend, MPI_Type(type), - levels[l].partner, rank, comm, request+0); - MPI_Waitall(levels[l].Nmsg+1, request, status); + comm.Isend(o_sendBuf, + levels[l].partner, + k*levels[l].Nsend, + rank, + request[0]); + + comm.Waitall(levels[l].Nmsg+1, request); //rotate buffers - o_recvBuf = o_buf[(buf_id+0)%2]; - o_haloBuf = o_buf[(buf_id+1)%2]; - recvBuf = buf[(buf_id+0)%2]; - haloBuf = buf[(buf_id+1)%2]; + o_workspace = o_work[(buf_id+1)%2]; buf_id = (buf_id+1)%2; - //Gather the recv'd values into the haloBuffer - if (gpu_aware) { - levels[l].gather.Gather(o_haloBuf, o_recvBuf, - k, type, op, Trans); - } else { - levels[l].gather.Gather(haloBuf, recvBuf, - k, type, op, Trans); - } - } + o_recvBuf = o_buf; + o_buf = o_workspace; - N = (trans == Trans) ? NhaloP : Nhalo; - if (N) { - if (!gpu_aware && !host) { - // copy recv back to device - o_haloBuf.copyFrom(haloBuf, N*Nbytes, 0, "async: true"); - device.finish(); //wait for transfer to finish - } + //Gather the recv'd values into the haloBuffer + levels[l].gather.Gather(o_buf, o_recvBuf, k, op, Trans); } device.setStream(currentStream); } +void ogsCrystalRouter_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsCrystalRouter_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + + /* *Crystal Router performs the needed MPI communcation via recursive * folding of a hypercube. Consider a set of NP ranks. We select a @@ -198,11 +245,12 @@ void ogsCrystalRouter_t::Finish(const int k, */ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, - libp::memory &sharedNodes, + memory &sharedNodes, ogsOperator_t& gatherHalo, - MPI_Comm _comm, + stream_t _dataStream, + comm_t _comm, platform_t &_platform): - ogsExchange_t(_platform,_comm) { + ogsExchange_t(_platform,_comm,_dataStream) { NhaloP = gatherHalo.NrowsN; Nhalo = gatherHalo.NrowsT; @@ -229,6 +277,7 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, levelsN.malloc(Nlevels); levelsT.malloc(Nlevels); + request.malloc(3); //Now build the levels Nlevels = 0; @@ -236,7 +285,7 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, np_offset=0; dlong N = Nshared + Nhalo; - libp::memory nodes(N); + memory nodes(N); //setup is easier if we include copies of the nodes we own // in the list of shared nodes @@ -298,15 +347,15 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, int Nsend=(is_lo) ? Nhi : Nlo; - MPI_Isend(&Nsend, 1, MPI_INT, partner, rank, comm, request+0); + comm.Isend(Nsend, partner, rank, request[0]); int Nrecv0=0, Nrecv1=0; if (Nmsg>0) - MPI_Irecv(&Nrecv0, 1, MPI_INT, partner, partner, comm, request+1); + comm.Irecv(Nrecv0, partner, partner, request[1]); if (Nmsg==2) - MPI_Irecv(&Nrecv1, 1, MPI_INT, r_half-1, r_half-1, comm, request+2); + comm.Irecv(Nrecv1, r_half-1, r_half-1, request[2]); - MPI_Waitall(Nmsg+1, request, status); + comm.Waitall(Nmsg+1, request); int Nrecv = Nrecv0+Nrecv1; @@ -315,8 +364,8 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, else Nhi+=Nrecv; //split node list in two - libp::memory loNodes(Nlo); - libp::memory hiNodes(Nhi); + memory loNodes(Nlo); + memory hiNodes(Nhi); Nlo=0, Nhi=0; for (dlong n=0;n sendNodes = is_lo ? hiNodes : loNodes; + memory sendNodes = is_lo ? hiNodes : loNodes; //count how many entries from the halo buffer we're sending int NentriesSendN=0; @@ -365,29 +414,29 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, levelsN[Nlevels].o_sendIds = platform.malloc(levelsN[Nlevels].sendIds); //share the entry count with our partner - MPI_Isend(&NentriesSendT, 1, MPI_INT, partner, rank, comm, request+0); + comm.Isend(NentriesSendT, partner, rank, request[0]); int NentriesRecvT0=0, NentriesRecvT1=0; if (Nmsg>0) - MPI_Irecv(&NentriesRecvT0, 1, MPI_INT, partner, partner, comm, request+1); + comm.Irecv(NentriesRecvT0, partner, partner, request[1]); if (Nmsg==2) - MPI_Irecv(&NentriesRecvT1, 1, MPI_INT, r_half-1, r_half-1, comm, request+2); + comm.Irecv(NentriesRecvT1, r_half-1, r_half-1, request[2]); - MPI_Waitall(Nmsg+1, request, status); + comm.Waitall(Nmsg+1, request); levelsT[Nlevels].Nrecv0 = NentriesRecvT0; levelsT[Nlevels].Nrecv1 = NentriesRecvT1; levelsT[Nlevels].recvOffset = NhaloExtT; - MPI_Isend(&NentriesSendN, 1, MPI_INT, partner, rank, comm, request+0); + comm.Isend(NentriesSendN, partner, rank, request[0]); int NentriesRecvN0=0, NentriesRecvN1=0; if (Nmsg>0) - MPI_Irecv(&NentriesRecvN0, 1, MPI_INT, partner, partner, comm, request+1); + comm.Irecv(NentriesRecvN0, partner, partner, request[1]); if (Nmsg==2) - MPI_Irecv(&NentriesRecvN1, 1, MPI_INT, r_half-1, r_half-1, comm, request+2); + comm.Irecv(NentriesRecvN1, r_half-1, r_half-1, request[2]); - MPI_Waitall(Nmsg+1, request, status); + comm.Waitall(Nmsg+1, request); levelsN[Nlevels].Nrecv0 = NentriesRecvN0; levelsN[Nlevels].Nrecv1 = NentriesRecvN1; @@ -399,18 +448,15 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, //send half the list to our partner - MPI_Isend(sendNodes.ptr(), Nsend, - MPI_PARALLELNODE_T, partner, rank, comm, request+0); + comm.Isend(sendNodes, partner, Nsend, rank, request[0]); //recv new nodes from our partner(s) if (Nmsg>0) - MPI_Irecv(nodes.ptr()+offset, Nrecv0, - MPI_PARALLELNODE_T, partner, partner, comm, request+1); + comm.Irecv(nodes+offset, partner, Nrecv0, partner, request[1]); if (Nmsg==2) - MPI_Irecv(nodes.ptr()+offset+Nrecv0, Nrecv1, - MPI_PARALLELNODE_T, r_half-1, r_half-1, comm, request+2); + comm.Irecv(nodes+offset+Nrecv0, r_half-1, Nrecv1, r_half-1, request[2]); - MPI_Waitall(Nmsg+1, request, status); + comm.Waitall(Nmsg+1, request); sendNodes.free(); @@ -459,7 +505,7 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, //make an index map to save the original extended halo ids - libp::memory indexMap(NhaloExtT); + memory indexMap(NhaloExtT); //fill newIds of new entries if possible, or give them an index NhaloExtT = Nhalo + NhaloExtN; @@ -702,25 +748,24 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared, } //make scratch space - AllocBuffer(Sizeof(Dfloat)); + AllocBuffer(sizeof(dfloat)); } void ogsCrystalRouter_t::AllocBuffer(size_t Nbytes) { - if (o_sendBuf.size() < NsendMax*Nbytes) { - sendBuf = static_cast(platform.hostMalloc(NsendMax*Nbytes, nullptr, h_sendBuf)); - o_sendBuf = platform.malloc(NsendMax*Nbytes); + if (o_sendspace.size() < NsendMax*Nbytes) { + h_sendspace = platform.hostMalloc(NsendMax*Nbytes); + o_sendspace = platform.malloc(NsendMax*Nbytes); } - if (o_buf[0].size() < NrecvMax*Nbytes) { - buf[0] = static_cast(platform.hostMalloc(NrecvMax*Nbytes, nullptr, h_buf[0])); - buf[1] = static_cast(platform.hostMalloc(NrecvMax*Nbytes, nullptr, h_buf[1])); - haloBuf = buf[0]; - recvBuf = buf[1]; - - o_buf[0] = platform.malloc(NrecvMax*Nbytes); - o_buf[1] = platform.malloc(NrecvMax*Nbytes); - o_haloBuf = o_buf[0]; - o_recvBuf = o_buf[1]; + if (o_work[0].size() < NrecvMax*Nbytes) { + h_work[0] = platform.hostMalloc(NrecvMax*Nbytes); + h_work[1] = platform.hostMalloc(NrecvMax*Nbytes); + h_workspace = h_work[0]; + hbuf_id=0; + + o_work[0] = platform.malloc(NrecvMax*Nbytes); + o_work[1] = platform.malloc(NrecvMax*Nbytes); + o_workspace = o_work[0]; buf_id=0; } } diff --git a/libs/ogs/ogsHalo.cpp b/libs/ogs/ogsHalo.cpp index 29a9698..c47f9bd 100644 --- a/libs/ogs/ogsHalo.cpp +++ b/libs/ogs/ogsHalo.cpp @@ -34,173 +34,330 @@ namespace libp { namespace ogs { /******************************** - * Exchange + * Device Exchange ********************************/ -void halo_t::Exchange(occa::memory& o_v, - const int k, - const Type type) { - ExchangeStart (o_v, k, type); - ExchangeFinish(o_v, k, type); +template +void halo_t::Exchange(deviceMemory o_v, const int k) { + ExchangeStart (o_v, k); + ExchangeFinish(o_v, k); } -void halo_t::ExchangeStart(occa::memory& o_v, - const int k, - const Type type){ - exchange->AllocBuffer(k*Sizeof(type)); +template +void halo_t::ExchangeStart(deviceMemory o_v, const int k){ + exchange->AllocBuffer(k*sizeof(T)); + + deviceMemory o_haloBuf = exchange->o_workspace; + + if (exchange->gpu_aware) { + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + o_haloBuf.copyFrom(o_v + k*NlocalT, k*NhaloP, + 0, "async: true"); + } else { + //collect halo buffer + gatherHalo->Gather(o_haloBuf, o_v, k, Add, NoTrans); + } + + //prepare MPI exchange + exchange->Start(o_haloBuf, k, Add, NoTrans); - //collect halo buffer - if (gathered_halo) { - //if this halo was build from a gathered ogs the halo nodes are at the end - if (NhaloP) - exchange->o_haloBuf.copyFrom(o_v + k*NlocalT*Sizeof(type), - k*NhaloP*Sizeof(type), 0, 0, "async: true"); } else { - gatherHalo->Gather(exchange->o_haloBuf, o_v, - k, type, Add, NoTrans); + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //if not using gpu-aware mpi move the halo buffer to the host + pinnedMemory haloBuf = exchange->h_workspace; + + if (gathered_halo) { + //wait for o_v to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_v + k*NlocalT, NhaloP*k, + 0, "async: true"); + device.setStream(currentStream); + } else { + //collect halo buffer + gatherHalo->Gather(o_haloBuf, o_v, k, Add, NoTrans); + + //wait for o_haloBuf to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_haloBuf, NhaloP*k, + 0, "async: true"); + device.setStream(currentStream); + } } - - //prepare MPI exchange - exchange->Start(k, type, Add, NoTrans); } -void halo_t::ExchangeFinish(occa::memory& o_v, - const int k, - const Type type){ +template +void halo_t::ExchangeFinish(deviceMemory o_v, const int k){ - //finish MPI exchange - exchange->Finish(k, type, Add, NoTrans); + deviceMemory o_haloBuf = exchange->o_workspace; //write exchanged halo buffer back to vector - if (gathered_halo) { - //if this halo was build from a gathered ogs the halo nodes are at the end - if (NhaloP) - exchange->o_haloBuf.copyTo(o_v + k*(NlocalT+NhaloP)*Sizeof(type), - k*Nhalo*Sizeof(type), - 0, k*NhaloP*Sizeof(type), "async: true"); + if (exchange->gpu_aware) { + //finish MPI exchange + exchange->Finish(o_haloBuf, k, Add, NoTrans); + + if (gathered_halo) { + o_haloBuf.copyTo(o_v + k*(NlocalT+NhaloP), k*Nhalo, + k*NhaloP, "async: true"); + } else { + gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans); + } } else { - gatherHalo->Scatter(o_v, exchange->o_haloBuf, - k, type, Add, NoTrans); + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, Add, NoTrans); + exchange->Finish(haloBuf, k, Add, NoTrans); + + // copy recv back to device + if (gathered_halo) { + haloBuf.copyTo(o_v + k*(NlocalT+NhaloP), k*Nhalo, + k*NhaloP, "async: true"); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } else { + haloBuf.copyTo(o_haloBuf+k*NhaloP, k*Nhalo, + k*NhaloP, "async: true"); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + + gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans); + } } } +template void halo_t::Exchange(deviceMemory o_v, const int k); +template void halo_t::Exchange(deviceMemory o_v, const int k); +template void halo_t::Exchange(deviceMemory o_v, const int k); +template void halo_t::Exchange(deviceMemory o_v, const int k); + //host version -void halo_t::Exchange(void* v, - const int k, - const Type type) { - exchange->AllocBuffer(k*Sizeof(type)); +template +void halo_t::Exchange(memory v, const int k) { + ExchangeStart (v, k); + ExchangeFinish(v, k); +} + +template +void halo_t::ExchangeStart(memory v, const int k) { + exchange->AllocBuffer(k*sizeof(T)); + + pinnedMemory haloBuf = exchange->h_workspace; //collect halo buffer if (gathered_halo) { //if this halo was build from a gathered ogs the halo nodes are at the end - std::memcpy(exchange->haloBuf, - static_cast(v) + k*NlocalT*Sizeof(type), - k*NhaloP*Sizeof(type)); + haloBuf.copyFrom(v + k*NlocalT, k*NhaloP); } else { - gatherHalo->Gather(exchange->haloBuf, v, - k, type, Add, NoTrans); + gatherHalo->Gather(haloBuf, v, k, Add, NoTrans); } - //MPI exchange - exchange->Start (k, type, Add, NoTrans, true); - exchange->Finish(k, type, Add, NoTrans, true); + //Prepare MPI exchange + exchange->Start(haloBuf, k, Add, NoTrans); +} + +template +void halo_t::ExchangeFinish(memory v, const int k) { + + pinnedMemory haloBuf = exchange->h_workspace; + + //finish MPI exchange + exchange->Finish(haloBuf, k, Add, NoTrans); //write exchanged halo buffer back to vector if (gathered_halo) { //if this halo was build from a gathered ogs the halo nodes are at the end - std::memcpy(static_cast(v) + k*(NlocalT+NhaloP)*Sizeof(type), - static_cast(exchange->haloBuf) - + k*NhaloP*Sizeof(type), - k*Nhalo*Sizeof(type)); + haloBuf.copyTo(v + k*(NlocalT+NhaloP), + k*Nhalo, + k*NhaloP); } else { - gatherHalo->Scatter(v, exchange->haloBuf, - k, type, Add, NoTrans); + gatherHalo->Scatter(v, haloBuf, k, NoTrans); } } +template void halo_t::Exchange(memory v, const int k); +template void halo_t::Exchange(memory v, const int k); +template void halo_t::Exchange(memory v, const int k); +template void halo_t::Exchange(memory v, const int k); + /******************************** * Combine ********************************/ -void halo_t::Combine(occa::memory& o_v, - const int k, - const Type type) { - CombineStart (o_v, k, type); - CombineFinish(o_v, k, type); +template +void halo_t::Combine(deviceMemory o_v, const int k) { + CombineStart (o_v, k); + CombineFinish(o_v, k); } -void halo_t::CombineStart(occa::memory& o_v, - const int k, - const Type type){ - exchange->AllocBuffer(k*Sizeof(type)); +template +void halo_t::CombineStart(deviceMemory o_v, const int k){ + exchange->AllocBuffer(k*sizeof(T)); - //collect halo buffer - if (gathered_halo) { - //if this halo was build from a gathered ogs the halo nodes are at the end - if (NhaloT) - exchange->o_haloBuf.copyFrom(o_v + k*NlocalT*Sizeof(type), - k*NhaloT*Sizeof(type), 0, 0, "async: true"); + deviceMemory o_haloBuf = exchange->o_workspace; + + if (exchange->gpu_aware) { + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + o_haloBuf.copyFrom(o_v + k*NlocalT, k*NhaloT, + 0, "async: true"); + } else { + //collect halo buffer + gatherHalo->Gather(o_haloBuf, o_v, k, Add, Trans); + } + + //prepare MPI exchange + exchange->Start(o_haloBuf, k, Add, Trans); } else { - gatherHalo->Gather(exchange->o_haloBuf, o_v, - k, type, Add, Trans); + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //if not using gpu-aware mpi move the halo buffer to the host + pinnedMemory haloBuf = exchange->h_workspace; + + if (gathered_halo) { + //wait for o_v to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_v + k*NlocalT, NhaloT*k, + 0, "async: true"); + device.setStream(currentStream); + } else { + //collect halo buffer + gatherHalo->Gather(o_haloBuf, o_v, k, Add, Trans); + + //wait for o_haloBuf to be ready + device.finish(); + + //queue copy to host + device.setStream(dataStream); + haloBuf.copyFrom(o_haloBuf, NhaloT*k, + 0, "async: true"); + device.setStream(currentStream); + } } - - //prepare MPI exchange - exchange->Start(k, type, Add, Trans); } +template +void halo_t::CombineFinish(deviceMemory o_v, const int k){ -void halo_t::CombineFinish(occa::memory& o_v, - const int k, - const Type type){ - - //finish MPI exchange - exchange->Finish(k, type, Add, Trans); + deviceMemory o_haloBuf = exchange->o_workspace; //write exchanged halo buffer back to vector - if (gathered_halo) { - //if this halo was build from a gathered ogs the halo nodes are at the end - if (NhaloP) - exchange->o_haloBuf.copyTo(o_v + k*NlocalT*Sizeof(type), - k*NhaloP*Sizeof(type), - 0, 0, "async: true"); + if (exchange->gpu_aware) { + //finish MPI exchange + exchange->Finish(o_haloBuf, k, Add, Trans); + + if (gathered_halo) { + //if this halo was build from a gathered ogs the halo nodes are at the end + o_haloBuf.copyTo(o_v + k*NlocalT, k*NhaloP, + 0, "async: true"); + } else { + gatherHalo->Scatter(o_v, o_haloBuf, k, Trans); + } } else { - gatherHalo->Scatter(o_v, exchange->o_haloBuf, - k, type, Add, Trans); + pinnedMemory haloBuf = exchange->h_workspace; + + //get current stream + device_t &device = platform.device; + stream_t currentStream = device.getStream(); + + //synchronize data stream to ensure the buffer is on the host + device.setStream(dataStream); + device.finish(); + + /*MPI exchange of host buffer*/ + exchange->Start (haloBuf, k, Add, Trans); + exchange->Finish(haloBuf, k, Add, Trans); + + if (gathered_halo) { + // copy recv back to device + haloBuf.copyTo(o_v + k*NlocalT, NhaloP*k, + 0, "async: true"); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + } else { + haloBuf.copyTo(o_haloBuf, NhaloP*k, + 0, "async: true"); + device.finish(); //wait for transfer to finish + device.setStream(currentStream); + + gatherHalo->Scatter(o_v, o_haloBuf, k, Trans); + } } } +template void halo_t::Combine(deviceMemory o_v, const int k); +template void halo_t::Combine(deviceMemory o_v, const int k); +template void halo_t::Combine(deviceMemory o_v, const int k); +template void halo_t::Combine(deviceMemory o_v, const int k); + //host version -void halo_t::Combine(void* v, - const int k, - const Type type) { - exchange->AllocBuffer(k*Sizeof(type)); +template +void halo_t::Combine(memory v, const int k) { + CombineStart (v, k); + CombineFinish(v, k); +} + +template +void halo_t::CombineStart(memory v, const int k) { + exchange->AllocBuffer(k*sizeof(T)); + + pinnedMemory haloBuf = exchange->h_workspace; //collect halo buffer if (gathered_halo) { //if this halo was build from a gathered ogs the halo nodes are at the end - std::memcpy(exchange->haloBuf, - static_cast(v) + k*NlocalT*Sizeof(type), - k*NhaloT*Sizeof(type)); + haloBuf.copyFrom(v + k*NlocalT, k*NhaloT); } else { - gatherHalo->Gather(exchange->haloBuf, v, - k, type, Add, Trans); + gatherHalo->Gather(haloBuf, v, k, Add, Trans); } - //MPI exchange - exchange->Start (k, type, Add, Trans, true); - exchange->Finish(k, type, Add, Trans, true); + //Prepare MPI exchange + exchange->Start(haloBuf, k, Add, Trans); +} + + +template +void halo_t::CombineFinish(memory v, const int k) { + + pinnedMemory haloBuf = exchange->h_workspace; + + //finish MPI exchange + exchange->Finish(haloBuf, k, Add, Trans); //write exchanged halo buffer back to vector if (gathered_halo) { //if this halo was build from a gathered ogs the halo nodes are at the end - std::memcpy(static_cast(v) + k*NlocalT*Sizeof(type), - exchange->haloBuf, - k*NhaloP*Sizeof(type)); + haloBuf.copyTo(v + k*NlocalT, k*NhaloP); } else { - gatherHalo->Scatter(v, exchange->haloBuf, - k, type, Add, Trans); + gatherHalo->Scatter(v, haloBuf, k, Trans); } } +template void halo_t::Combine(memory v, const int k); +template void halo_t::Combine(memory v, const int k); +template void halo_t::Combine(memory v, const int k); +template void halo_t::Combine(memory v, const int k); + } //namespace ogs } //namespace libp diff --git a/libs/ogs/ogsOperator.cpp b/libs/ogs/ogsOperator.cpp index 5546ca8..2ee7833 100644 --- a/libs/ogs/ogsOperator.cpp +++ b/libs/ogs/ogsOperator.cpp @@ -35,31 +35,34 @@ namespace ogs { template struct Op_Add { - const T init(){ return T{0}; } - void operator()(T& gv, const T v) { gv += v; } + inline const T init(){ return T{0}; } + inline void operator()(T& gv, const T v) { gv += v; } }; template struct Op_Mul { - const T init(){ return T{1}; } - void operator()(T& gv, const T v) { gv *= v; } + inline const T init(){ return T{1}; } + inline void operator()(T& gv, const T v) { gv *= v; } }; template struct Op_Max { - const T init(){ return -std::numeric_limits::max(); } - void operator()(T& gv, const T v) { gv = (v>gv) ? v : gv; } + inline const T init(){ return -std::numeric_limits::max(); } + inline void operator()(T& gv, const T v) { gv = (v>gv) ? v : gv; } }; template struct Op_Min { - const T init() {return std::numeric_limits::max(); } - void operator()(T& gv, const T v) { gv = (v::max(); } + inline void operator()(T& gv, const T v) { gv = (v class Op> -void ogsOperator_t::Gather(T* gv, - const T* v, +template class U, + template class V, + template class Op, + typename T> +void ogsOperator_t::Gather(U gv, + const V v, const int K, const Transpose trans) { @@ -75,6 +78,9 @@ void ogsOperator_t::Gather(T* gv, colIds = colIdsT.ptr(); } + const T* v_ptr = v.ptr(); + T* gv_ptr = gv.ptr(); + #pragma omp parallel for for(dlong n=0;n().init(); for(dlong g=start;g()(val, v[k+colIds[g]*K]); + Op()(val, v_ptr[k+colIds[g]*K]); } - gv[k+n*K] = val; + gv_ptr[k+n*K] = val; } } } -void ogsOperator_t::Gather(void* gv, - const void* v, - const int k, - const Type type, - const Op op, - const Transpose trans) { +template class U, + template class V, + typename T> +void ogsOperator_t::Gather(U gv, + const V v, + const int k, + const Op op, + const Transpose trans) { switch (op){ case Add: - switch (type){ - case Float: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Double: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Int32: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Int64: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - } - break; + Gather(gv, v, k, trans); break; case Mul: - switch (type){ - case Float: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Double: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Int32: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Int64: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - } - break; + Gather(gv, v, k, trans); break; case Max: - switch (type){ - case Float: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Double: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Int32: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Int64: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - } - break; + Gather(gv, v, k, trans); break; case Min: - switch (type){ - case Float: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Double: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Int32: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - case Int64: Gather - (static_cast(gv), - static_cast( v), - k, trans); break; - } - break; + Gather(gv, v, k, trans); break; } } -void ogsOperator_t::Gather(occa::memory& o_gv, - occa::memory& o_v, - const int k, - const Type type, - const Op op, - const Transpose trans) { +template +void ogsOperator_t::Gather(memory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(memory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(memory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(memory gv, const memory v, + const int k, const Op op, const Transpose trans); + +template +void ogsOperator_t::Gather(pinnedMemory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const memory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const memory v, + const int k, const Op op, const Transpose trans); + +template +void ogsOperator_t::Gather(pinnedMemory gv, const pinnedMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const pinnedMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const pinnedMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(pinnedMemory gv, const pinnedMemory v, + const int k, const Op op, const Transpose trans); + + +template +void ogsOperator_t::Gather(deviceMemory o_gv, + deviceMemory o_v, + const int k, + const Op op, + const Transpose trans) { + constexpr Type type = ogsType::get(); InitializeKernels(platform, type, op); if (trans==NoTrans) { @@ -209,14 +186,28 @@ void ogsOperator_t::Gather(occa::memory& o_gv, } } +template +void ogsOperator_t::Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); +template +void ogsOperator_t::Gather(deviceMemory gv, const deviceMemory v, + const int k, const Op op, const Transpose trans); + + /******************************** * Scatter Operation ********************************/ -template -void ogsOperator_t::Scatter(T* v, - const T* gv, - const int K, - const Transpose trans) { +template class U, + template class V, + typename T> +void ogsOperator_t::Scatter(U v, const V gv, + const int K, const Transpose trans) { dlong Nrows; dlong *rowStarts, *colIds; @@ -230,6 +221,9 @@ void ogsOperator_t::Scatter(T* v, colIds = colIdsT.ptr(); } + T* v_ptr = v.ptr(); + const T* gv_ptr = gv.ptr(); + #pragma omp parallel for for(dlong n=0;n(static_cast(v), - static_cast(gv), - k, trans); break; - case Double: Scatter(static_cast(v), - static_cast(gv), - k, trans); break; - case Int32: Scatter(static_cast(v), - static_cast(gv), - k, trans); break; - case Int64: Scatter(static_cast(v), - static_cast(gv), - k, trans); break; - } -} +template +void ogsOperator_t::Scatter(memory v, const memory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const memory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const memory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const memory gv, + const int K, const Transpose trans); + +template +void ogsOperator_t::Scatter(memory v, const pinnedMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const pinnedMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const pinnedMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(memory v, const pinnedMemory gv, + const int K, const Transpose trans); -void ogsOperator_t::Scatter(occa::memory& o_v, - occa::memory& o_gv, - const int k, - const Type type, - const Op op, - const Transpose trans) { +template +void ogsOperator_t::Scatter(deviceMemory o_v, + deviceMemory o_gv, + const int k, + const Transpose trans) { + constexpr Type type = ogsType::get(); InitializeKernels(platform, type, Add); if (trans==Trans) { @@ -294,12 +292,26 @@ void ogsOperator_t::Scatter(occa::memory& o_v, } } +template +void ogsOperator_t::Scatter(deviceMemory v, const deviceMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(deviceMemory v, const deviceMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(deviceMemory v, const deviceMemory gv, + const int K, const Transpose trans); +template +void ogsOperator_t::Scatter(deviceMemory v, const deviceMemory gv, + const int K, const Transpose trans); + /******************************** * GatherScatter Operation ********************************/ -template class Op> -void ogsOperator_t::GatherScatter(T* v, - const int K, +template class U, + template class Op, + typename T> +void ogsOperator_t::GatherScatter(U v, const int K, const Transpose trans) { dlong Nrows; @@ -326,6 +338,8 @@ void ogsOperator_t::GatherScatter(T* v, sColIds = colIdsT.ptr(); } + T* v_ptr = v.ptr(); + #pragma omp parallel for for(dlong n=0;n().init(); for(dlong g=gstart;g()(val, v[k+gColIds[g]*K]); + Op()(val, v_ptr[k+gColIds[g]*K]); } for(dlong s=sstart;s class U, + typename T> +void ogsOperator_t::GatherScatter(U v, const int k, - const Type type, const Op op, const Transpose trans) { switch (op){ case Add: - switch (type){ - case Float: GatherScatter - (static_cast(v), k, trans); break; - case Double: GatherScatter - (static_cast(v), k, trans); break; - case Int32: GatherScatter - (static_cast(v), k, trans); break; - case Int64: GatherScatter - (static_cast(v), k, trans); break; - } - break; + GatherScatter(v, k, trans); break; case Mul: - switch (type){ - case Float: GatherScatter - (static_cast(v), k, trans); break; - case Double: GatherScatter - (static_cast(v), k, trans); break; - case Int32: GatherScatter - (static_cast(v), k, trans); break; - case Int64: GatherScatter - (static_cast(v), k, trans); break; - } - break; + GatherScatter(v, k, trans); break; case Max: - switch (type){ - case Float: GatherScatter - (static_cast(v), k, trans); break; - case Double: GatherScatter - (static_cast(v), k, trans); break; - case Int32: GatherScatter - (static_cast(v), k, trans); break; - case Int64: GatherScatter - (static_cast(v), k, trans); break; - } - break; + GatherScatter(v, k, trans); break; case Min: - switch (type){ - case Float: GatherScatter - (static_cast(v), k, trans); break; - case Double: GatherScatter - (static_cast(v), k, trans); break; - case Int32: GatherScatter - (static_cast(v), k, trans); break; - case Int64: GatherScatter - (static_cast(v), k, trans); break; - } - break; + GatherScatter(v, k, trans); break; } } -void ogsOperator_t::GatherScatter(occa::memory& o_v, +template +void ogsOperator_t::GatherScatter(memory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(memory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(memory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(memory v,const int k, + const Op op, const Transpose trans); + +template +void ogsOperator_t::GatherScatter(deviceMemory o_v, const int k, - const Type type, const Op op, const Transpose trans) { + constexpr Type type = ogsType::get(); InitializeKernels(platform, type, Add); if (trans==Trans) { @@ -442,6 +431,19 @@ void ogsOperator_t::GatherScatter(occa::memory& o_v, } } +template +void ogsOperator_t::GatherScatter(deviceMemory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(deviceMemory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(deviceMemory v,const int k, + const Op op, const Transpose trans); +template +void ogsOperator_t::GatherScatter(deviceMemory v,const int k, + const Op op, const Transpose trans); + void ogsOperator_t::setupRowBlocks() { dlong blockSumN=0, blockSumT=0; @@ -452,24 +454,15 @@ void ogsOperator_t::setupRowBlocks() { for (dlong i=0;i gatherNodesPerBlock) { - //this row is pathalogically big. We can't currently run this - std::stringstream ss; - ss << "Multiplicity of global node id: " << i - << " in ogsOperator_t::setupRowBlocks is too large."; - HIPBONE_ABORT(ss.str()) - } - const dlong rowSizeT = rowStartsT[i+1]-rowStartsT[i]; - if (rowSizeT > gatherNodesPerBlock) { - //this row is pathalogically big. We can't currently run this - std::stringstream ss; - ss << "Multiplicity of global node id: " << i - << " in ogsOperator_t::setupRowBlocks is too large."; - HIPBONE_ABORT(ss.str()) - } + //this row is pathalogically big. We can't currently run this + LIBP_ABORT("Multiplicity of global node id: " << i + << " in ogsOperator_t::setupRowBlocks is too large.", + rowSizeN > gatherNodesPerBlock); + LIBP_ABORT("Multiplicity of global node id: " << i + << " in ogsOperator_t::setupRowBlocks is too large.", + rowSizeT > gatherNodesPerBlock); if (blockSumN+rowSizeN > gatherNodesPerBlock) { //adding this row will exceed the nnz per block NrowBlocksN++; //count the previous block @@ -544,47 +537,44 @@ void ogsOperator_t::Free() { } -template +template class U, + template class V, + typename T> void extract(const dlong N, const int K, - const dlong *ids, - const T *q, - T *gatherq) { + const memory ids, + const U q, + V gatherq) { + + const T* q_ptr = q.ptr(); + T* gatherq_ptr = gatherq.ptr(); for(dlong n=0;n(N, K, ids, - static_cast(q), - static_cast(gatherq)); - break; - case Double: extract(N, K, ids, - static_cast(q), - static_cast(gatherq)); - break; - case Int32: extract(N, K, ids, - static_cast(q), - static_cast(gatherq)); - break; - case Int64: extract(N, K, ids, - static_cast(q), - static_cast(gatherq)); - break; - } -} +template void extract(const dlong N, const int K, const memory ids, + const memory q, memory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const memory q, memory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const memory q, memory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const memory q, memory gatherq); + +template void extract(const dlong N, const int K, const memory ids, + const pinnedMemory q, pinnedMemory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const pinnedMemory q, pinnedMemory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const pinnedMemory q, pinnedMemory gatherq); +template void extract(const dlong N, const int K, const memory ids, + const pinnedMemory q, pinnedMemory gatherq); } //namespace ogs diff --git a/libs/ogs/ogsPairwise.cpp b/libs/ogs/ogsPairwise.cpp index a6583e2..2cf83e9 100644 --- a/libs/ogs/ogsPairwise.cpp +++ b/libs/ogs/ogsPairwise.cpp @@ -39,73 +39,110 @@ namespace libp { namespace ogs { -void ogsPairwise_t::Start(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host){ +/********************************** +* Host exchange +***********************************/ +template +inline void ogsPairwise_t::Start(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ - occa::device &device = platform.device; + pinnedMemory sendBuf = h_sendspace; - //get current stream - occa::stream currentStream = device.getStream(); + const int NranksSend = (trans==NoTrans) ? NranksSendN : NranksSendT; + const int NranksRecv = (trans==NoTrans) ? NranksRecvN : NranksRecvT; + const int *sendRanks = (trans==NoTrans) ? sendRanksN.ptr() : sendRanksT.ptr(); + const int *recvRanks = (trans==NoTrans) ? recvRanksN.ptr() : recvRanksT.ptr(); + const int *sendCounts = (trans==NoTrans) ? sendCountsN.ptr() : sendCountsT.ptr(); + const int *recvCounts = (trans==NoTrans) ? recvCountsN.ptr() : recvCountsT.ptr(); + const int *sendOffsets= (trans==NoTrans) ? sendOffsetsN.ptr() : sendOffsetsT.ptr(); + const int *recvOffsets= (trans==NoTrans) ? recvOffsetsN.ptr() : recvOffsetsT.ptr(); - const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT; - const dlong N = (trans == NoTrans) ? NhaloP : Nhalo; + //post recvs + for (int r=0;r +inline void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, + const Op op, const Transpose trans){ -void ogsPairwise_t::Finish(const int k, - const Type type, - const Op op, - const Transpose trans, - const bool host){ + const int NranksSend = (trans==NoTrans) ? NranksSendN : NranksSendT; + const int NranksRecv = (trans==NoTrans) ? NranksRecvN : NranksRecvT; + const int *recvOffsets= (trans==NoTrans) ? recvOffsetsN.ptr() : recvOffsetsT.ptr(); - const size_t Nbytes = k*Sizeof(type); - occa::device &device = platform.device; + comm.Waitall(NranksRecv+NranksSend, requests); - //get current stream - occa::stream currentStream = device.getStream(); + //if we recvieved anything via MPI, gather the recv buffer and scatter + // it back to to original vector + dlong Nrecv = recvOffsets[NranksRecv]; + if (Nrecv) { + // gather the recieved nodes + postmpi.Gather(buf, buf, k, op, trans); + } +} + +void ogsPairwise_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(pinnedMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + +/********************************** +* GPU-aware exchange +***********************************/ +template +void ogsPairwise_t::Start(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT; - if (Nsend && !gpu_aware && !host) { - //synchronize data stream to ensure the host buffer is on the host - device.setStream(dataStream); + if (Nsend) { + deviceMemory o_sendBuf = o_sendspace; + + // assemble the send buffer on device + if (trans == NoTrans) { + extractKernel[ogsType::get()](NsendN, k, o_sendIdsN, o_buf, o_sendBuf); + } else { + extractKernel[ogsType::get()](NsendT, k, o_sendIdsT, o_buf, o_sendBuf); + } + //wait for kernel to finish on default stream + device_t &device = platform.device; device.finish(); - device.setStream(dataStream); } +} +template +void ogsPairwise_t::Finish(deviceMemory &o_buf, + const int k, + const Op op, + const Transpose trans){ - char *sendPtr, *recvPtr; - if (gpu_aware && !host) { //device pointer - sendPtr = (char*)o_sendBuf.ptr(); - recvPtr = (char*)o_haloBuf.ptr() + Nhalo*Nbytes; - } else { //host pointer - sendPtr = sendBuf; - recvPtr = haloBuf + Nhalo*Nbytes; - } + deviceMemory o_sendBuf = o_sendspace; const int NranksSend = (trans==NoTrans) ? NranksSendN : NranksSendT; const int NranksRecv = (trans==NoTrans) ? NranksRecvN : NranksRecvT; @@ -118,59 +155,49 @@ void ogsPairwise_t::Finish(const int k, //post recvs for (int r=0;r &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Start(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Start(buf, k, op, trans); } +void ogsPairwise_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } +void ogsPairwise_t::Finish(deviceMemory &buf, const int k, const Op op, const Transpose trans) { Finish(buf, k, op, trans); } + ogsPairwise_t::ogsPairwise_t(dlong Nshared, - libp::memory &sharedNodes, + memory &sharedNodes, ogsOperator_t& gatherHalo, - MPI_Comm _comm, + stream_t _dataStream, + comm_t _comm, platform_t &_platform): - ogsExchange_t(_platform,_comm) { + ogsExchange_t(_platform,_comm,_dataStream) { Nhalo = gatherHalo.NrowsT; NhaloP = gatherHalo.NrowsN; @@ -185,14 +212,14 @@ ogsPairwise_t::ogsPairwise_t(dlong Nshared, }); //make mpi allgatherv counts and offsets - libp::memory mpiSendCountsT(size,0); - libp::memory mpiSendCountsN(size,0); - libp::memory mpiRecvCountsT(size); - libp::memory mpiRecvCountsN(size); - libp::memory mpiSendOffsetsT(size+1); - libp::memory mpiSendOffsetsN(size+1); - libp::memory mpiRecvOffsetsT(size+1); - libp::memory mpiRecvOffsetsN(size+1); + memory mpiSendCountsT(size,0); + memory mpiSendCountsN(size,0); + memory mpiRecvCountsT(size); + memory mpiRecvCountsN(size); + memory mpiSendOffsetsT(size+1); + memory mpiSendOffsetsN(size+1); + memory mpiRecvOffsetsT(size+1); + memory mpiRecvOffsetsN(size+1); for (dlong n=0;n recvNodes(Nrecv); + memory recvNodes(Nrecv); //Send list of nodes to each rank - MPI_Alltoallv(sharedNodes.ptr(), mpiSendCountsT.ptr(), mpiSendOffsetsT.ptr(), MPI_PARALLELNODE_T, - recvNodes.ptr(), mpiRecvCountsT.ptr(), mpiRecvOffsetsT.ptr(), MPI_PARALLELNODE_T, - comm); - MPI_Barrier(comm); + comm.Alltoallv(sharedNodes, mpiSendCountsT, mpiSendOffsetsT, + recvNodes, mpiRecvCountsT, mpiRecvOffsetsT); //make ops for gathering halo nodes after an MPI_Allgatherv postmpi.platform = platform; @@ -258,10 +281,10 @@ ogsPairwise_t::ogsPairwise_t(dlong Nshared, postmpi.rowStartsT.calloc(Nhalo+1); //make array of counters - libp::memory haloGatherTCounts(Nhalo); - libp::memory haloGatherNCounts(Nhalo); + memory haloGatherTCounts(Nhalo); + memory haloGatherNCounts(Nhalo); - //count the data that will already be in haloBuf + //count the data that will already be in h_haloBuf.ptr() for (dlong n=0;n(platform.hostMalloc(postmpi.nnzT*Nbytes, nullptr, h_haloBuf)); - o_haloBuf = platform.malloc(postmpi.nnzT*Nbytes); + if (o_workspace.size() < postmpi.nnzT*Nbytes) { + h_workspace = platform.hostMalloc(postmpi.nnzT*Nbytes); + o_workspace = platform.malloc(postmpi.nnzT*Nbytes); } - if (o_sendBuf.size() < NsendT*Nbytes) { - if (o_sendBuf.size()) o_sendBuf.free(); - sendBuf = static_cast(platform.hostMalloc(NsendT*Nbytes, nullptr, h_sendBuf)); - o_sendBuf = platform.malloc(NsendT*Nbytes); + if (o_sendspace.size() < NsendT*Nbytes) { + h_sendspace = platform.hostMalloc(NsendT*Nbytes); + o_sendspace = platform.malloc(NsendT*Nbytes); } } diff --git a/libs/ogs/ogsSetup.cpp b/libs/ogs/ogsSetup.cpp index 6323752..96608bd 100644 --- a/libs/ogs/ogsSetup.cpp +++ b/libs/ogs/ogsSetup.cpp @@ -28,6 +28,7 @@ SOFTWARE. #include "ogs/ogsUtils.hpp" #include "ogs/ogsOperator.hpp" #include "ogs/ogsExchange.hpp" +#include "timer.hpp" #ifdef GLIBCXX_PARALLEL #include @@ -41,8 +42,8 @@ namespace libp { namespace ogs { void ogs_t::Setup(const dlong _N, - hlong *ids, - MPI_Comm _comm, + memory ids, + comm_t _comm, const Kind _kind, const Method method, const bool _unique, @@ -52,8 +53,8 @@ void ogs_t::Setup(const dlong _N, } void halo_t::Setup(const dlong _N, - hlong *ids, - MPI_Comm _comm, + memory ids, + comm_t _comm, const Method method, const bool verbose, platform_t& _platform){ @@ -66,8 +67,8 @@ void halo_t::Setup(const dlong _N, * Setup ********************************/ void ogsBase_t::Setup(const dlong _N, - hlong *ids, - MPI_Comm _comm, + memory ids, + comm_t _comm, const Kind _kind, const Method method, const bool _unique, @@ -77,21 +78,26 @@ void ogsBase_t::Setup(const dlong _N, //release resources if this ogs was setup before Free(); + timePoint_t start = Time(); + platform = _platform; + if (!dataStream.isInitialized()) + dataStream = platform.device.createStream(); + N = _N; comm = _comm; kind = _kind; unique = _unique; int rank, size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + rank = comm.rank(); + size = comm.size(); //sanity check options - if ( (kind==Unsigned && unique==true) - || (kind==Halo && unique==true) ) - HIPBONE_ABORT("Invalid ogs setup requested"); + LIBP_ABORT("Invalid ogs setup requested", + (kind==Unsigned && unique==true) + || (kind==Halo && unique==true)); //count how many ids are non-zero dlong Nids=0; @@ -99,7 +105,7 @@ void ogsBase_t::Setup(const dlong _N, if (ids[n]!=0) Nids++; // make list of nodes - libp::memory nodes(Nids); + memory nodes(Nids); //fill the data (squeezing out zero ids) Nids=0; @@ -114,9 +120,6 @@ void ogsBase_t::Setup(const dlong _N, } } - /*Register MPI_PARALLELNODE_T type*/ - InitMPIType(); - //flag which nodes are shared via MPI FindSharedNodes(Nids, nodes, verbose); @@ -124,7 +127,7 @@ void ogsBase_t::Setup(const dlong _N, // construct sharedNodes which contains all the info // we need to setup the MPI exchange. dlong Nshared=0; - libp::memory sharedNodes; + memory sharedNodes; ConstructSharedNodes(Nids, nodes, Nshared, sharedNodes); Nids=0; @@ -159,15 +162,18 @@ void ogsBase_t::Setup(const dlong _N, if (method == AllToAll) { exchange = std::shared_ptr( new ogsAllToAll_t(Nshared, sharedNodes, - *gatherHalo, comm, platform)); + *gatherHalo, dataStream, + comm, platform)); } else if (method == Pairwise) { exchange = std::shared_ptr( new ogsPairwise_t(Nshared, sharedNodes, - *gatherHalo, comm, platform)); + *gatherHalo, dataStream, + comm, platform)); } else if (method == CrystalRouter) { exchange = std::shared_ptr( new ogsCrystalRouter_t(Nshared, sharedNodes, - *gatherHalo, comm, platform)); + *gatherHalo, dataStream, + comm, platform)); } else { //Auto exchange = std::shared_ptr( AutoSetup(Nshared, sharedNodes, @@ -175,30 +181,33 @@ void ogsBase_t::Setup(const dlong _N, platform, verbose)); } - /*Free the MPI_PARALLELNODE_T type*/ - DestroyMPIType(); + timePoint_t end = GlobalPlatformTime(platform); + double elapsedTime = ElapsedTime(start, end); + + if (!rank && verbose) { + std::cout << "ogs Setup Time: " << elapsedTime << " seconds." << std::endl; + } } void ogsBase_t::FindSharedNodes(const dlong Nids, - libp::memory &nodes, + memory &nodes, const int verbose){ int rank, size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + rank = comm.rank(); + size = comm.size(); - libp::memory sendCounts(size,0); - libp::memory recvCounts(size); - libp::memory sendOffsets(size+1); - libp::memory recvOffsets(size+1); + memory sendCounts(size,0); + memory recvCounts(size); + memory sendOffsets(size+1); + memory recvOffsets(size+1); //count number of ids we're sending for (dlong n=0;n recvNodes(recvN); + memory recvNodes(recvN); //Send all the nodes to their destination rank. - MPI_Alltoallv( nodes.ptr(), sendCounts.ptr(), sendOffsets.ptr(), MPI_PARALLELNODE_T, - recvNodes.ptr(), recvCounts.ptr(), recvOffsets.ptr(), MPI_PARALLELNODE_T, - comm); + comm.Alltoallv( nodes, sendCounts, sendOffsets, + recvNodes, recvCounts, recvOffsets); //remember this ordering for (dlong n=0;n0) positiveCount++; //if we didnt find a sole positive baseId, the gather is not well-defined - if (positiveCount!=1) locally_unique=0; + if (positiveCount!=1) is_unique=0; } // When making a halo excahnge, check that we have a leading positive id - if (kind==Halo && positiveCount!=1) { - std::stringstream ss; - ss << "Found " << positiveCount << " positive Ids for baseId: " - << abs(recvNodes[start].baseId)<< "."; - HIPBONE_ABORT(ss.str()); - } + LIBP_ABORT("Found " << positiveCount << " positive Ids for baseId: " + << abs(recvNodes[start].baseId)<< ".", + kind==Halo && positiveCount!=1); //determine if this node is shared via MPI, int shared=1; @@ -304,13 +309,11 @@ void ogsBase_t::FindSharedNodes(const dlong Nids, } //shared the unique node check so we know if the gather operation is well-defined - int globally_unique=1; - MPI_Allreduce(&locally_unique, &globally_unique, 1, MPI_INT, MPI_MIN, comm); - gather_defined = (globally_unique==1); + comm.Allreduce(is_unique, comm_t::Min); + gather_defined = (is_unique==1); - hlong Nshared_local = Nshared; hlong Nshared_global = Nshared; - MPI_Reduce(&Nshared_local, &Nshared_global, 1, MPI_HLONG, MPI_SUM, 0, comm); + comm.Reduce(Nshared_global, 0); if (!rank && verbose) { std::cout << "ogs Setup: " << Nshared_global << " unique labels shared." << std::endl; } @@ -323,19 +326,16 @@ void ogsBase_t::FindSharedNodes(const dlong Nids, permute(recvN, recvNodes, [](const parallelNode_t& a) { return a.newId; } ); //Return all the nodes to their origin rank. - MPI_Alltoallv(recvNodes.ptr(), recvCounts.ptr(), recvOffsets.ptr(), MPI_PARALLELNODE_T, - nodes.ptr(), sendCounts.ptr(), sendOffsets.ptr(), MPI_PARALLELNODE_T, - comm); + comm.Alltoallv(recvNodes, recvCounts, recvOffsets, + nodes, sendCounts, sendOffsets); } void ogsBase_t::ConstructSharedNodes(const dlong Nids, - libp::memory &nodes, + memory &nodes, dlong &Nshared, - libp::memory &sharedNodes) { + memory &sharedNodes) { - int rank, size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); + int size = comm.size(); // sort based on abs(baseId) sort(nodes.ptr(), nodes.ptr()+Nids, @@ -388,11 +388,11 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids, Ngather = NlocalP+NhaloP; //global total - hlong NgatherLocal = (hlong) Ngather; - MPI_Allreduce(&NgatherLocal, &(NgatherGlobal), 1, MPI_HLONG, MPI_SUM, comm); + NgatherGlobal = Ngather; + comm.Allreduce(NgatherGlobal); //extract the leading node from each shared baseId - libp::memory sendSharedNodes(NhaloT); + memory sendSharedNodes(NhaloT); NhaloT=0; for (dlong n=0;n indexMap(NbaseIds, -1); + memory indexMap(NbaseIds, -1); dlong localCntN = 0, localCntT = NlocalP; //start point for local gather nodes dlong haloCntN = 0, haloCntT = NhaloP; //start point for halo gather nodes @@ -441,10 +441,10 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids, indexMap.free(); - libp::memory sendCounts(size,0); - libp::memory recvCounts(size); - libp::memory sendOffsets(size+1); - libp::memory recvOffsets(size+1); + memory sendCounts(size,0); + memory recvCounts(size); + memory sendOffsets(size+1); + memory recvOffsets(size+1); // sort based on destination rank sort(sendSharedNodes.ptr(), sendSharedNodes.ptr()+NhaloT, @@ -457,8 +457,7 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids, sendCounts[sendSharedNodes[n].destRank]++; } - MPI_Alltoall(sendCounts.ptr(), 1, MPI_INT, - recvCounts.ptr(), 1, MPI_INT, comm); + comm.Alltoall(sendCounts, recvCounts); sendOffsets[0] = 0; recvOffsets[0] = 0; @@ -468,15 +467,13 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids, } dlong recvN = recvOffsets[size]; //total ids to recv - libp::memory recvSharedNodes(recvN); + memory recvSharedNodes(recvN); //Send all the nodes to their destination rank. - MPI_Alltoallv(sendSharedNodes.ptr(), sendCounts.ptr(), sendOffsets.ptr(), MPI_PARALLELNODE_T, - recvSharedNodes.ptr(), recvCounts.ptr(), recvOffsets.ptr(), MPI_PARALLELNODE_T, - comm); + comm.Alltoallv(sendSharedNodes, sendCounts, sendOffsets, + recvSharedNodes, recvCounts, recvOffsets); //free up some space - MPI_Barrier(comm); sendSharedNodes.free(); sendCounts.free(); recvCounts.free(); @@ -490,10 +487,10 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids, }); //count number of shared nodes we will be sending - libp::memory sharedSendCounts(size,0); - libp::memory sharedRecvCounts(size); - libp::memory sharedSendOffsets(size+1); - libp::memory sharedRecvOffsets(size+1); + memory sharedSendCounts(size,0); + memory sharedRecvCounts(size); + memory sharedSendOffsets(size+1); + memory sharedRecvOffsets(size+1); start=0; for (dlong n=0;n sharedSendNodes(sharedSendOffsets[size]); + memory sharedSendNodes(sharedSendOffsets[size]); //reset sendCounts for (int r=0;r(Nshared); + sharedNodes = memory(Nshared); //Share all the gathering info - MPI_Alltoallv(sharedSendNodes.ptr(), sharedSendCounts.ptr(), sharedSendOffsets.ptr(), MPI_PARALLELNODE_T, - sharedNodes.ptr(), sharedRecvCounts.ptr(), sharedRecvOffsets.ptr(), MPI_PARALLELNODE_T, - comm); + comm.Alltoallv(sharedSendNodes, sharedSendCounts, sharedSendOffsets, + sharedNodes, sharedRecvCounts, sharedRecvOffsets); } //Make local and halo gather operators using nodes list -void ogsBase_t::LocalSignedSetup(const dlong Nids, libp::memory &nodes){ - - int rank, size; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); +void ogsBase_t::LocalSignedSetup(const dlong Nids, memory &nodes){ gatherLocal = std::make_shared(platform); gatherHalo = std::make_shared(platform); @@ -593,10 +584,10 @@ void ogsBase_t::LocalSignedSetup(const dlong Nids, libp::memory //tally up how many nodes are being gathered to each gatherNode and // map to a local ordering - libp::memory localGatherNCounts(gatherLocal->NrowsT,0); - libp::memory localGatherTCounts(gatherLocal->NrowsT,0); - libp::memory haloGatherNCounts(gatherHalo->NrowsT,0); - libp::memory haloGatherTCounts(gatherHalo->NrowsT,0); + memory localGatherNCounts(gatherLocal->NrowsT,0); + memory localGatherTCounts(gatherLocal->NrowsT,0); + memory haloGatherNCounts(gatherHalo->NrowsT,0); + memory haloGatherTCounts(gatherHalo->NrowsT,0); for (dlong i=0;i } //Make local and halo gather operators using nodes list -void ogsBase_t::LocalUnsignedSetup(const dlong Nids, libp::memory &nodes){ +void ogsBase_t::LocalUnsignedSetup(const dlong Nids, memory &nodes){ gatherLocal = std::make_shared(platform); gatherHalo = std::make_shared(platform); @@ -710,8 +701,8 @@ void ogsBase_t::LocalUnsignedSetup(const dlong Nids, libp::memory localGatherTCounts(gatherLocal->NrowsT,0); - libp::memory haloGatherTCounts(gatherHalo->NrowsT,0); + memory localGatherTCounts(gatherLocal->NrowsT,0); + memory haloGatherTCounts(gatherHalo->NrowsT,0); for (dlong i=0;i &nodes){ +void ogsBase_t::LocalHaloSetup(const dlong Nids, memory &nodes){ gatherHalo = std::make_shared(platform); gatherHalo->kind = Signed; @@ -797,8 +788,8 @@ void ogsBase_t::LocalHaloSetup(const dlong Nids, libp::memory &n //tally up how many nodes are being gathered to each gatherNode and // map to a local ordering - libp::memory haloGatherNCounts(gatherHalo->NrowsT,0); - libp::memory haloGatherTCounts(gatherHalo->NrowsT,0); + memory haloGatherNCounts(gatherHalo->NrowsT,0); + memory haloGatherTCounts(gatherHalo->NrowsT,0); for (dlong i=0;i &n } void ogsBase_t::Free() { + comm.Free(); gatherLocal = nullptr; gatherHalo = nullptr; exchange = nullptr; @@ -867,17 +859,19 @@ void ogsBase_t::Free() { } void ogsBase_t::AssertGatherDefined() { - if (!gather_defined) { - HIPBONE_ABORT("Gather operation not well-defined."); - } + LIBP_ABORT("Gather operation not well-defined.", + !gather_defined); } //Populate the local mapping of the original ids and the gathered ordering -void ogs_t::SetupGlobalToLocalMapping(dlong *GlobalToLocal) { +void ogs_t::SetupGlobalToLocalMapping(memory GlobalToLocal) { + + LIBP_ABORT("ogs handle is not set up.", + NgatherGlobal==0); //Note: Must have GlobalToLocal have N entries. - libp::memory ids(NlocalT+NhaloT); + memory ids(NlocalT+NhaloT); for (dlong n=0;nScatter(GlobalToLocal, ids.ptr(), - 1, Dlong, Add, NoTrans); - gatherHalo->Scatter(GlobalToLocal, ids.ptr()+NlocalT, - 1, Dlong, Add, NoTrans); + gatherLocal->Scatter(GlobalToLocal, ids, + 1, NoTrans); + gatherHalo->Scatter(GlobalToLocal, ids+NlocalT, + 1, NoTrans); } void halo_t::SetupFromGather(ogs_t& ogs) { ogs.AssertGatherDefined(); + platform = ogs.platform; + comm = ogs.comm; + N = ogs.NlocalT + ogs.NhaloT; Ngather = Ngather; Nhalo = ogs.NhaloT - ogs.NhaloP; NgatherGlobal = ogs.NgatherGlobal; - comm = ogs.comm; kind = Halo; unique = ogs.unique; diff --git a/libs/ogs/ogsUtils.cpp b/libs/ogs/ogsUtils.cpp index b342020..4bdfc35 100644 --- a/libs/ogs/ogsUtils.cpp +++ b/libs/ogs/ogsUtils.cpp @@ -34,42 +34,13 @@ namespace libp { namespace ogs { -MPI_Datatype MPI_PARALLELNODE_T; - -void InitMPIType() { - // Make the MPI_PARALLELNODE_T data type - parallelNode_t node{}; - MPI_Datatype dtype[6] = {MPI_DLONG, MPI_HLONG, - MPI_DLONG, MPI_INT, - MPI_INT, MPI_INT}; - int blength[6] = {1, 1, 1, 1, 1, 1}; - MPI_Aint addr[6], displ[6]; - MPI_Get_address ( &(node.localId), addr+0); - MPI_Get_address ( &(node.baseId), addr+1); - MPI_Get_address ( &(node.newId), addr+2); - MPI_Get_address ( &(node.sign), addr+3); - MPI_Get_address ( &(node.rank), addr+4); - MPI_Get_address ( &(node.destRank), addr+5); - displ[0] = 0; - displ[1] = addr[1] - addr[0]; - displ[2] = addr[2] - addr[0]; - displ[3] = addr[3] - addr[0]; - displ[4] = addr[4] - addr[0]; - displ[5] = addr[5] - addr[0]; - MPI_Type_create_struct (6, blength, displ, dtype, &MPI_PARALLELNODE_T); - MPI_Type_commit (&MPI_PARALLELNODE_T); -} - -void DestroyMPIType() { - MPI_Type_free(&MPI_PARALLELNODE_T); -} +stream_t ogsBase_t::dataStream; -occa::kernel ogsOperator_t::gatherScatterKernel[4][4]; -occa::kernel ogsOperator_t::gatherKernel[4][4]; -occa::kernel ogsOperator_t::scatterKernel[4]; +kernel_t ogsOperator_t::gatherScatterKernel[4][4]; +kernel_t ogsOperator_t::gatherKernel[4][4]; +kernel_t ogsOperator_t::scatterKernel[4]; -occa::kernel ogsExchange_t::extractKernel[4]; -occa::stream ogsExchange_t::dataStream; +kernel_t ogsExchange_t::extractKernel[4]; void InitializeKernels(platform_t& platform, const Type type, const Op op) { @@ -77,7 +48,7 @@ void InitializeKernels(platform_t& platform, const Type type, const Op op) { //check if the gather kernel is initialized if (!ogsOperator_t::gatherKernel[type][op].isInitialized()) { - occa::properties kernelInfo = platform.props(); + properties_t kernelInfo = platform.props(); kernelInfo["defines/p_blockSize"] = ogsOperator_t::blockSize; kernelInfo["defines/p_gatherNodesPerBlock"] = ogsOperator_t::gatherNodesPerBlock; @@ -151,26 +122,6 @@ void InitializeKernels(platform_t& platform, const Type type, const Op op) { } } -size_t Sizeof(const Type type) { - switch(type) { - case Float: return sizeof(float); - case Double: return sizeof(double); - case Int32: return sizeof(int32_t); - case Int64: return sizeof(int64_t); - } - return 0; -} - -MPI_Datatype MPI_Type(const Type type) { - switch(type) { - case Float: return MPI_FLOAT; - case Double: return MPI_DOUBLE; - case Int32: return MPI_INT32_T; - case Int64: return MPI_INT64_T; - } - return 0; -} - } //namespace ogs } //namespace libp From 7f1726743aa9c338aa2638c1752f80db834c1cec Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 8 Mar 2022 16:11:47 -0600 Subject: [PATCH 06/12] Latest memory objects from libp --- include/memory.hpp | 822 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 626 insertions(+), 196 deletions(-) diff --git a/include/memory.hpp b/include/memory.hpp index d058e79..5c038f8 100644 --- a/include/memory.hpp +++ b/include/memory.hpp @@ -27,238 +27,668 @@ SOFTWARE. #ifndef LIBP_MEMORY_HPP #define LIBP_MEMORY_HPP -#include -#include -#include -#include -#include -#include +#include "utils.hpp" namespace libp { - template - class memory { - - private: - using size_t = std::size_t; - using ptrdiff_t = std::ptrdiff_t; - - std::shared_ptr shrdPtr; - size_t lngth; - size_t offset; - - public: - memory() : - lngth{0}, - offset{0} {} - - memory(const size_t lngth_) : - shrdPtr(new T[lngth_]), - lngth{lngth_}, - offset{0} {} - - memory(const size_t lngth_, - const T val) : - shrdPtr(new T[lngth_]), - lngth{lngth_}, - offset{0} { - #pragma omp parallel for - for (size_t i=0;i +class memory { + template friend class memory; + + private: + using size_t = std::size_t; + using ptrdiff_t = std::ptrdiff_t; + + std::shared_ptr shrdPtr; + size_t lngth; + size_t offset; + + public: + memory() : + lngth{0}, + offset{0} {} + + memory(const size_t lngth_) : + shrdPtr(new T[lngth_]), + lngth{lngth_}, + offset{0} {} + + memory(const size_t lngth_, + const T val) : + shrdPtr(new T[lngth_]), + lngth{lngth_}, + offset{0} { + #pragma omp parallel for + for (size_t i=0;i + memory(const memory &m): + shrdPtr{std::reinterpret_pointer_cast(m.shrdPtr)}, + lngth{m.lngth*sizeof(T)/sizeof(U)}, + offset{m.offset*sizeof(T)/sizeof(U)} { + // Check that this conversion made sense + LIBP_ABORT("libp::memory type conversion failed. Trying to convert " + << m.lngth << " " << sizeof(T) << "-byte words to " + << lngth << " " << sizeof(U) << "-byte words.", + lngth*sizeof(U) != m.lngth*sizeof(T)); + + LIBP_ABORT("libp::memory type conversion failed. Source memory has offset at " + << m.lngth << " " << sizeof(T) << "-byte words, destination memory would have offset at" + << lngth << " " << sizeof(U) << "-byte words.", + offset*sizeof(U) != m.offset*sizeof(T)); + } - memory(const memory &m)=default; - memory& operator = (const memory &m)=default; - ~memory()=default; + memory(const memory &m)=default; + memory& operator = (const memory &m)=default; + ~memory()=default; - void malloc(const size_t lngth_) { - *this = libp::memory(lngth_); - } + void malloc(const size_t lngth_) { + *this = memory(lngth_); + } - void calloc(const size_t lngth_) { - *this = libp::memory(lngth_, T{0}); - } + void calloc(const size_t lngth_) { + *this = memory(lngth_, T{0}); + } - void realloc(const size_t lngth_) { - libp::memory m(lngth_); - const ptrdiff_t cnt = std::min(lngth, lngth_); - m.copyFrom(*this, cnt); - *this = m; - } + void realloc(const size_t lngth_) { + memory m(lngth_); + const ptrdiff_t cnt = std::min(lngth, lngth_); + m.copyFrom(*this, cnt); + *this = m; + } - memory& swap(memory &m) { - std::swap(shrdPtr, m.shrdPtr); - std::swap(lngth, m.lngth); - std::swap(offset, m.offset); - return *this; - } + memory& swap(memory &m) { + std::swap(shrdPtr, m.shrdPtr); + std::swap(lngth, m.lngth); + std::swap(offset, m.offset); + return *this; + } - T* ptr() { - return shrdPtr.get()+offset; - } - const T* ptr() const { - return shrdPtr.get()+offset; - } + T* ptr() { + return shrdPtr.get()+offset; + } + const T* ptr() const { + return shrdPtr.get()+offset; + } - size_t length() const { - return lngth; - } + T* begin() {return ptr();} + T* end() {return ptr() + length();} - size_t size() const { - return lngth*sizeof(T); - } + size_t length() const { + return lngth; + } - size_t use_count() const { - return shrdPtr.use_count(); - } + size_t size() const { + return lngth*sizeof(T); + } - T& operator[](const ptrdiff_t idx) const { - return shrdPtr[idx+offset]; - } + size_t use_count() const { + return shrdPtr.use_count(); + } - bool operator == (const libp::memory &other) const { - return (shrdPtr==other.shrdPtr && offset==other.offset); - } - bool operator != (const libp::memory &other) const { - return (shrdPtr!=other.shrdPtr || offset!=other.offset); - } + T& operator[](const ptrdiff_t idx) const { + return shrdPtr[idx+offset]; + } - libp::memory operator + (const ptrdiff_t offset_) const { - return slice(offset_); - } - libp::memory& operator += (const ptrdiff_t offset_) { - *this = slice(offset_); - return *this; - } + bool operator == (const memory &other) const { + return (shrdPtr==other.shrdPtr && offset==other.offset); + } + bool operator != (const memory &other) const { + return (shrdPtr!=other.shrdPtr || offset!=other.offset); + } - libp::memory slice(const ptrdiff_t offset_, - const ptrdiff_t count = -1) const { - libp::memory m(*this); - m.offset = offset + offset_; - m.lngth = (count==-1) - ? (lngth - offset_) - : count; - return m; - } + memory operator + (const ptrdiff_t offset_) const { + return slice(offset_); + } + memory& operator += (const ptrdiff_t offset_) { + *this = slice(offset_); + return *this; + } - /*Copy from raw ptr*/ - void copyFrom(const T* src, - const ptrdiff_t count = -1, - const ptrdiff_t offset_ = 0) { + memory slice(const ptrdiff_t offset_, + const ptrdiff_t count = -1) const { + memory m(*this); + m.offset = offset + offset_; + m.lngth = (count==-1) + ? (lngth - offset_) + : count; + return m; + } - const ptrdiff_t cnt = (count==-1) ? lngth : count; - std::copy(src, - src+cnt, - ptr()+offset_); - } + /*Copy from raw ptr*/ + void copyFrom(const T* src, + const ptrdiff_t count = -1, + const ptrdiff_t offset_ = 0) { + + const ptrdiff_t cnt = (count==-1) ? lngth : count; + + LIBP_ABORT("libp::memory::copyFrom Cannot have negative count (" + << cnt << ")", + cnt < 0); + LIBP_ABORT("libp::memory::copyFrom Cannot have negative offset (" + << offset_ << ")", + offset_ < 0); + LIBP_ABORT("libp::memory::copyFrom Destination memory has size [" << lngth << "]," + << " trying to access [" << offset_ << ", " << offset_+static_cast(cnt) << "]", + static_cast(cnt)+offset_ > lngth); + + std::copy(src, + src+cnt, + ptr()+offset_); + } - /*Copy from libp::memory*/ - void copyFrom(const libp::memory &src, - const ptrdiff_t count = -1, - const ptrdiff_t destOffset = 0, - const ptrdiff_t srcOffset = 0) { - const ptrdiff_t cnt = (count==-1) ? lngth : count; - std::copy(src.ptr()+srcOffset, - src.ptr()+srcOffset+cnt, - ptr()+destOffset); - } + /*Copy from memory*/ + void copyFrom(const memory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset_ = 0) { + const ptrdiff_t cnt = (count==-1) ? lngth : count; + + LIBP_ABORT("libp::memory::copyFrom Cannot have negative count (" + << cnt << ")", + cnt < 0); + LIBP_ABORT("libp::memory::copyFrom Cannot have negative offset (" + << offset_ << ")", + offset_ < 0); + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > src.length()); + LIBP_ABORT("libp::memory::copyFrom Destination memory has size [" << lngth << "]," + << " trying to access [" << offset_ << ", " << offset_+static_cast(cnt) << "]", + static_cast(cnt)+offset_ > lngth); + + std::copy(src.ptr(), + src.ptr()+cnt, + ptr()+offset_); + } - /*Copy from occa::memory*/ - void copyFrom(const occa::memory &src, - const ptrdiff_t count = -1, - const ptrdiff_t destOffset = 0, - const ptrdiff_t srcOffset = 0, - const occa::json &props = occa::json()) { - const ptrdiff_t cnt = (count==-1) ? lngth : count; - src.copyTo(ptr()+destOffset, - cnt*sizeof(T), - srcOffset, - props); - } + /*Copy to raw pointer*/ + void copyTo(T *dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset_ = 0) const { + const ptrdiff_t cnt = (count==-1) ? lngth : count; + + LIBP_ABORT("libp::memory::copyTo Cannot have negative count (" + << cnt << ")", + cnt < 0); + LIBP_ABORT("libp::memory::copyTo Cannot have negative offset (" + << offset_ << ")", + offset_ < 0); + LIBP_ABORT("libp::memory::copyTo Source memory has size [" << lngth << "]," + << " trying to access [" << offset_ << ", " << offset_+static_cast(cnt) << "]", + static_cast(cnt)+offset_ > lngth); + + std::copy(ptr()+offset_, + ptr()+offset_+cnt, + dest); + } - void copyFrom(const occa::memory &src, - const occa::json &props) { - src.copyTo(ptr(), - lngth*sizeof(T), - 0, - props); - } + /*Copy to memory*/ + void copyTo(memory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset_ = 0) const { + const ptrdiff_t cnt = (count==-1) ? lngth : count; + + LIBP_ABORT("libp::memory::copyTo Cannot have negative count (" + << cnt << ")", + cnt < 0); + LIBP_ABORT("libp::memory::copyTo Cannot have negative offset (" + << offset_ << ")", + offset_ < 0); + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << cnt << "]", + static_cast(cnt) > dest.length()); + LIBP_ABORT("libp::memory::copyTo Source memory has size [" << lngth << "]," + << " trying to access [" << offset_ << ", " << offset_+static_cast(cnt) << "]", + static_cast(cnt)+offset_ > lngth); + + std::copy(ptr()+offset_, + ptr()+offset_+cnt, + dest.ptr()); + } - /*Copy to raw pointer*/ - void copyTo(T *dest, - const ptrdiff_t count = -1, - const ptrdiff_t offset_ = 0) const { - const ptrdiff_t cnt = (count==-1) ? lngth : count; - std::copy(ptr()+offset_, - ptr()+offset_+cnt, - dest); - } + memory clone() const { + memory m(lngth); + m.copyFrom(*this); + return m; + } + + void free() { + shrdPtr = nullptr; + lngth=0; + offset=0; + } +}; + +template +std::ostream& operator << (std::ostream &out, + const memory &memory) { + out << "memory - " + << "type: " << typeid(T).name() << ", " + << "ptr : " << memory.ptr() << ", " + << "length : " << memory.length() << ", " + << "use_count : " << memory.use_count(); + return out; +} + +/*Extern declare common instantiations for faster compilation*/ +extern template class memory; +extern template class memory; +extern template class memory; +extern template class memory; + +/*libp::deviceMemory is a wrapper around occa::memory*/ +template +class deviceMemory: public occa::memory { + public: + deviceMemory() = default; + deviceMemory(const deviceMemory &m)=default; + deviceMemory(occa::memory m): + occa::memory(m) + { + if (isInitialized()) + occa::memory::setDtype(occa::dtype::get()); + } + + /*Conversion constructor*/ + template + deviceMemory(const deviceMemory &m): + occa::memory(m) + { + if (isInitialized()) + occa::memory::setDtype(occa::dtype::get()); + } + + deviceMemory& operator = (const deviceMemory &m)=default; + ~deviceMemory()=default; + + T* ptr() { + return static_cast(occa::memory::ptr()); + } + const T* ptr() const { + return static_cast(occa::memory::ptr()); + } + + T& operator[](const ptrdiff_t idx) { + return ptr()[idx]; + } + + deviceMemory operator + (const ptrdiff_t offset) const { + if (isInitialized()) + return deviceMemory(occa::memory::operator+(offset)); + else + return deviceMemory(); + } + + deviceMemory& operator += (const ptrdiff_t offset) { + *this = deviceMemory(occa::memory::slice(offset)); + return *this; + } - /*Copy to libp::memory*/ - void copyTo(libp::memory &dest, + /*Copy from libp::memory*/ + void copyFrom(const libp::memory src, const ptrdiff_t count = -1, - const ptrdiff_t destOffset = 0, - const ptrdiff_t srcOffset = 0) const { - const ptrdiff_t cnt = (count==-1) ? lngth : count; - std::copy(ptr()+srcOffset, - ptr()+srcOffset+cnt, - dest.ptr()+destOffset); - } + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > src.length()); + + occa::memory::copyFrom(src.ptr(), + cnt*sizeof(T), + offset*sizeof(T), + props); + } + + void copyFrom(const libp::memory src, + const properties_t &props) { + + if (length()==0) return; + + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << length() << "]", + length() > src.length()); + + occa::memory::copyFrom(src.ptr(), + length()*sizeof(T), + 0, + props); + } - /*Copy to occa::memory*/ - void copyTo(occa::memory &dest, + /*Copy from libp::deviceMemory*/ + void copyFrom(const deviceMemory src, const ptrdiff_t count = -1, - const ptrdiff_t destOffset = 0, - const ptrdiff_t srcOffset = 0, - const occa::json &props = occa::json()) const { - const ptrdiff_t cnt = (count==-1) ? lngth : count; - dest.copyFrom(ptr()+srcOffset, - cnt*sizeof(T), - destOffset, - props); - } + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? length() : count; - void copyTo(occa::memory &dest, - const occa::json &props) const { - dest.copyFrom(ptr(), - lngth*sizeof(T), - 0, - props); - } + if (cnt==0) return; + occa::memory::copyFrom(src, + cnt*sizeof(T), + offset*sizeof(T), + 0, + props); + } - libp::memory clone() const { - libp::memory m(lngth); - m.copyFrom(*this); - return m; - } + void copyFrom(const deviceMemory src, + const properties_t &props) { + if (length()==0) return; - void free() { - shrdPtr = nullptr; - lngth=0; - offset=0; - } - }; + occa::memory::copyFrom(src, + length()*sizeof(T), + 0, + 0, + props); + } + + /*Copy to libp::memory*/ + void copyTo(libp::memory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > dest.length()); - template - std::ostream& operator << (std::ostream &out, - const libp::memory &memory) { - out << "libp::memory - " - << "type: " << typeid(T).name() << ", " - << "ptr : " << memory.ptr() << ", " - << "length : " << memory.length() << ", " - << "use_count : " << memory.use_count(); - return out; + occa::memory::copyTo(dest.ptr(), + cnt*sizeof(T), + offset*sizeof(T), + props); } -} + + void copyTo(libp::memory dest, + const properties_t &props) const { + + if (length()==0) return; + + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << length() << "]", + length() > dest.length()); + + occa::memory::copyTo(dest.ptr(), + length()*sizeof(T), + 0, + props); + } + + /*Copy to libp::deviceMemory*/ + void copyTo(deviceMemory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyTo(dest, + cnt*sizeof(T), + 0, + offset*sizeof(T), + props); + } + + void copyTo(deviceMemory dest, + const properties_t &props) const { + + if (length()==0) return; + + occa::memory::copyTo(dest, + length()*sizeof(T), + 0, + 0, + props); + } +}; /*Extern declare common instantiations for faster compilation*/ -extern template class libp::memory; -extern template class libp::memory; -extern template class libp::memory; -extern template class libp::memory; +extern template class deviceMemory; +extern template class deviceMemory; +extern template class deviceMemory; +extern template class deviceMemory; + +/*libp::pinnedMemory is another wrapper around occa::memory, + but is allocated slightly differently*/ +template +class pinnedMemory: public occa::memory { + public: + pinnedMemory() = default; + pinnedMemory(const pinnedMemory &m)=default; + pinnedMemory(occa::memory m): + occa::memory(m) + { + if (isInitialized()) + occa::memory::setDtype(occa::dtype::get()); + }; + + /*Conversion constructor*/ + template + pinnedMemory(const pinnedMemory &m): + occa::memory(m) + { + if (isInitialized()) + occa::memory::setDtype(occa::dtype::get()); + } + + pinnedMemory& operator = (const pinnedMemory &m)=default; + ~pinnedMemory()=default; + + T* ptr() { + return static_cast(occa::memory::ptr()); + } + const T* ptr() const { + return static_cast(occa::memory::ptr()); + } + + T& operator[](const ptrdiff_t idx) { + return ptr()[idx]; + } + + pinnedMemory operator + (const ptrdiff_t offset) const { + if (isInitialized()) + return pinnedMemory(occa::memory::operator+(offset)); + else + return pinnedMemory(); + } + + pinnedMemory& operator += (const ptrdiff_t offset) { + *this = pinnedMemory(occa::memory::slice(offset)); + return *this; + } + + /*Copy from libp::memory*/ + void copyFrom(const libp::memory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > src.length()); + + occa::memory::copyFrom(src.ptr(), + cnt*sizeof(T), + offset*sizeof(T), + props); + } + + void copyFrom(const libp::memory src, + const properties_t &props) { + + if (length()==0) return; + + LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "]," + << " trying to access [0, " << length() << "]", + length() > src.length()); + + occa::memory::copyFrom(src.ptr(), + length()*sizeof(T), + 0, + props); + } + + /*Copy from libp::deviceMemory*/ + void copyFrom(const deviceMemory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyFrom(src, + cnt*sizeof(T), + offset*sizeof(T), + 0, + props); + } + + void copyFrom(const deviceMemory src, + const properties_t &props) { + + if (length()==0) return; + + occa::memory::copyFrom(src, + length()*sizeof(T), + 0, + 0, + props); + } + + /*Copy from libp::pinnedMemory*/ + void copyFrom(const pinnedMemory src, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyFrom(src, + cnt*sizeof(T), + offset*sizeof(T), + 0, + props); + } + + void copyFrom(const pinnedMemory src, + const properties_t &props) { + + if (length()==0) return; + + occa::memory::copyFrom(src, + length()*sizeof(T), + 0, + 0, + props); + } + + /*Copy to libp::memory*/ + void copyTo(libp::memory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << static_cast(cnt) << "]", + static_cast(cnt) > dest.length()); + + occa::memory::copyTo(dest.ptr(), + cnt*sizeof(T), + offset*sizeof(T), + props); + } + + void copyTo(libp::memory dest, + const properties_t &props) const { + + if (length()==0) return; + + LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "]," + << " trying to access [0, " << length() << "]", + length() > dest.length()); + + occa::memory::copyTo(dest.ptr(), + length()*sizeof(T), + 0, + props); + } + + /*Copy to libp::deviceMemory*/ + void copyTo(deviceMemory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyTo(dest, + cnt*sizeof(T), + 0, + offset*sizeof(T), + props); + } + + void copyTo(deviceMemory dest, + const properties_t &props) const { + + if (length()==0) return; + + occa::memory::copyTo(dest, + length()*sizeof(T), + 0, + 0, + props); + } + + /*Copy to libp::pinnedMemory*/ + void copyTo(pinnedMemory dest, + const ptrdiff_t count = -1, + const ptrdiff_t offset = 0, + const properties_t &props = properties_t()) const { + const ptrdiff_t cnt = (count==-1) ? length() : count; + + if (cnt==0) return; + + occa::memory::copyTo(dest, + cnt*sizeof(T), + 0, + offset*sizeof(T), + props); + } + + void copyTo(pinnedMemory dest, + const properties_t &props) const { + + if (length()==0) return; + + occa::memory::copyTo(dest, + length()*sizeof(T), + 0, + 0, + props); + } +}; + +} //namespace libp #endif From afad7b9eb156d009bee9ec42d7a8ab668b4992be Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 8 Mar 2022 16:13:28 -0600 Subject: [PATCH 07/12] Update linalg libs --- include/core.hpp | 19 +----------- include/linAlg.hpp | 53 ++++++++++++++++++++-------------- include/linearSolver.hpp | 34 ++++++++++++---------- libs/core/linAlg.cpp | 36 +++++++++++++---------- libs/core/linAlgSetup.cpp | 24 ++++----------- libs/core/linearSolverCG.cpp | 45 ++++++++++++++--------------- libs/core/matrixEig.cpp | 38 +++++++++--------------- libs/core/matrixInverse.cpp | 34 +++++++--------------- libs/core/matrixRightSolve.cpp | 20 +++++-------- 9 files changed, 130 insertions(+), 173 deletions(-) diff --git a/include/core.hpp b/include/core.hpp index 4eec1e5..cd02493 100644 --- a/include/core.hpp +++ b/include/core.hpp @@ -27,29 +27,12 @@ SOFTWARE. #ifndef CORE_HPP #define CORE_HPP -#include -#include -#include -#include -#include -#include #include "utils.hpp" #include "memory.hpp" +#include "comm.hpp" namespace libp { -void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB, double *B, double *C); -void matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB, float *B, float *C); - -void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI); -void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI); - -void matrixEigenValues(int N, double *A, double *WR, double *WI); -void matrixEigenValues(int N, float *A, float *WR, float *WI); - -void matrixInverse(int N, double *A); -void matrixInverse(int N, float *A); - void RankDecomp(int size_x, int size_y, int size_z, int &rank_x, int &rank_y, int &rank_z, const int rank); diff --git a/include/linAlg.hpp b/include/linAlg.hpp index 0e0fed2..b970226 100644 --- a/include/linAlg.hpp +++ b/include/linAlg.hpp @@ -28,7 +28,7 @@ SOFTWARE. #define LINALG_HPP #include "core.hpp" - +#include "memory.hpp" namespace libp { @@ -39,46 +39,57 @@ class linAlg_t { public: platform_t *platform; - occa::properties kernelInfo; + properties_t kernelInfo; int blocksize; //scratch space for reductions - dfloat *scratch; - occa::memory h_scratch; - occa::memory o_scratch; + deviceMemory o_scratch; + pinnedMemory h_scratch; linAlg_t(platform_t *_platform); //initialize list of kernels - void InitKernels(std::vector kernels, MPI_Comm comm); - - ~linAlg_t(); + void InitKernels(std::vector kernels); /*********************/ /* vector operations */ /*********************/ // o_x[n] = alpha - void set(const dlong N, const dfloat alpha, occa::memory& o_x); + void set(const dlong N, const dfloat alpha, deviceMemory o_x); // o_y[n] = beta*o_y[n] + alpha*o_x[n] - void axpy(const dlong N, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y); + void axpy(const dlong N, const dfloat alpha, deviceMemory o_x, + const dfloat beta, deviceMemory o_y); // ||o_a||_2 - dfloat norm2(const dlong N, occa::memory& o_a, MPI_Comm comm); + dfloat norm2(const dlong N, deviceMemory o_a, comm_t comm); // o_x.o_y - dfloat innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y, - MPI_Comm comm); - - occa::kernel setKernel; - occa::kernel axpyKernel; - occa::kernel norm2Kernel1; - occa::kernel norm2Kernel2; - occa::kernel innerProdKernel1; - occa::kernel innerProdKernel2; + dfloat innerProd(const dlong N, + deviceMemory o_x, + deviceMemory o_y, + comm_t comm); + + kernel_t setKernel; + kernel_t axpyKernel; + kernel_t norm2Kernel1; + kernel_t norm2Kernel2; + kernel_t innerProdKernel1; + kernel_t innerProdKernel2; + + static void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB, double *B, double *C); + static void matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB, float *B, float *C); + + static void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI); + static void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI); + + static void matrixEigenValues(int N, double *A, double *WR, double *WI); + static void matrixEigenValues(int N, float *A, float *WR, float *WI); + + static void matrixInverse(int N, double *A); + static void matrixInverse(int N, float *A); }; } //namespace libp diff --git a/include/linearSolver.hpp b/include/linearSolver.hpp index a25bdf6..81f0172 100644 --- a/include/linearSolver.hpp +++ b/include/linearSolver.hpp @@ -37,7 +37,7 @@ namespace libp { class linearSolver_t { public: platform_t platform; - MPI_Comm comm; + comm_t comm; dlong N; dlong Nhalo; @@ -47,33 +47,37 @@ class linearSolver_t { N(_N), Nhalo(_Nhalo) {} virtual int Solve(solver_t& solver, - occa::memory& o_x, occa::memory& o_rhs, - const dfloat tol, const int MAXIT, const int verbose)=0; - - virtual ~linearSolver_t(){} + deviceMemory o_x, + deviceMemory o_rhs, + const dfloat tol, + const int MAXIT, + const int verbose)=0; }; //Conjugate Gradient class cg: public linearSolver_t { private: - occa::memory o_p, o_Ap; + deviceMemory o_p, o_Ap; - dfloat* tmprdotr; - occa::memory h_tmprdotr; - occa::memory o_tmprdotr; + deviceMemory o_tmprdotr; + pinnedMemory h_tmprdotr; - occa::kernel updateCGKernel1; - occa::kernel updateCGKernel2; + kernel_t updateCGKernel1; + kernel_t updateCGKernel2; - dfloat UpdateCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r); + dfloat UpdateCG(const dfloat alpha, + deviceMemory o_x, + deviceMemory o_r); public: cg(platform_t& _platform, dlong _N, dlong _Nhalo); - ~cg(); int Solve(solver_t& solver, - occa::memory& o_x, occa::memory& o_rhs, - const dfloat tol, const int MAXIT, const int verbose); + deviceMemory o_x, + deviceMemory o_rhs, + const dfloat tol, + const int MAXIT, + const int verbose); }; } //namespace libp diff --git a/libs/core/linAlg.cpp b/libs/core/linAlg.cpp index 84ef116..fad2ff6 100644 --- a/libs/core/linAlg.cpp +++ b/libs/core/linAlg.cpp @@ -35,37 +35,43 @@ namespace libp { /*********************/ // o_x[n] = alpha -void linAlg_t::set(const dlong N, const dfloat alpha, occa::memory& o_x) { +void linAlg_t::set(const dlong N, const dfloat alpha, + deviceMemory o_x) { setKernel(N, alpha, o_x); } // o_y[n] = beta*o_y[n] + alpha*o_x[n] -void linAlg_t::axpy(const dlong N, const dfloat alpha, occa::memory& o_x, - const dfloat beta, occa::memory& o_y) { +void linAlg_t::axpy(const dlong N, + const dfloat alpha, + deviceMemory o_x, + const dfloat beta, + deviceMemory o_y) { axpyKernel(N, alpha, o_x, beta, o_y); } // ||o_a||_2 -dfloat linAlg_t::norm2(const dlong N, occa::memory& o_a, MPI_Comm comm) { - //TODO, maybe complete reduction on device with second kernel? +dfloat linAlg_t::norm2(const dlong N, + deviceMemory o_a, comm_t comm) { int Nblock = (N+blocksize-1)/blocksize; Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries norm2Kernel1(Nblock, N, o_a, o_scratch); norm2Kernel2(Nblock, o_scratch); - o_scratch.copyTo(scratch, 1*sizeof(dfloat), 0, "async: true"); - platform->device.finish(); + h_scratch.copyFrom(o_scratch, 1, 0, "async: true"); + platform->finish(); - dfloat norm = 0; - MPI_Allreduce(scratch, &norm, 1, MPI_DFLOAT, MPI_SUM, comm); + dfloat norm = h_scratch[0]; + comm.Allreduce(norm); return sqrt(norm); } // o_x.o_y -dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y, - MPI_Comm comm) { +dfloat linAlg_t::innerProd(const dlong N, + deviceMemory o_x, + deviceMemory o_y, + comm_t comm) { int Nblock = (N+blocksize-1)/blocksize; Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries @@ -73,11 +79,11 @@ dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y, innerProdKernel1(Nblock, N, o_x, o_y, o_scratch); innerProdKernel2(Nblock, o_scratch); - o_scratch.copyTo(scratch, 1*sizeof(dfloat), 0, "async: true"); - platform->device.finish(); + h_scratch.copyFrom(o_scratch, 1, 0, "async: true"); + platform->finish(); - dfloat dot = 0; - MPI_Allreduce(scratch, &dot, 1, MPI_DFLOAT, MPI_SUM, comm); + dfloat dot = h_scratch[0]; + comm.Allreduce(dot); return dot; } diff --git a/libs/core/linAlgSetup.cpp b/libs/core/linAlgSetup.cpp index ed96108..fbfabff 100644 --- a/libs/core/linAlgSetup.cpp +++ b/libs/core/linAlgSetup.cpp @@ -31,7 +31,6 @@ SOFTWARE. namespace libp { constexpr int LINALG_BLOCKSIZE = 256; -constexpr int AXPY_BLOCKSIZE = 1024; using std::string; using std::stringstream; @@ -45,13 +44,12 @@ linAlg_t::linAlg_t(platform_t *_platform): blocksize(LINALG_BLOCKSIZE) { kernelInfo["defines/" "p_blockSize"] = (int)LINALG_BLOCKSIZE; //pinned scratch buffer - scratch = (dfloat*) platform->hostMalloc(LINALG_BLOCKSIZE*sizeof(dfloat), - NULL, h_scratch); - o_scratch = platform->malloc(LINALG_BLOCKSIZE*sizeof(dfloat)); + h_scratch = platform->hostMalloc(LINALG_BLOCKSIZE); + o_scratch = platform->malloc(LINALG_BLOCKSIZE); } //initialize list of kernels -void linAlg_t::InitKernels(std::vector kernels, MPI_Comm comm) { +void linAlg_t::InitKernels(std::vector kernels) { for (size_t i=0;i kernels, MPI_Comm comm) { "set", kernelInfo); } else if (name=="axpy") { - occa::properties axpyKernelInfo = kernelInfo; if (axpyKernel.isInitialized()==false) axpyKernel = platform->buildKernel(HIPBONE_DIR "/libs/core/okl/" "linAlgAXPY.okl", "axpy", - axpyKernelInfo); + kernelInfo); } else if (name=="norm2") { if (norm2Kernel1.isInitialized()==false) norm2Kernel1 = platform->buildKernel(HIPBONE_DIR "/libs/core/okl/" @@ -93,20 +90,9 @@ void linAlg_t::InitKernels(std::vector kernels, MPI_Comm comm) { "innerProd_2", kernelInfo); } else { - stringstream ss; - ss << "Requested linAlg routine \"" << name << "\" not found"; - HIPBONE_ABORT(ss.str()); + LIBP_FORCE_ABORT("Requested linAlg routine \"" << name << "\" not found"); } } } -linAlg_t::~linAlg_t() { - setKernel.free(); - axpyKernel.free(); - norm2Kernel1.free(); - norm2Kernel2.free(); - innerProdKernel1.free(); - innerProdKernel2.free(); -} - } //namespace libp diff --git a/libs/core/linearSolverCG.cpp b/libs/core/linearSolverCG.cpp index e556651..6755883 100644 --- a/libs/core/linearSolverCG.cpp +++ b/libs/core/linearSolverCG.cpp @@ -37,18 +37,17 @@ cg::cg(platform_t& _platform, dlong _N, dlong _Nhalo): dlong Ntotal = N + Nhalo; /*aux variables */ - dfloat *dummy = (dfloat *) calloc(Ntotal,sizeof(dfloat)); //need this to avoid uninitialized memory warnings - o_p = platform.malloc(Ntotal*sizeof(dfloat),dummy); - o_Ap = platform.malloc(Ntotal*sizeof(dfloat),dummy); - free(dummy); + memory dummy(Ntotal,0.0); //need this to avoid uninitialized memory warnings + o_p = platform.malloc(Ntotal,dummy); + o_Ap = platform.malloc(Ntotal,dummy); + dummy.free(); //pinned tmp buffer for reductions - tmprdotr = (dfloat*) platform.hostMalloc(1*sizeof(dfloat), - NULL, h_tmprdotr); - o_tmprdotr = platform.malloc(CG_BLOCKSIZE*sizeof(dfloat)); + h_tmprdotr = platform.hostMalloc(1); + o_tmprdotr = platform.malloc(CG_BLOCKSIZE); /* build kernels */ - occa::properties kernelInfo = platform.props(); //copy base properties + properties_t kernelInfo = platform.props(); //copy base properties //add defines kernelInfo["defines/" "p_blockSize"] = (int)CG_BLOCKSIZE; @@ -61,10 +60,13 @@ cg::cg(platform_t& _platform, dlong _N, dlong _Nhalo): } int cg::Solve(solver_t& solver, - occa::memory &o_x, occa::memory &o_r, - const dfloat tol, const int MAXIT, const int verbose) { + deviceMemory o_x, + deviceMemory o_r, + const dfloat tol, + const int MAXIT, + const int verbose) { - int rank = platform.rank; + int rank = platform.rank(); linAlg_t &linAlg = platform.linAlg(); // register scalars @@ -82,7 +84,7 @@ int cg::Solve(solver_t& solver, rdotr = linAlg.norm2(N, o_r, comm); rdotr = rdotr*rdotr; - dfloat TOL = mymax(tol*tol*rdotr,tol*tol); + dfloat TOL = std::max(tol*tol*rdotr,tol*tol); if (verbose&&(rank==0)) printf("CG: initial res norm %12.12f \n", sqrt(rdotr)); @@ -126,7 +128,9 @@ int cg::Solve(solver_t& solver, return iter; } -dfloat cg::UpdateCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r){ +dfloat cg::UpdateCG(const dfloat alpha, + deviceMemory o_x, + deviceMemory o_r){ linAlg_t &linAlg = platform.linAlg(); @@ -138,24 +142,17 @@ dfloat cg::UpdateCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r){ updateCGKernel1(N, Nblocks, o_Ap, alpha, o_r, o_tmprdotr); updateCGKernel2(Nblocks, o_tmprdotr); - o_tmprdotr.copyTo(tmprdotr, 1*sizeof(dfloat), 0, "async: true"); - - platform.device.finish(); + h_tmprdotr.copyFrom(o_tmprdotr, 1, 0, "async: true"); + platform.finish(); // x <= x + alpha*p linAlg.axpy(N, alpha, o_p, 1.f, o_x); /*Compute all reduce while axpy is running*/ - dfloat rdotr1 = 0.0; - MPI_Allreduce(tmprdotr, &rdotr1, 1, MPI_DFLOAT, MPI_SUM, comm); + dfloat rdotr1 = h_tmprdotr[0]; + comm.Allreduce(rdotr1); return rdotr1; } -cg::~cg() { - updateCGKernel1.free(); - updateCGKernel2.free(); - -} - } //namespace libp diff --git a/libs/core/matrixEig.cpp b/libs/core/matrixEig.cpp index 1219f3b..fdc084d 100644 --- a/libs/core/matrixEig.cpp +++ b/libs/core/matrixEig.cpp @@ -24,7 +24,7 @@ SOFTWARE. */ -#include "core.hpp" +#include "linAlg.hpp" extern "C" { void sgeev_(char *JOBVL, char *JOBVR, int *N, float *A, int *LDA, float *WR, float *WI, @@ -36,7 +36,7 @@ extern "C" { namespace libp { // compute right eigenvectors -void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI){ +void linAlg_t::matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI){ char JOBVL = 'N'; char JOBVR = 'V'; @@ -62,11 +62,8 @@ void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI){ dgeev_ (&JOBVL, &JOBVR, &N, tmpA, &LDA, WR, WI, VL, &LDVL, tmpVR, &LDVR, WORK, &LWORK, &INFO); - if(INFO) { - std::stringstream ss; - ss << "dgeev_ reports info = " << INFO; - HIPBONE_ABORT(ss.str()); - } + LIBP_ABORT("dgeev_ reports info = " << INFO, + INFO); for(int n=0;n iplatform; - - occa::device device; std::shared_ptr ilinAlg; - int rank=0, size=0; + public: + comm_t comm; + device_t device; platform_t()=default; @@ -74,9 +75,6 @@ class platform_t { comm = settings.comm; - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &size); - DeviceConfig(); DeviceProperties(); @@ -91,64 +89,58 @@ class platform_t { } void assertInitialized() { - if(!isInitialized()) { - HIPBONE_ABORT("Platform not initialized."); - } + LIBP_ABORT("Platform not initialized.", + !isInitialized()); } - occa::kernel buildKernel(std::string fileName, std::string kernelName, - occa::properties& kernelInfo); - - occa::memory malloc(const size_t bytes, - const void *src = NULL, - const occa::properties &prop = occa::properties()) { - assertInitialized(); - return device.malloc(bytes, src, prop); - } + kernel_t buildKernel(std::string fileName, std::string kernelName, + properties_t& kernelInfo); - occa::memory malloc(const size_t bytes, - const occa::memory &src, - const occa::properties &prop = occa::properties()) { + template + deviceMemory malloc(const size_t count, + const properties_t &prop = properties_t()) { assertInitialized(); - return device.malloc(bytes, src, prop); + return deviceMemory(device.malloc(count, prop)); } - occa::memory malloc(const size_t bytes, - const occa::properties &prop) { + template + deviceMemory malloc(const size_t count, + const memory src, + const properties_t &prop = properties_t()) { assertInitialized(); - return device.malloc(bytes, prop); + return deviceMemory(device.malloc(count, src.ptr(), prop)); } template - occa::memory malloc(const size_t count, - const occa::properties &prop = occa::properties()) { + deviceMemory malloc(const memory src, + const properties_t &prop = properties_t()) { assertInitialized(); - return device.malloc(count*sizeof(T), prop); + return deviceMemory(device.malloc(src.length(), src.ptr(), prop)); } template - occa::memory malloc(const size_t count, - const libp::memory &src, - const occa::properties &prop) { + pinnedMemory hostMalloc(const size_t count){ assertInitialized(); - return device.malloc(count*sizeof(T), src.ptr(), prop); + properties_t hostProp; + hostProp["host"] = true; + return pinnedMemory(device.malloc(count, nullptr, hostProp)); } template - occa::memory malloc(const libp::memory &src, - const occa::properties &prop = occa::properties()) { + pinnedMemory hostMalloc(const size_t count, + const memory src){ assertInitialized(); - return device.malloc(src.length()*sizeof(T), src.ptr(), prop); + properties_t hostProp; + hostProp["host"] = true; + return pinnedMemory(device.malloc(count, src.ptr(), hostProp)); } - void *hostMalloc(const size_t bytes, - const void *src, - occa::memory &h_mem){ + template + pinnedMemory hostMalloc(const memory src){ assertInitialized(); - occa::properties hostProp; + properties_t hostProp; hostProp["host"] = true; - h_mem = device.malloc(bytes, src, hostProp); - return h_mem.ptr(); + return pinnedMemory(device.malloc(src.length(), src.ptr(), hostProp)); } linAlg_t& linAlg() { @@ -161,12 +153,32 @@ class platform_t { return iplatform->settings; } - occa::properties& props() { + properties_t& props() { assertInitialized(); return iplatform->props; } -private: + void finish() { + device.finish(); + } + + const int rank() const { + return comm.rank(); + } + + const int size() const { + return comm.size(); + } + + int getDeviceCount(const std::string mode) { + return occa::getDeviceCount(mode); + } + + void setCacheDir(const std::string cacheDir) { + occa::env::setOccaCacheDir(cacheDir); + } + + private: void DeviceConfig(); void DeviceProperties(); @@ -174,4 +186,4 @@ class platform_t { } //namespace libp -#endif \ No newline at end of file +#endif diff --git a/include/settings.hpp b/include/settings.hpp index 52dea68..6320a2b 100644 --- a/include/settings.hpp +++ b/include/settings.hpp @@ -92,12 +92,12 @@ class settings_t { std::vector insertOrder; public: - MPI_Comm comm; + comm_t comm; std::map settings; settings_t() = delete; - settings_t(MPI_Comm _comm); + settings_t(comm_t _comm); //copy settings_t(const settings_t& other)=default; @@ -120,9 +120,7 @@ class settings_t { const setting_t& val = search->second; value = val.getVal(); } else { - stringstream ss; - ss << "Unable to find setting: [" << name << "]"; - HIPBONE_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unable to find setting: [" << name << "]"); } } diff --git a/include/solver.hpp b/include/solver.hpp index 3313758..b878a95 100644 --- a/include/solver.hpp +++ b/include/solver.hpp @@ -38,17 +38,16 @@ class solver_t { solver_t()=default; solver_t(platform_t& _platform): platform(_platform) {} - virtual ~solver_t(){} settings_t& settings() { return platform.settings(); } virtual void Run()=0; - virtual void Operator(occa::memory& o_q, occa::memory& o_Aq) { - HIPBONE_ABORT(std::string("Operator not implemented in this solver")) + virtual void Operator(deviceMemory& o_q, deviceMemory& o_Aq) { + LIBP_FORCE_ABORT("Operator not implemented in this solver"); } }; } //namespace libp -#endif \ No newline at end of file +#endif diff --git a/libs/core/memory.cpp b/libs/core/memory.cpp index 90ee0f3..3df102e 100644 --- a/libs/core/memory.cpp +++ b/libs/core/memory.cpp @@ -24,12 +24,18 @@ SOFTWARE. */ -#include "core.hpp" +#include "memory.hpp" namespace libp { - /*explicit instantiation of common specializations*/ - template class memory; - template class memory; - template class memory; - template class memory; +/*explicit instantiation of common specializations*/ +template class memory; +template class memory; +template class memory; +template class memory; + +/*explicit instantiation of common specializations*/ +template class deviceMemory; +template class deviceMemory; +template class deviceMemory; +template class deviceMemory; } //namespace libp diff --git a/libs/core/platformBuildKernel.cpp b/libs/core/platformBuildKernel.cpp index e54c5a4..e992758 100644 --- a/libs/core/platformBuildKernel.cpp +++ b/libs/core/platformBuildKernel.cpp @@ -2,7 +2,7 @@ The MIT License (MIT) -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus +Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -28,26 +28,27 @@ SOFTWARE. namespace libp { -occa::kernel platform_t::buildKernel(std::string fileName, std::string kernelName, - occa::properties& kernelInfo){ +kernel_t platform_t::buildKernel(std::string fileName, + std::string kernelName, + properties_t& kernelInfo){ assertInitialized(); - occa::kernel kernel; + kernel_t kernel; //build on root first - if (!rank) + if (!rank()) kernel = device.buildKernel(fileName, kernelName, kernelInfo); - MPI_Barrier(comm); + comm.Barrier(); //remaining ranks find the cached version (ideally) - if (rank) + if (rank()) kernel = device.buildKernel(fileName, kernelName, kernelInfo); - MPI_Barrier(comm); + comm.Barrier(); return kernel; } -} //namespace libp \ No newline at end of file +} //namespace libp diff --git a/libs/core/platformDeviceConfig.cpp b/libs/core/platformDeviceConfig.cpp index 8a5cd20..656d313 100644 --- a/libs/core/platformDeviceConfig.cpp +++ b/libs/core/platformDeviceConfig.cpp @@ -33,22 +33,20 @@ namespace libp { void platform_t::DeviceConfig(){ //find out how many ranks and devices are on this system - char* hostnames = (char *) ::malloc(size*sizeof(char)*MPI_MAX_PROCESSOR_NAME); - char* hostname = hostnames+rank*MPI_MAX_PROCESSOR_NAME; + memory hostnames(size()*MAX_PROCESSOR_NAME); + memory hostname = hostnames + rank()*MAX_PROCESSOR_NAME; int namelen; - MPI_Get_processor_name(hostname,&namelen); - - MPI_Allgather(MPI_IN_PLACE , MPI_MAX_PROCESSOR_NAME, MPI_CHAR, - hostnames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, MPI_COMM_WORLD); + comm_t::GetProcessorName(hostname.ptr(), namelen); + comm.Allgather(hostnames, MAX_PROCESSOR_NAME); int localRank = 0; int localSize = 0; - for (int n=0; n0 && localRank>=deviceCount) { - std::stringstream ss; - ss << "Rank " << rank << " oversubscribing device " << device_id%deviceCount << " on node \"" << hostname<< "\""; - HIPBONE_WARNING(ss.str()); + LIBP_FORCE_WARNING("Rank " << rank() << " oversubscribing device " << device_id%deviceCount << " on node \"" << hostname.ptr()<< "\""); device_id = device_id%deviceCount; } } @@ -108,19 +104,18 @@ void platform_t::DeviceConfig(){ /*Use lscpu to determine core and socket counts */ FILE *pipeCores = popen("lscpu | grep \"Core(s) per socket\" | awk '{print $4}'", "r"); FILE *pipeSockets = popen("lscpu | grep \"Socket(s)\" | awk '{print $2}'", "r"); - if (!pipeCores || !pipeSockets) { - HIPBONE_ABORT("popen() failed!"); - } + LIBP_ABORT("popen() failed!", + !pipeCores || !pipeSockets); std::array buffer; - if (!fgets(buffer.data(), buffer.size(), pipeCores)) { //read to end of line - HIPBONE_ABORT("Error reading core count") - } + //read to end of line + LIBP_ABORT("Error reading core count", + !fgets(buffer.data(), buffer.size(), pipeCores)); int Ncores = std::stoi(buffer.data()); - if (!fgets(buffer.data(), buffer.size(), pipeSockets)) { //read to end of line - HIPBONE_ABORT("Error reading core count") - } + //read to end of line + LIBP_ABORT("Error reading core count", + !fgets(buffer.data(), buffer.size(), pipeSockets)); int Nsockets = std::stoi(buffer.data()); pclose(pipeCores); @@ -130,7 +125,7 @@ void platform_t::DeviceConfig(){ int NcoresPerNode = Ncores*Nsockets; int Nthreads=0; -#if !defined(HIPBONE_DEBUG) +#if !defined(LIBP_DEBUG) /*Check OMP_NUM_THREADS env variable*/ std::string ompNumThreads; char * ompEnvVar = std::getenv("OMP_NUM_THREADS"); @@ -146,11 +141,8 @@ void platform_t::DeviceConfig(){ Nthreads = std::stoi(ompNumThreads); } } - if (Nthreads*localSize>NcoresPerNode) { - std::stringstream ss; - ss << "Rank " << rank << " oversubscribing CPU on node \"" << hostname<< "\""; - HIPBONE_WARNING(ss.str()); - } + LIBP_WARNING("Rank " << rank() << " oversubscribing CPU on node \"" << hostname.ptr()<< "\"", + Nthreads*localSize>NcoresPerNode); omp_set_num_threads(Nthreads); // omp_set_num_threads(1); @@ -161,25 +153,24 @@ void platform_t::DeviceConfig(){ device.setup(mode); - std::string occaCacheDir; + std::string cacheDir; char * cacheEnvVar = std::getenv("HIPBONE_CACHE_DIR"); if (cacheEnvVar == nullptr) { // Environment variable is not set - occaCacheDir = HIPBONE_DIR "/.occa"; + cacheDir = HIPBONE_DIR "/.occa"; } else { // Environmet variable is set, but could be empty string - occaCacheDir = cacheEnvVar; + cacheDir = cacheEnvVar; - if (occaCacheDir.size() == 0) { + if (cacheDir.size() == 0) { // Environment variable is set but equal to empty string - occaCacheDir = HIPBONE_DIR "/.occa"; + cacheDir = HIPBONE_DIR "/.occa"; } } - occa::env::setOccaCacheDir(occaCacheDir); + setCacheDir(cacheDir); - MPI_Barrier(MPI_COMM_WORLD); - free(hostnames); + comm.Barrier(); } } //namespace libp diff --git a/libs/core/platformProperties.cpp b/libs/core/platformProperties.cpp index a6a41ff..91fe56a 100644 --- a/libs/core/platformProperties.cpp +++ b/libs/core/platformProperties.cpp @@ -31,7 +31,7 @@ namespace libp { //initialize occa::properties with common props void platform_t::DeviceProperties(){ - occa::properties& Props = props(); + properties_t& Props = props(); Props["defines"].asObject(); Props["includes"].asArray(); diff --git a/libs/core/platformSettings.cpp b/libs/core/platformSettings.cpp index 9b63e65..f8da166 100644 --- a/libs/core/platformSettings.cpp +++ b/libs/core/platformSettings.cpp @@ -54,9 +54,7 @@ void platformReportSettings(settings_t& settings) { if (settings.compareSetting("THREAD MODEL","OpenCL")) settings.reportSetting("PLATFORM NUMBER"); - int size; - MPI_Comm_size(MPI_COMM_WORLD, &size); - if ((size==1) + if ((settings.comm.size()==1) &&(settings.compareSetting("THREAD MODEL","CUDA") ||settings.compareSetting("THREAD MODEL","HIP") ||settings.compareSetting("THREAD MODEL","OpenCL") )) diff --git a/libs/core/settings.cpp b/libs/core/settings.cpp index 4315c0f..4a6b9b9 100644 --- a/libs/core/settings.cpp +++ b/libs/core/settings.cpp @@ -55,7 +55,7 @@ void setting_t::updateVal(const string newVal){ << "Possible values are: { "; for (size_t i=0;isecond; - if (!setting.shortkey.compare(shortkey)) { - stringstream ss; - ss << "Setting with key: [" << shortkey << "] already exists."; - HIPBONE_ABORT(ss.str()); - } - if (!setting.longkey.compare(longkey)) { - stringstream ss; - ss << "Setting with key: [" << longkey << "] already exists."; - HIPBONE_ABORT(ss.str()); - } + LIBP_ABORT("Setting with key: [" << shortkey << "] already exists.", + !setting.shortkey.compare(shortkey)); + LIBP_ABORT("Setting with key: [" << longkey << "] already exists.", + !setting.longkey.compare(longkey)); } auto search = settings.find(name); @@ -132,9 +126,7 @@ void settings_t::newSetting(const string shortkey, const string longkey, settings[name] = setting_t(shortkey, longkey, name, val, description, options); insertOrder.push_back(name); } else { - stringstream ss; - ss << "Setting with name: [" << name << "] already exists."; - HIPBONE_ABORT(ss.str()); + LIBP_FORCE_ABORT("Setting with name: [" << name << "] already exists."); } } @@ -144,20 +136,19 @@ void settings_t::changeSetting(const string name, const string newVal) { setting_t& val = search->second; val.updateVal(newVal); } else { - stringstream ss; - ss << "Setting with name: [" << name << "] does not exist."; - HIPBONE_ABORT(ss.str()); + LIBP_FORCE_ABORT("Setting with name: [" << name << "] does not exist."); } } void settings_t::parseSettings(const int argc, char** argv) { for (int i = 1; i < argc; ) { - if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) - { - PrintUsage(); - MPI_Abort(MPI_COMM_WORLD,HIPBONE_ERROR); - return; + if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { + if (comm.rank()==0) PrintUsage(); + comm.Barrier(); + comm_t::Finalize(); + std::exit(LIBP_SUCCESS); + return; } for(auto it = settings.begin(); it != settings.end(); ++it) { @@ -165,9 +156,7 @@ void settings_t::parseSettings(const int argc, char** argv) { if (strcmp(argv[i], setting.shortkey.c_str()) == 0 || strcmp(argv[i], setting.longkey.c_str()) == 0) { if (setting.check!=0) { - stringstream ss; - ss << "Cannot set setting [" << setting.name << "] twice in run command."; - HIPBONE_ABORT(ss.str()); + LIBP_FORCE_ABORT("Cannot set setting [" << setting.name << "] twice in run command."); } else { if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) { @@ -191,9 +180,7 @@ string settings_t::getSetting(const string name) const { const setting_t& val = search->second; return val.getVal(); } else { - stringstream ss; - ss << "Unable to find setting: [" << name << "]"; - HIPBONE_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unable to find setting: [" << name << "]"); return string(); } } @@ -204,9 +191,7 @@ bool settings_t::compareSetting(const string name, const string token) const { const setting_t& val = search->second; return val.compareVal(token); } else { - stringstream ss; - ss << "Unable to find setting: [" << name.c_str() << "]"; - HIPBONE_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unable to find setting: [" << name.c_str() << "]"); return false; } } @@ -226,9 +211,7 @@ void settings_t::reportSetting(const string name) const { const setting_t& val = search->second; std::cout << val << std::endl; } else { - stringstream ss; - ss << "Unable to find setting: [" << name.c_str() << "]"; - HIPBONE_ABORT(ss.str()); + LIBP_FORCE_ABORT("Unable to find setting: [" << name.c_str() << "]"); } } From 3ef8ede9bb8c43624dcff73650ca9c67adace67f Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 8 Mar 2022 16:14:55 -0600 Subject: [PATCH 09/12] Update mesh lib --- include/mesh.hpp | 129 ++++++++++---------- libs/mesh/makefile | 84 ------------- libs/mesh/mesh.cpp | 8 +- libs/mesh/meshBasis1D.cpp | 30 ++--- libs/mesh/meshBasisHex3D.cpp | 148 ++++++++++++++++++++++- libs/mesh/meshConnect.cpp | 69 +++-------- libs/mesh/meshConnectFaceNodes.cpp | 162 +------------------------- libs/mesh/meshConnectFaceVertices.cpp | 2 +- libs/mesh/meshConnectNodes.cpp | 26 ++--- libs/mesh/meshGatherScatterSetup.cpp | 42 +++---- libs/mesh/meshHaloSetup.cpp | 12 +- libs/mesh/meshOccaSetup.cpp | 15 +-- libs/mesh/meshPhysicalNodes.cpp | 8 +- libs/mesh/meshSetupBox.cpp | 13 +-- 14 files changed, 310 insertions(+), 438 deletions(-) delete mode 100644 libs/mesh/makefile diff --git a/include/mesh.hpp b/include/mesh.hpp index f8de21f..65c2ed2 100644 --- a/include/mesh.hpp +++ b/include/mesh.hpp @@ -28,6 +28,7 @@ SOFTWARE. #define MESH_HPP 1 #include "core.hpp" +#include "comm.hpp" #include "settings.hpp" #include "ogs.hpp" @@ -37,90 +38,98 @@ class mesh_t { public: platform_t platform; - occa::properties props; + properties_t props; - MPI_Comm comm; + comm_t comm; int rank, size; + /*************************/ + /* Element Data */ + /*************************/ int dim; int Nverts, Nfaces, NfaceVertices; - hlong Nnodes; //global number of element vertices - libp::memory EX; // coordinates of vertices for each element - libp::memory EY; - libp::memory EZ; + // indices of vertex nodes + memory vertexNodes; + + hlong Nnodes=0; //global number of element vertices + memory EX; // coordinates of vertices for each element + memory EY; + memory EZ; dlong Nelements; //local element count hlong NelementsGlobal; //global element count - libp::memory EToV; // element-to-vertex connectivity - libp::memory EToE; // element-to-element connectivity - libp::memory EToF; // element-to-(local)face connectivity - libp::memory EToP; // element-to-partition/process connectivity + memory EToV; // element-to-vertex connectivity + memory EToE; // element-to-element connectivity + memory EToF; // element-to-(local)face connectivity + memory EToP; // element-to-partition/process connectivity + + memory VmapM; // list of vertices on each face + memory VmapP; // list of vertices that are paired with face vertices + + /*************************/ + /* FEM Space */ + /*************************/ + int N=0, Np=0; // N = Polynomial order and Np = Nodes per element + memory r, s, t; // coordinates of local nodes + + int Nq=0; // N = Polynomial order, Nq=N+1 + memory gllz; // 1D GLL quadrature nodes + memory gllw; // 1D GLL quadrature weights + memory D; // 1D differentiation matrix (for tensor-product) + deviceMemory o_D; - libp::memory VmapM; // list of vertices on each face - libp::memory VmapP; // list of vertices that are paired with face vertices + // face node info + int Nfp=0; // number of nodes per face + memory faceNodes; // list of element reference interpolation nodes on element faces + memory vmapM; // list of volume nodes that are face nodes + memory vmapP; // list of volume nodes that are paired with face nodes + memory faceVertices; // list of mesh vertices on each face + + /*************************/ + /* Physical Space */ + /*************************/ + // volume node info + memory x, y, z; // coordinates of physical nodes + // second order volume geometric factors + dlong Nggeo; + memory ggeo; + deviceMemory o_ggeo; + /*************************/ + /* MPI Data */ + /*************************/ // MPI halo exchange info - ogs::halo_t halo; // halo exchange pointer - dlong NinternalElements; // number of elements that can update without halo exchange - dlong NhaloElements; // number of elements that cannot update without halo exchange - dlong totalHaloPairs; // number of elements to be received in halo exchange - libp::memory internalElementIds; // list of elements that can update without halo exchange - libp::memory haloElementIds; // list of elements to be sent in halo exchange - occa::memory o_internalElementIds; // list of elements that can update without halo exchange - occa::memory o_haloElementIds; // list of elements to be sent in halo exchange + ogs::halo_t halo; // halo exchange pointer + dlong NinternalElements; // number of elements that can update without halo exchange + dlong NhaloElements; // number of elements that cannot update without halo exchange + dlong totalHaloPairs; // number of elements to be received in halo exchange + memory internalElementIds; // list of elements that can update without halo exchange + memory haloElementIds; // list of elements to be sent in halo exchange + deviceMemory o_internalElementIds; // list of elements that can update without halo exchange + deviceMemory o_haloElementIds; // list of elements to be sent in halo exchange // CG gather-scatter info ogs::ogs_t ogsMasked; ogs::halo_t gHalo; - libp::memory globalIds, maskedGlobalIds, maskedGlobalNumbering; + memory globalIds, maskedGlobalIds, maskedGlobalNumbering; dlong Nmasked; - libp::memory GlobalToLocal; - occa::memory o_GlobalToLocal; + memory GlobalToLocal; + deviceMemory o_GlobalToLocal; // list of elements that are needed for global gather-scatter dlong NglobalGatherElements; - libp::memory globalGatherElementList; - occa::memory o_globalGatherElementList; + memory globalGatherElementList; + deviceMemory o_globalGatherElementList; // list of elements that are not needed for global gather-scatter dlong NlocalGatherElements; - libp::memory localGatherElementList; - occa::memory o_localGatherElementList; - - - libp::memory weight, weightG; - occa::memory o_weight, o_weightG; - - // second order volume geometric factors - dlong Nggeo; - libp::memory ggeo; - - // volume node info - int N, Nq, Np; - libp::memory r, s, t; // coordinates of local nodes - libp::memory x, y, z; // coordinates of physical nodes - - // indices of vertex nodes - libp::memory vertexNodes; - - libp::memory D; // 1D differentiation matrix (for tensor-product) - libp::memory gllz; // 1D GLL quadrature nodes - libp::memory gllw; // 1D GLL quadrature weights - - // face node info - int Nfp; // number of nodes per face - libp::memory faceNodes; // list of element reference interpolation nodes on element faces - libp::memory vmapM; // list of volume nodes that are face nodes - libp::memory vmapP; // list of volume nodes that are paired with face nodes - libp::memory faceVertices; // list of mesh vertices on each face - - // occa stuff - occa::stream defaultStream; + memory localGatherElementList; + deviceMemory o_localGatherElementList; - occa::memory o_D; // tensor product differentiation matrix (for Hexes) - occa::memory o_ggeo; // second order geometric factors + memory weight, weightG; + deviceMemory o_weight, o_weightG; mesh_t()=default; mesh_t(platform_t& _platform) { @@ -186,6 +195,8 @@ class mesh_t { void NodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[]); void FaceNodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[], int _faceNodes[]); void VertexNodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[], int _vertexNodes[]); + void FaceNodeMatchingHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[], + int _faceNodes[], int R[]); /* offsets for second order geometric factors */ static constexpr int GWJID=0; diff --git a/libs/mesh/makefile b/libs/mesh/makefile deleted file mode 100644 index 2a40fce..0000000 --- a/libs/mesh/makefile +++ /dev/null @@ -1,84 +0,0 @@ -##################################################################################### -# -#The MIT License (MIT) -# -#Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus -# -#Permission is hereby granted, free of charge, to any person obtaining a copy -#of this software and associated documentation files (the "Software"), to deal -#in the Software without restriction, including without limitation the rights -#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -#copies of the Software, and to permit persons to whom the Software is -#furnished to do so, subject to the following conditions: -# -#The above copyright notice and this permission notice shall be included in all -#copies or substantial portions of the Software. -# -#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -#SOFTWARE. -# -##################################################################################### - -ifndef HIPBONE_MAKETOP_LOADED -ifeq (,$(wildcard ../../make.top)) -$(error cannot locate ${PWD}/../../make.top) -else -include ../../make.top -endif -endif - -LIBMESH_DIR=${HIPBONE_CORE_DIR}/mesh - -#includes -INCLUDES=${HIPBONE_INCLUDES} \ - -I${LIBMESH_DIR}/include - -#defines -DEFINES =${HIPBONE_DEFINES} \ - -DHIPBONE_DIR='"${HIPBONE_DIR}"' \ - -#.cpp compilation flags -MESH_CXXFLAGS=${HIPBONE_CXXFLAGS} ${DEFINES} ${INCLUDES} - -#object dependancies -DEPS=$(wildcard $(HIPBONE_INCLUDE_DIR)/*.h) \ - $(wildcard ${LIBMESH_DIR}/include/*.hpp) \ - $(HIPBONE_INCLUDE_DIR)/mesh.hpp - -SRC =$(wildcard *.cpp) -OBJS=$(SRC:.cpp=.o) - -.PHONY: all lib clean realclean silentUpdate - -all: lib - -lib: ../libmesh.a silentUpdate - -../libmesh.a: $(OBJS) -ifneq (,${verbose}) - ar -cr ../libmesh.a $(OBJS) -else - @printf "%b" "$(LIB_COLOR)Building library $(@F)$(NO_COLOR)\n"; - @ar -cr ../libmesh.a $(OBJS) -endif - -silentUpdate: - @true - -# rule for .cpp files -%.o: %.cpp $(DEPS) -ifneq (,${verbose}) - $(HIPBONE_CXX) -o $*.o -c $*.cpp $(MESH_CXXFLAGS) -else - @printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n"; - @$(HIPBONE_CXX) -o $*.o -c $*.cpp $(MESH_CXXFLAGS) -endif - -#cleanup -clean: - rm -f *.o ../libmesh.a diff --git a/libs/mesh/mesh.cpp b/libs/mesh/mesh.cpp index 5418b17..8e54a2d 100644 --- a/libs/mesh/mesh.cpp +++ b/libs/mesh/mesh.cpp @@ -32,9 +32,9 @@ void mesh_t::Setup(platform_t& _platform) { platform = _platform; - comm = platform.comm; - MPI_Comm_rank(platform.comm, &rank); - MPI_Comm_size(platform.comm, &size); + comm = platform.comm.Dup(); + rank = comm.rank(); + size = comm.size(); platform.settings().getSetting("POLYNOMIAL DEGREE", N); @@ -83,4 +83,4 @@ void mesh_t::Setup(platform_t& _platform) { OccaSetup(); } -} //namespace libp \ No newline at end of file +} //namespace libp diff --git a/libs/mesh/meshBasis1D.cpp b/libs/mesh/meshBasis1D.cpp index da8e475..b8eca24 100644 --- a/libs/mesh/meshBasis1D.cpp +++ b/libs/mesh/meshBasis1D.cpp @@ -92,21 +92,21 @@ void mesh_t::MassMatrix1D(int _Np, dfloat V[], dfloat _MM[]){ _MM[n*_Np + m] = res; } } - matrixInverse(_Np, _MM); + linAlg_t::matrixInverse(_Np, _MM); } void mesh_t::Dmatrix1D(int _N, int Npoints, dfloat _r[], dfloat _Dr[]){ int _Np = _N+1; - libp::memory V(Npoints*_Np); - libp::memory Vr(Npoints*_Np); + memory V(Npoints*_Np); + memory Vr(Npoints*_Np); Vandermonde1D(_N, Npoints, _r, V.ptr()); GradVandermonde1D(_N, Npoints, _r, Vr.ptr()); //D = Vr/V - matrixRightSolve(_Np, _Np, Vr.ptr(), _Np, _Np, V.ptr(), _Dr); + linAlg_t::matrixRightSolve(_Np, _Np, Vr.ptr(), _Np, _Np, V.ptr(), _Dr); } // ------------------------------------------------------------------------ @@ -122,7 +122,7 @@ dfloat mesh_t::JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){ dfloat ax = a; - libp::memory P(_N+1); + memory P(_N+1); // Zero order dfloat gamma0 = pow(2,(alpha+beta+1))/(alpha+beta+1)*mygamma(1+alpha)*mygamma(1+beta)/mygamma(1+alpha+beta); @@ -172,14 +172,14 @@ void mesh_t::JacobiGLL(int _N, dfloat _x[], dfloat w[]){ _x[_N] = 1.; if(_N>1){ - libp::memory wtmp(_N-1); + memory wtmp(_N-1); JacobiGQ(1,1, _N-2, _x+1, wtmp.ptr()); } if (w!=nullptr) { int _Np = _N+1; - libp::memory _MM(_Np*_Np); - libp::memory V(_Np*_Np); + memory _MM(_Np*_Np); + memory V(_Np*_Np); Vandermonde1D(_N, _N+1, _x, V.ptr()); MassMatrix1D(_N+1, V.ptr(), _MM.ptr()); @@ -210,8 +210,8 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat _x[], dfloat w[] } // Form symmetric matrix from recurrence. - libp::memory J((_N+1)*(_N+1), 0); - libp::memory h1(_N+1); + memory J((_N+1)*(_N+1), 0); + memory h1(_N+1); for(int n=0;n<=_N;++n){ h1[n] = 2*n+alpha+beta; @@ -240,12 +240,12 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat _x[], dfloat w[] // Compute quadrature by eigenvalue solve // [V,D] = eig(J); - libp::memory WR(_N+1); - libp::memory WI(_N+1); - libp::memory VR((_N+1)*(_N+1)); + memory WR(_N+1); + memory WI(_N+1); + memory VR((_N+1)*(_N+1)); // _x = diag(D); - matrixEigenVectors(_N+1, J.ptr(), VR.ptr(), _x, WI.ptr()); + linAlg_t::matrixEigenVectors(_N+1, J.ptr(), VR.ptr(), _x, WI.ptr()); //w = (V(1,:)').^2*2^(alpha+beta+1)/(alpha+beta+1)*gamma(alpha+1)*.gamma(beta+1)/gamma(alpha+beta+1); for(int n=0;n<=_N;++n){ @@ -267,4 +267,4 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat _x[], dfloat w[] } } -} //namespace libp \ No newline at end of file +} //namespace libp diff --git a/libs/mesh/meshBasisHex3D.cpp b/libs/mesh/meshBasisHex3D.cpp index 9001cc4..e24dd95 100644 --- a/libs/mesh/meshBasisHex3D.cpp +++ b/libs/mesh/meshBasisHex3D.cpp @@ -34,7 +34,7 @@ namespace libp { void mesh_t::NodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[]){ int _Nq = _N+1; - libp::memory r1D(_Nq); + memory r1D(_Nq); JacobiGLL(_N, r1D.ptr()); //Gauss-Legendre-Lobatto nodes //Tensor product @@ -109,4 +109,148 @@ void mesh_t::VertexNodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[], int } } -} //namespace libp \ No newline at end of file +/*Find a matching array between nodes on matching faces */ +void mesh_t::FaceNodeMatchingHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[], + int _faceNodes[], int R[]){ + + int _Nq = _N+1; + int _Nfp = _Nq*_Nq; + + const dfloat NODETOL = 1.0e-5; + + dfloat V0[4][2] = {{-1.0,-1.0},{ 1.0,-1.0},{ 1.0, 1.0},{-1.0, 1.0}}; + dfloat V1[4][2] = {{-1.0,-1.0},{-1.0, 1.0},{ 1.0, 1.0},{ 1.0,-1.0}}; + + dfloat EX0[Nverts], EY0[Nverts]; + dfloat EX1[Nverts], EY1[Nverts]; + + memory x0(_Nfp); + memory y0(_Nfp); + + memory x1(_Nfp); + memory y1(_Nfp); + + + for (int fM=0;fMNODETOL); + } + } + } + } +} + +} //namespace libp diff --git a/libs/mesh/meshConnect.cpp b/libs/mesh/meshConnect.cpp index 9b89fec..ef59673 100644 --- a/libs/mesh/meshConnect.cpp +++ b/libs/mesh/meshConnect.cpp @@ -59,7 +59,7 @@ void mesh_t::Connect(){ **********************/ /* build list of faces */ - libp::memory faces(Nelements*Nfaces); + memory faces(Nelements*Nfaces); #pragma omp parallel for collapse(2) for(dlong e=0;e Nsend(size, 0); - libp::memory Nrecv(size, 0); - libp::memory sendOffsets(size, 0); - libp::memory recvOffsets(size, 0); + memory Nsend(size, 0); + memory Nrecv(size, 0); + memory sendOffsets(size, 0); + memory recvOffsets(size, 0); // WARNING: In some corner cases, the number of faces to send may overrun int storage int allNsend = 0; @@ -147,8 +147,8 @@ void mesh_t::Connect(){ hlong maxv = 0; for(int n=0;n sendFaces(allNsend); - - // Make the MPI_FACE_T data type - MPI_Datatype MPI_FACE_T; - MPI_Datatype dtype[7] = {MPI_HLONG, MPI_DLONG, MPI_DLONG, MPI_INT, - MPI_INT, MPI_INT, MPI_INT}; - int blength[7] = {4, 1, 1, 1, 1, 1, 1}; - MPI_Aint addr[7], displ[7]; - MPI_Get_address ( &(sendFaces[0] ), addr+0); - MPI_Get_address ( &(sendFaces[0].element ), addr+1); - MPI_Get_address ( &(sendFaces[0].elementN ), addr+2); - MPI_Get_address ( &(sendFaces[0].face ), addr+3); - MPI_Get_address ( &(sendFaces[0].faceN ), addr+4); - MPI_Get_address ( &(sendFaces[0].rank ), addr+5); - MPI_Get_address ( &(sendFaces[0].rankN ), addr+6); - displ[0] = 0; - displ[1] = addr[1] - addr[0]; - displ[2] = addr[2] - addr[0]; - displ[3] = addr[3] - addr[0]; - displ[4] = addr[4] - addr[0]; - displ[5] = addr[5] - addr[0]; - displ[6] = addr[6] - addr[0]; - MPI_Type_create_struct (7, blength, displ, dtype, &MPI_FACE_T); - MPI_Type_commit (&MPI_FACE_T); + memory sendFaces(allNsend); // pack face data for(dlong e=0;e recvFaces(allNrecv); + memory recvFaces(allNrecv); // exchange parallel faces - MPI_Alltoallv(sendFaces.ptr(), Nsend.ptr(), sendOffsets.ptr(), MPI_FACE_T, - recvFaces.ptr(), Nrecv.ptr(), recvOffsets.ptr(), MPI_FACE_T, - comm); + comm.Alltoallv(sendFaces, Nsend, sendOffsets, + recvFaces, Nrecv, recvOffsets); // local sort allNrecv received faces sort(recvFaces.ptr(), recvFaces.ptr()+allNrecv, @@ -289,9 +263,8 @@ void mesh_t::Connect(){ }); // send faces back from whence they came - MPI_Alltoallv(recvFaces.ptr(), Nrecv.ptr(), recvOffsets.ptr(), MPI_FACE_T, - sendFaces.ptr(), Nsend.ptr(), sendOffsets.ptr(), MPI_FACE_T, - comm); + comm.Alltoallv(recvFaces, Nrecv, recvOffsets, + sendFaces, Nsend, sendOffsets); // extract connectivity info #pragma omp parallel for @@ -313,13 +286,9 @@ void mesh_t::Connect(){ } } - MPI_Barrier(comm); - MPI_Type_free(&MPI_FACE_T); - //record the number of elements in the whole mesh - hlong NelementsLocal = (hlong) Nelements; - NelementsGlobal = 0; - MPI_Allreduce(&NelementsLocal, &NelementsGlobal, 1, MPI_HLONG, MPI_SUM, comm); + NelementsGlobal = Nelements; + comm.Allreduce(NelementsGlobal); } } //namespace libp diff --git a/libs/mesh/meshConnectFaceNodes.cpp b/libs/mesh/meshConnectFaceNodes.cpp index e05014b..2e13d5f 100644 --- a/libs/mesh/meshConnectFaceNodes.cpp +++ b/libs/mesh/meshConnectFaceNodes.cpp @@ -31,149 +31,10 @@ namespace libp { // serial face-node to face-node connection void mesh_t::ConnectFaceNodes(){ - const dfloat NODETOL = 1.0e-5; - - dfloat V0[4][2] = {{-1.0,-1.0},{ 1.0,-1.0},{ 1.0, 1.0},{-1.0, 1.0}}; - dfloat V1[4][2] = {{-1.0,-1.0},{-1.0, 1.0},{ 1.0, 1.0},{ 1.0,-1.0}}; - - dfloat EX0[Nverts], EY0[Nverts]; - dfloat EX1[Nverts], EY1[Nverts]; - - libp::memory x0(Nfp); - libp::memory y0(Nfp); - - libp::memory x1(Nfp); - libp::memory y1(Nfp); - /* Build the permutation array R */ - libp::memory R(Nfaces*Nfaces*Nverts*Nfp); - - for (int fM=0;fMNODETOL){ - //This shouldn't happen - std::stringstream ss; - ss << "Unable to match face node, face: " << fM - << ", matching face: " << fP - << ", rotation: " << rot - << ", node: " << n - << ". Is the reference node set not symmetric?"; - HIPBONE_ABORT(ss.str()) - } - } - } - } - } - - x0.free(); y0.free(); - x1.free(); y1.free(); + memory R(Nfaces*Nfaces*NfaceVertices*Nfp); + FaceNodeMatchingHex3D(N, r.ptr(), s.ptr(), t.ptr(), + faceNodes.ptr(), R.ptr()); /* volume indices of the interior and exterior face nodes for each element */ vmapM.malloc(Nfp*Nfaces*Nelements); @@ -213,27 +74,10 @@ void mesh_t::ConnectFaceNodes(){ const dlong id = eM*Nfaces*Nfp + fM*Nfp + nM; vmapM[id] = idM + eM*Np; vmapP[id] = idP + eP*Np; - - #if 0 - /*Sanity check*/ - dfloat xnM = x[idM + eM*Np]; - dfloat ynM = y[idM + eM*Np]; - dfloat znM = z[idM + eM*Np]; - dfloat xnP = x[idP + eP*Np]; - dfloat ynP = y[idP + eP*Np]; - dfloat znP = z[idP + eP*Np]; - const dfloat dist = pow(xnM-xnP,2) + pow(ynM-ynP,2) + pow(znM-znP,2); - if (dist>NODETOL) - printf("Mismatch?: Element %d, face %d, node %d, xM = (%f, %f, %f), xP = (%f, %f, %f)\n", - eM, fM, nM, xnM, ynM, znM, xnP, ynP, znP); - #endif } } } } - -// printf("connecting (%d,%d) to (%d,%d) [ vmapM %d to vmapP %d ]\n", -// e,f,eP,fP, vmapM[id], vmapP[id]); } } //namespace libp diff --git a/libs/mesh/meshConnectFaceVertices.cpp b/libs/mesh/meshConnectFaceVertices.cpp index bc8a06e..322e1a8 100644 --- a/libs/mesh/meshConnectFaceVertices.cpp +++ b/libs/mesh/meshConnectFaceVertices.cpp @@ -33,7 +33,7 @@ void mesh_t::ConnectFaceVertices(){ //allocate and fill a halo region in element-to-vertex mapping EToV.realloc((Nelements+totalHaloPairs)*Nverts); - halo.Exchange(EToV.ptr(), Nverts, ogs::Hlong); + halo.Exchange(EToV, Nverts); /* volume indices of the interior and exterior face vertices for each element */ VmapM.malloc(NfaceVertices*Nfaces*Nelements); diff --git a/libs/mesh/meshConnectNodes.cpp b/libs/mesh/meshConnectNodes.cpp index 7a3cf02..e4e15df 100644 --- a/libs/mesh/meshConnectNodes.cpp +++ b/libs/mesh/meshConnectNodes.cpp @@ -31,18 +31,10 @@ namespace libp { // uniquely label each node with a global index, used for gatherScatter void mesh_t::ConnectNodes(){ - dlong localNodeCount = Np*Nelements; - libp::memory allLocalNodeCounts(platform.size); - - MPI_Allgather(&localNodeCount, 1, MPI_DLONG, - allLocalNodeCounts.ptr(), 1, MPI_DLONG, - comm); - - hlong gatherNodeStart = 0; - for(int rr=0;rrvirtual gather) globalIds.malloc((totalHaloPairs+Nelements)*Np); @@ -56,16 +48,16 @@ void mesh_t::ConnectNodes(){ } } - hlong localChange = 0, gatherChange = 1; + hlong gatherChange = 1; // keep comparing numbers on positive and negative traces until convergence while(gatherChange>0){ // reset change counter - localChange = 0; + gatherChange = 0; // send halo data and recv into extension of buffer - halo.Exchange(globalIds.ptr(), Np, ogs::Hlong); + halo.Exchange(globalIds, Np); // compare trace nodes #pragma omp parallel for collapse(2) @@ -78,14 +70,14 @@ void mesh_t::ConnectNodes(){ hlong gidP = globalIds[idP]; if(gidP minRank(Ntotal); - libp::memory maxRank(Ntotal); + memory minRank(Ntotal); + memory maxRank(Ntotal); for (dlong i=0;i0){ // reset change counter - localChange = 0; + gatherChange = 0; // send halo data and recv into extension of buffer - halo.Exchange(minRank.ptr(), Nverts, ogs::Int32); - halo.Exchange(maxRank.ptr(), Nverts, ogs::Int32); + halo.Exchange(minRank, Nverts); + halo.Exchange(maxRank, Nverts); // compare trace vertices #pragma omp parallel for collapse(2) @@ -67,19 +67,19 @@ void mesh_t::GatherScatterSetup() { int maxRankP = maxRank[idP]; if(minRankPmaxRankM){ - localChange=1; + gatherChange=1; maxRank[idM] = maxRankP; } } } // sum up changes - MPI_Allreduce(&localChange, &gatherChange, 1, MPI_HLONG, MPI_MAX, comm); + comm.Allreduce(gatherChange); } // count elements that contribute to global C0 gather-scatter @@ -152,14 +152,14 @@ void mesh_t::GatherScatterSetup() { //use the masked ids to make another gs handle (signed so the gather is defined) bool verbose = platform.settings().compareSetting("VERBOSE", "TRUE"); bool unique = true; //flag a unique node in every gather node - ogsMasked.Setup(Nelements*Np, maskedGlobalIds.ptr(), + ogsMasked.Setup(Nelements*Np, maskedGlobalIds, comm, ogs::Signed, ogs::Auto, unique, verbose, platform); gHalo.SetupFromGather(ogsMasked); GlobalToLocal.malloc(Nelements*Np); - ogsMasked.SetupGlobalToLocalMapping(GlobalToLocal.ptr()); + ogsMasked.SetupGlobalToLocalMapping(GlobalToLocal); o_GlobalToLocal = platform.malloc(GlobalToLocal); @@ -175,30 +175,30 @@ void mesh_t::GatherScatterSetup() { #pragma omp parallel for for(dlong n=0;n newglobalIds(Ngather); + memory newglobalIds(Ngather); // every gathered degree of freedom has its own global id - libp::memory globalStarts(size+1, 0); - MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts.ptr()+1, 1, MPI_HLONG, comm); - for(int rr=0;rr globalOffset(size+1, 0); - hlong localNelements = (hlong) Nelements; + memory globalOffset(size+1, 0); //gather number of elements on each rank - MPI_Allgather(&localNelements, 1, MPI_HLONG, globalOffset.ptr()+1, 1, MPI_HLONG, comm); + hlong localNelements = Nelements; + comm.Allgather(localNelements, globalOffset+1); for(int rr=0;rr globalElementId(Nelements+totalHaloPairs); + memory globalElementId(Nelements+totalHaloPairs); //outgoing elements for(int e=0;e(internalElementIds); + o_haloElementIds = platform.malloc(haloElementIds); + o_globalGatherElementList = platform.malloc(globalGatherElementList); + o_localGatherElementList = platform.malloc(localGatherElementList); props = platform.props(); //copy platform props @@ -56,9 +54,8 @@ void mesh_t::OccaSetup(){ props["defines/" "p_G22ID"]= G22ID; props["defines/" "p_GWJID"]= GWJID; - o_D = platform.malloc(D); - - o_ggeo = platform.malloc(ggeo); + o_D = platform.malloc(D); + o_ggeo = platform.malloc(ggeo); } } //namespace libp diff --git a/libs/mesh/meshPhysicalNodes.cpp b/libs/mesh/meshPhysicalNodes.cpp index e87f2ff..2d52da4 100644 --- a/libs/mesh/meshPhysicalNodes.cpp +++ b/libs/mesh/meshPhysicalNodes.cpp @@ -108,9 +108,9 @@ void mesh_t::PhysicalNodes(){ } } - halo.Exchange(x.ptr(), Np, ogs::Dfloat); - halo.Exchange(y.ptr(), Np, ogs::Dfloat); - halo.Exchange(z.ptr(), Np, ogs::Dfloat); + halo.Exchange(x, Np); + halo.Exchange(y, Np); + halo.Exchange(z, Np); } -} //namespace libp \ No newline at end of file +} //namespace libp diff --git a/libs/mesh/meshSetupBox.cpp b/libs/mesh/meshSetupBox.cpp index 6b2b8ca..c8c9a4c 100644 --- a/libs/mesh/meshSetupBox.cpp +++ b/libs/mesh/meshSetupBox.cpp @@ -60,19 +60,18 @@ void mesh_t::SetupBox(){ // size is total number of ranks and populated by the mpi communicator size_x = std::cbrt(size); //number of ranks in each dimension - if (size_x*size_x*size_x != size) - HIPBONE_ABORT(std::string("3D BOX mesh requires a cubic number of MPI ranks since px,py,pz have not been provided.")) + LIBP_ABORT("3D BOX mesh requires a cubic number of MPI ranks since px,py,pz have not been provided.", + size_x*size_x*size_x != size); size_y = size_x; size_z = size_x; } - else if (size_x * size_y * size_z != size) { - // User provided only *some* of px, py, pz, so check if they multiply to - // the right thing. - HIPBONE_ABORT(std::string("3D BOX mesh requires the user specifies all of px, py, pz, or none of px, py, pz. If all are provided, their product must equal the total number of MPI ranks")) - } + // User provided only *some* of px, py, pz, so check if they multiply to + // the right thing. + LIBP_ABORT("3D BOX mesh requires the user specifies all of px, py, pz, or none of px, py, pz. If all are provided, their product must equal the total number of MPI ranks", + size_x * size_y * size_z != size); //local grid physical sizes dfloat dimx = DIMX/size_x; From db1e354e1abaac75af8096d5030f3380b03eb032 Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Tue, 8 Mar 2022 16:15:10 -0600 Subject: [PATCH 10/12] Update hipBone to use latest libs --- README.md | 5 ----- hipBone.cpp | 10 ++++------ hipBone.hpp | 12 +++++------- make.top | 2 +- src/operator.cpp | 12 ++++++------ src/run.cpp | 27 ++++++++++----------------- src/settings.cpp | 11 ++++------- src/setup.cpp | 7 +++---- 8 files changed, 33 insertions(+), 53 deletions(-) diff --git a/README.md b/README.md index 956c5ba..9ef7f13 100644 --- a/README.md +++ b/README.md @@ -89,9 +89,4 @@ To clean the `hipBone` build objects: $ cd /path/to/hipBone/repo $ make realclean -To clean JIT kernel objects: - - $ cd /path/to/hipBone/repo - $ rm -r .occa - Please invoke `make help` for more supported options. diff --git a/hipBone.cpp b/hipBone.cpp index 0028514..e1ac9ee 100644 --- a/hipBone.cpp +++ b/hipBone.cpp @@ -29,11 +29,10 @@ SOFTWARE. int main(int argc, char **argv){ // start up MPI - MPI_Init(&argc, &argv); + comm_t::Init(argc, argv); { /*Scope so everything is destructed before MPI_Finalize */ - - MPI_Comm comm = MPI_COMM_WORLD; + comm_t comm(comm_t::world().Dup()); hipBoneSettings_t settings(argc, argv, comm); if (settings.compareSetting("VERBOSE", "TRUE")) @@ -50,10 +49,9 @@ int main(int argc, char **argv){ // run hb.Run(); - } // close down MPI - MPI_Finalize(); - return HIPBONE_SUCCESS; + comm_t::Finalize(); + return LIBP_SUCCESS; } diff --git a/hipBone.hpp b/hipBone.hpp index 5b77a5c..d39c803 100644 --- a/hipBone.hpp +++ b/hipBone.hpp @@ -39,7 +39,7 @@ using namespace libp; class hipBoneSettings_t: public settings_t { public: - hipBoneSettings_t(const int argc, char** argv, MPI_Comm& _comm); + hipBoneSettings_t(const int argc, char** argv, comm_t _comm); void report(); }; @@ -50,10 +50,10 @@ class hipBone_t: public solver_t { dfloat lambda; - occa::memory o_AqL; + deviceMemory o_AqL; - occa::kernel operatorKernel; - occa::kernel forcingKernel; + kernel_t operatorKernel; + kernel_t forcingKernel; hipBone_t() = default; hipBone_t(platform_t& _platform, mesh_t &_mesh) { @@ -65,9 +65,7 @@ class hipBone_t: public solver_t { void Run(); - void PlotFields(dfloat* Q, char *fileName); - - void Operator(occa::memory& o_q, occa::memory& o_Aq); + void Operator(deviceMemory& o_q, deviceMemory& o_Aq); }; diff --git a/make.top b/make.top index fa882d5..530f9bb 100644 --- a/make.top +++ b/make.top @@ -51,7 +51,7 @@ export HIPBONE_LIBS= ${HIPBONE_BLAS_LIB} \ ifneq (,${debug}) export HIPBONE_CFLAGS=-O0 -g -Wall -Wshadow -Wno-unused-function -Wno-unknown-pragmas export HIPBONE_CXXFLAGS=-O0 -g -Wall -Wshadow -Wno-unused-function -Wno-unknown-pragmas -std=c++17 - export HIPBONE_DEFINES=-DHIPBONE_DEBUG + export HIPBONE_DEFINES=-DLIBP_DEBUG else export HIPBONE_CFLAGS=-fopenmp -O3 -Wall -Wshadow -Wno-unused-function export HIPBONE_CXXFLAGS=-fopenmp -O3 -Wall -Wshadow -Wno-unused-function -std=c++17 diff --git a/src/operator.cpp b/src/operator.cpp index a3b6246..00d5af3 100644 --- a/src/operator.cpp +++ b/src/operator.cpp @@ -26,9 +26,9 @@ #include "hipBone.hpp" -void hipBone_t::Operator(occa::memory &o_q, occa::memory &o_Aq){ +void hipBone_t::Operator(deviceMemory &o_q, deviceMemory &o_Aq){ - mesh.gHalo.ExchangeStart(o_q, 1, ogs::Dfloat); + mesh.gHalo.ExchangeStart(o_q, 1); if(mesh.NlocalGatherElements/2){ operatorKernel(mesh.NlocalGatherElements/2, @@ -40,7 +40,7 @@ void hipBone_t::Operator(occa::memory &o_q, occa::memory &o_Aq){ } // finalize halo exchange - mesh.gHalo.ExchangeFinish(o_q, 1, ogs::Dfloat); + mesh.gHalo.ExchangeFinish(o_q, 1); if(mesh.NglobalGatherElements) { operatorKernel(mesh.NglobalGatherElements, @@ -52,17 +52,17 @@ void hipBone_t::Operator(occa::memory &o_q, occa::memory &o_Aq){ } //gather result to Aq - mesh.ogsMasked.GatherStart(o_Aq, o_AqL, 1, ogs::Dfloat, ogs::Add, ogs::Trans); + mesh.ogsMasked.GatherStart(o_Aq, o_AqL, 1, ogs::Add, ogs::Trans); if((mesh.NlocalGatherElements+1)/2){ operatorKernel((mesh.NlocalGatherElements+1)/2, - mesh.o_localGatherElementList+(mesh.NlocalGatherElements/2)*sizeof(dlong), + mesh.o_localGatherElementList+(mesh.NlocalGatherElements/2), mesh.o_GlobalToLocal, mesh.o_ggeo, mesh.o_D, lambda, o_q, o_AqL); } - mesh.ogsMasked.GatherFinish(o_Aq, o_AqL, 1, ogs::Dfloat, ogs::Add, ogs::Trans); + mesh.ogsMasked.GatherFinish(o_Aq, o_AqL, 1, ogs::Add, ogs::Trans); } diff --git a/src/run.cpp b/src/run.cpp index ff0d07e..5bba158 100644 --- a/src/run.cpp +++ b/src/run.cpp @@ -25,21 +25,22 @@ SOFTWARE. */ #include "hipBone.hpp" +#include "timer.hpp" void hipBone_t::Run(){ //setup linear solver dlong N = mesh.ogsMasked.Ngather; dlong Nhalo = mesh.gHalo.Nhalo; - linearSolver_t *linearSolver = new cg(platform, N, Nhalo); + cg linearSolver(platform, N, Nhalo); hlong NGlobal = mesh.ogsMasked.NgatherGlobal; dlong NLocal = mesh.Np*mesh.Nelements; //create occa buffers dlong Nall = N+Nhalo; - occa::memory o_r = platform.malloc(Nall*sizeof(dfloat)); - occa::memory o_x = platform.malloc(Nall*sizeof(dfloat)); + deviceMemory o_r = platform.malloc(Nall); + deviceMemory o_x = platform.malloc(Nall); //set x =0 platform.linAlg().set(Nall, 0.0, o_x); @@ -50,24 +51,19 @@ void hipBone_t::Run(){ int maxIter = 100; int verbose = platform.settings().compareSetting("VERBOSE", "TRUE") ? 1 : 0; - platform.device.finish(); - MPI_Barrier(mesh.comm); - double startTime = MPI_Wtime(); + timePoint_t startTime = GlobalPlatformTime(platform); //call the solver dfloat tol = 0.0; - int Niter = linearSolver->Solve(*this, o_x, o_r, tol, maxIter, verbose); + int Niter = linearSolver.Solve(*this, o_x, o_r, tol, maxIter, verbose); - platform.device.finish(); - MPI_Barrier(mesh.comm); - double endTime = MPI_Wtime(); - double elapsedTime = endTime - startTime; + timePoint_t endTime = GlobalPlatformTime(platform); + double elapsedTime = ElapsedTime(startTime, endTime); int Np = mesh.Np, Nq = mesh.Nq; - hlong NunMasked = NLocal - mesh.Nmasked; - hlong NunMaskedGlobal; - MPI_Allreduce(&NunMasked, &NunMaskedGlobal, 1, MPI_HLONG, MPI_SUM, mesh.comm); + hlong NunMaskedGlobal = NLocal - mesh.Nmasked; + mesh.comm.Allreduce(NunMaskedGlobal); hlong Ndofs = NGlobal; @@ -109,7 +105,4 @@ void hipBone_t::Run(){ printf("hipBone: NekBone FOM = %4.1f GFLOPs. \n", NflopsNekbone/(1.0e9 * elapsedTime)); } - - o_r.free(); o_x.free(); - delete linearSolver; } diff --git a/src/settings.cpp b/src/settings.cpp index 905a89b..0772bb3 100644 --- a/src/settings.cpp +++ b/src/settings.cpp @@ -27,11 +27,11 @@ SOFTWARE. #include "hipBone.hpp" //settings for hipBone solver -hipBoneSettings_t::hipBoneSettings_t(const int argc, char** argv, MPI_Comm &_comm): +hipBoneSettings_t::hipBoneSettings_t(const int argc, char** argv, comm_t _comm): settings_t(_comm) { platformAddSettings(*this); - libp::meshAddSettings(*this); + meshAddSettings(*this); newSetting("-v", "--verbose", "VERBOSE", @@ -44,12 +44,9 @@ hipBoneSettings_t::hipBoneSettings_t(const int argc, char** argv, MPI_Comm &_com void hipBoneSettings_t::report() { - int rank; - MPI_Comm_rank(comm, &rank); - - if (rank==0) { + if (comm.rank()==0) { std::cout << "Settings:\n\n"; platformReportSettings(*this); - libp::meshReportSettings(*this); + meshReportSettings(*this); } } diff --git a/src/setup.cpp b/src/setup.cpp index fb10579..cef0998 100644 --- a/src/setup.cpp +++ b/src/setup.cpp @@ -34,17 +34,16 @@ void hipBone_t::Setup(platform_t& _platform, mesh_t& _mesh){ lambda = 0.1; //hard code //setup linear algebra module - platform.linAlg().InitKernels({"set", "axpy", "innerProd", "norm2"}, - mesh.comm); + platform.linAlg().InitKernels({"set", "axpy", "innerProd", "norm2"}); //Trigger JIT kernel builds ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add); //tmp local storage buffer for Ax op - o_AqL = platform.malloc(mesh.Np*mesh.Nelements*sizeof(dfloat)); + o_AqL = platform.malloc(mesh.Np*mesh.Nelements); // OCCA build stuff - occa::properties kernelInfo = mesh.props; //copy mesh occa properties + properties_t kernelInfo = mesh.props; //copy mesh occa properties // Ax kernel operatorKernel = platform.buildKernel(DHIPBONE "/okl/hipBoneAx.okl", From 21cbf97ec69c690c77d7d58b5df44f06d572419c Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Thu, 10 Mar 2022 09:03:12 -0600 Subject: [PATCH 11/12] Small tweak to block size --- include/ogs/ogsOperator.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ogs/ogsOperator.hpp b/include/ogs/ogsOperator.hpp index bd33c6f..811a1af 100644 --- a/include/ogs/ogsOperator.hpp +++ b/include/ogs/ogsOperator.hpp @@ -120,7 +120,7 @@ class ogsOperator_t { //NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU static constexpr int blockSize = 256; - static constexpr int gatherNodesPerBlock = 1024; //should be a multiple of blockSize for good unrolling + static constexpr int gatherNodesPerBlock = 512; //should be a multiple of blockSize for good unrolling //4 types - Float, Double, Int32, Int64 //4 ops - Add, Mul, Max, Min From 65fd9da7ecfa76466a90e36182ce6dd8bc3bc1b2 Mon Sep 17 00:00:00 2001 From: Noel Chalmers Date: Mon, 14 Mar 2022 18:04:04 -0500 Subject: [PATCH 12/12] Remove ambiguous scan overload --- include/comm.hpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/include/comm.hpp b/include/comm.hpp index 7781da0..3d942c7 100644 --- a/include/comm.hpp +++ b/include/comm.hpp @@ -382,13 +382,6 @@ class comm_t { MPI_Scan(&snd, &rcv, 1, type, op, comm()); mpiType::freeMpiType(type); } - template - void Scan(T& snd, - const op_t op = Sum) const { - T rcv=snd; - Scan(snd, rcv, op); - snd = rcv; - } /*libp::memory gather*/ template class mem, typename T>