From a488e897e4f083876a8bd852b2a9ddf3a9d36c9d Mon Sep 17 00:00:00 2001
From: Damon McDougall
Date: Wed, 13 Apr 2022 17:50:23 -0400
Subject: [PATCH 1/2] Add roctx support

Added roctx ranges around comms and some important CG steps. Also added
a build toggle to enable roctx support.
---
 include/comm.hpp             | 81 ++++++++++++++++++++++++++++++++++++
 libs/core/comm.cpp           |  6 +++
 libs/core/linearSolverCG.cpp |  7 +++-
 libs/makefile                |  9 +++-
 makefile                     |  4 ++
 5 files changed, 104 insertions(+), 3 deletions(-)

diff --git a/include/comm.hpp b/include/comm.hpp
index 7fcdca5..ac5b365 100644
--- a/include/comm.hpp
+++ b/include/comm.hpp
@@ -30,6 +30,15 @@ SOFTWARE.
 #include <mpi.h>
 #include "core.hpp"

+#ifdef WITH_ROCTX
+  #include <roctx.h>
+  #define ROCTX_PUSH(string) roctxRangePush(string)
+  #define ROCTX_POP() roctxRangePop()
+#else
+  #define ROCTX_PUSH(string)
+  #define ROCTX_POP()
+#endif
+
 namespace libp {

 #define MAX_PROCESSOR_NAME MPI_MAX_PROCESSOR_NAME
@@ -114,10 +123,12 @@ class comm_t {
             const int dest,
             const int count=-1,
             const int tag=0) const {
+    ROCTX_PUSH("comm_t::Send(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
     MPI_Send(m.ptr(), cnt, type, dest, tag, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory recv*/
@@ -126,10 +137,12 @@ class comm_t {
             const int source,
             const int count=-1,
             const int tag=0) const {
+    ROCTX_PUSH("comm_t::Recv(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
     MPI_Recv(m.ptr(), cnt, type, source, tag, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar send*/
@@ -137,9 +150,11 @@ class comm_t {
   void Send(T& val,
             const int dest,
             const int tag=0) const {
+    ROCTX_PUSH("comm_t::Send(T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Send(&val, 1, type, dest, tag, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar recv*/
@@ -147,9 +162,11 @@ class comm_t {
   void Recv(T& val,
             const int source,
             const int tag=0) const {
+    ROCTX_PUSH("comm_t::Recv(T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Recv(&val, 1, type, source, tag, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory non-blocking send*/
@@ -159,9 +176,11 @@ class comm_t {
              const int count,
              const int tag,
              request_t &request) const {
+    ROCTX_PUSH("comm_t::Isend(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Isend(m.ptr(), count, type, dest, tag, comm(), &request);
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory non-blocking recv*/
@@ -171,9 +190,11 @@ class comm_t {
              const int count,
              const int tag,
              request_t &request) const {
+    ROCTX_PUSH("comm_t::Irecv(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Irecv(m.ptr(), count, type, source, tag, comm(), &request);
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar non-blocking send*/
@@ -182,9 +203,11 @@ class comm_t {
              const int dest,
              const int tag,
              request_t &request) const {
+    ROCTX_PUSH("comm_t::Isend(T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Isend(&val, 1, type, dest, tag, comm(), &request);
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar non-blocking recv*/
@@ -193,9 +216,11 @@ class comm_t {
              const int source,
              const int tag,
              request_t &request) const {
+    ROCTX_PUSH("comm_t::Irecv(T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Irecv(&val, 1, type, source, tag, comm(), &request);
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory broadcast*/
@@ -203,19 +228,23 @@ class comm_t {
   void Bcast(mem<T> m,
              const int root,
              const int count=-1) const {
+    ROCTX_PUSH("comm_t::Bcast(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
     MPI_Bcast(m.ptr(), cnt, type, root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar broadcast*/
   template <typename T>
   void Bcast(T& val,
              const int root) const {
+    ROCTX_PUSH("comm_t::Bcast(T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Bcast(&val, 1, type, root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory reduce*/
@@ -225,10 +254,12 @@ class comm_t {
               const int root,
               const op_t op = Sum,
               const int count=-1) const {
+    ROCTX_PUSH("comm_t::Reduce(mem, mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(snd.length()) : count;
     MPI_Reduce(snd.ptr(), rcv.ptr(), cnt, type, op, root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory in-place reduce*/
@@ -237,6 +268,7 @@ class comm_t {
               const int root,
               const op_t op = Sum,
               const int count=-1) const {
+    ROCTX_PUSH("comm_t::Reduce(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
     if (_rank==root) {
@@ -245,6 +277,7 @@ class comm_t {
       MPI_Reduce(m.ptr(), nullptr, cnt, type, op, root, comm());
     }
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar reduce*/
@@ -253,9 +286,11 @@ class comm_t {
               T& rcv,
               const int root,
               const op_t op = Sum) const {
+    ROCTX_PUSH("comm_t::Reduce(T, T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Reduce(&snd, &rcv, 1, type, op, root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }
   template <typename T>
   void Reduce(T& val,
@@ -272,10 +307,12 @@ class comm_t {
                  mem<T> rcv,
                  const op_t op = Sum,
                  const int count=-1) const {
+    ROCTX_PUSH("comm_t::Allreduce(mem, mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(snd.length()) : count;
     MPI_Allreduce(snd.ptr(), rcv.ptr(), cnt, type, op, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory in-place allreduce*/
@@ -283,10 +320,12 @@ class comm_t {
   void Allreduce(mem<T> m,
                  const op_t op = Sum,
                  const int count=-1) const {
+    ROCTX_PUSH("comm_t::Allreduce(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
     MPI_Allreduce(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar allreduce*/
@@ -294,9 +333,11 @@ class comm_t {
   void Allreduce(const T& snd,
                  T& rcv,
                  const op_t op = Sum) const {
+    ROCTX_PUSH("comm_t::Allreduce(T, T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Allreduce(&snd, &rcv, 1, type, op, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }
   template <typename T>
   void Allreduce(T& val,
@@ -313,9 +354,11 @@ class comm_t {
                   const op_t op,
                   const int count,
                   request_t &request) const {
+    ROCTX_PUSH("comm_t::Iallreduce(mem, mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Iallreduce(snd.ptr(), rcv.ptr(), count, type, op, comm(), &request);
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory non-blocking in-place allreduce*/
@@ -325,9 +368,11 @@ class comm_t {
                   const op_t op,
                   const int count,
                   request_t &request) const {
+    ROCTX_PUSH("comm_t::Iallreduce(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Iallreduce(MPI_IN_PLACE, m.ptr(), count, type, op, comm(), &request);
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar non-blocking allreduce*/
@@ -336,18 +381,22 @@ class comm_t {
                   T& rcv,
                   const op_t op,
                   request_t &request) const {
+    ROCTX_PUSH("comm_t::Iallreduce(T, T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Iallreduce(&snd, &rcv, 1, type, op, comm(), &request);
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar non-blocking in-place allreduce*/
   template <template<typename> class mem, typename T>
   void Iallreduce(T& val, const op_t op,
                   request_t &request) const {
+    ROCTX_PUSH("comm_t::Iallreduce(T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Iallreduce(MPI_IN_PLACE, &val, 1, type, op, comm(), &request);
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory scan*/
@@ -356,10 +405,12 @@ class comm_t {
             mem<T> rcv,
             const op_t op = Sum,
             const int count=-1) const {
+    ROCTX_PUSH("comm_t::Scan(mem, mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(snd.length()) : count;
     MPI_Scan(snd.ptr(), rcv.ptr(), cnt, type, op, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory in-place scan*/
@@ -367,10 +418,12 @@ class comm_t {
   void Scan(mem<T> m,
             const op_t op = Sum,
             const int count=-1) const {
+    ROCTX_PUSH("comm_t::Scan(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
     MPI_Scan(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar scan*/
@@ -378,9 +431,11 @@ class comm_t {
   void Scan(const T& snd,
             T& rcv,
             const op_t op = Sum) const {
+    ROCTX_PUSH("comm_t::Scan(T, T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Scan(&snd, &rcv, 1, type, op, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory gather*/
@@ -389,11 +444,13 @@ class comm_t {
               mem<T> rcv,
               const int root,
               const int sendCount=-1) const {
+    ROCTX_PUSH("comm_t::Gather(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (sendCount==-1) ?
                     static_cast<int>(snd.length()) : sendCount;
     MPI_Gather(snd.ptr(), cnt, type, rcv.ptr(), cnt, type, root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory gatherv*/
@@ -404,11 +461,13 @@ class comm_t {
                const memory<int> recvCounts,
                const memory<int> recvOffsets,
                const int root) const {
+    ROCTX_PUSH("comm_t::Gatherv(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Gatherv(snd.ptr(), sendcount, type,
                 rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
                 root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar gather*/
@@ -416,10 +475,12 @@ class comm_t {
   void Gather(const T& snd,
               mem<T> rcv,
               const int root) const {
+    ROCTX_PUSH("comm_t::Gather(T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Gather(&snd, 1, type,
                rcv.ptr(), 1, type, root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory scatter*/
@@ -428,11 +489,13 @@ class comm_t {
                mem<T> rcv,
                const int root,
                const int count=-1) const {
+    ROCTX_PUSH("comm_t::Scatter(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (count==-1) ? static_cast<int>(rcv.length()) : count;
     MPI_Scatter(snd.ptr(), cnt, type,
                 rcv.ptr(), cnt, type, root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory scatterv*/
@@ -443,11 +506,13 @@ class comm_t {
                 mem<T> rcv,
                 const int recvcount,
                 const int root) const {
+    ROCTX_PUSH("comm_t::Scatterv(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Scatterv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type,
                  rcv.ptr(), recvcount, type,
                  root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar scatter*/
@@ -455,10 +520,12 @@ class comm_t {
   void Scatter(T& rcv,
                const mem<T> snd,
                const int root) const {
+    ROCTX_PUSH("comm_t::Scatter(T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Scatter(snd.ptr, 1, type,
                 &rcv, 1, type, root, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory allgather*/
@@ -466,19 +533,23 @@ class comm_t {
   void Allgather(const mem<T> snd,
                  mem<T> rcv,
                  const int sendCount=-1) const {
+    ROCTX_PUSH("comm_t::Allgather(mem, mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     const int cnt = (sendCount==-1) ?
                     static_cast<int>(snd.length()) : sendCount;
     MPI_Allgather(snd.ptr(), cnt, type, rcv.ptr(), cnt, type, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   template <template<typename> class mem, typename T>
   void Allgather(mem<T> m, const int cnt) const {
+    ROCTX_PUSH("comm_t::Allgather(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Allgather(MPI_IN_PLACE, cnt, type,
                   m.ptr(), cnt, type, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory allgatherv*/
@@ -488,21 +559,25 @@ class comm_t {
                   mem<T> rcv,
                   const memory<int> recvCounts,
                   const memory<int> recvOffsets) const {
+    ROCTX_PUSH("comm_t::Allgatherv(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Allgatherv(snd.ptr(), sendcount, type,
                    rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
                    comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*scalar allgather*/
   template <template<typename> class mem, typename T>
   void Allgather(const T& snd,
                  mem<T> rcv) const {
+    ROCTX_PUSH("comm_t::Allgather(T)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Allgather(&snd, 1, type,
                   rcv.ptr(), 1, type, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory alltoall*/
@@ -510,10 +585,12 @@ class comm_t {
   void Alltoall(const mem<T> snd,
                 mem<T> rcv,
                 const int cnt=1) const {
+    ROCTX_PUSH("comm_t::Alltoall(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Alltoall(snd.ptr(), cnt, type,
                  rcv.ptr(), cnt, type, comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   /*libp::memory alltoallv*/
@@ -524,11 +601,13 @@ class comm_t {
                  mem<T> rcv,
                  const memory<int> recvCounts,
                  const memory<int> recvOffsets) const {
+    ROCTX_PUSH("comm_t::Alltoallv(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Alltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type,
                   rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
                   comm());
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   template <template<typename> class mem, typename T>
@@ -539,11 +618,13 @@ class comm_t {
                   const memory<int> recvCounts,
                   const memory<int> recvOffsets,
                   request_t &request) const {
+    ROCTX_PUSH("comm_t::Ialltoallv(mem)");
     MPI_Datatype type = mpiType<T>::getMpiType();
     MPI_Ialltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type,
                    rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
                    comm(), &request);
     mpiType<T>::freeMpiType(type);
+    ROCTX_POP();
   }

   void Wait(request_t &request) const;
diff --git a/libs/core/comm.cpp b/libs/core/comm.cpp
index 853a70c..32e60e8 100644
--- a/libs/core/comm.cpp
+++ b/libs/core/comm.cpp
@@ -96,15 +96,21 @@ MPI_Comm comm_t::comm() const {
 }

 void comm_t::Wait(request_t &request) const {
+  roctxRangePush("comm_t::Wait");
   MPI_Wait(&request, MPI_STATUS_IGNORE);
+  roctxRangePop();
 }

 void comm_t::Waitall(const int count, memory<request_t> &requests) const {
+  roctxRangePush("comm_t::Waitall");
   MPI_Waitall(count, requests.ptr(), MPI_STATUSES_IGNORE);
+  roctxRangePop();
 }

 void comm_t::Barrier() const {
+  roctxRangePush("comm_t::Barrier");
   MPI_Barrier(comm());
+  roctxRangePop();
 }

 } //namespace libp
diff --git a/libs/core/linearSolverCG.cpp b/libs/core/linearSolverCG.cpp
index ba60534..eb4ab63 100644
--- a/libs/core/linearSolverCG.cpp
+++ b/libs/core/linearSolverCG.cpp
@@ -25,6 +25,7 @@ SOFTWARE.
 */

 #include "linearSolver.hpp"
+#include <roctx.h>

 namespace libp {

@@ -65,6 +66,7 @@ int cg::Solve(solver_t& solver,
              const dfloat tol,
              const int MAXIT,
              const int verbose) {
+  if (MAXIT <= 100) roctxRangePush("cg::Solve");

   int rank = platform.rank();
   linAlg_t &linAlg = platform.linAlg();
@@ -115,7 +117,9 @@ int cg::Solve(solver_t& solver,
     // x <= x + alpha*p
     // r <= r - alpha*A*p
     // dot(r,r)
+    if (MAXIT <= 100) roctxRangePush("cg::UpdateCG");
     rdotr = UpdateCG(alpha, o_x, o_r);
+    if (MAXIT <= 100) roctxRangePop();

     if (verbose&&(rank==0)) {
       if(rdotr<0)
@@ -125,13 +129,13 @@ int cg::Solve(solver_t& solver,
     }
   }

+  if (MAXIT <= 100) roctxRangePop();
   return iter;
 }

 dfloat cg::UpdateCG(const dfloat alpha,
                     deviceMemory<dfloat> o_x,
                     deviceMemory<dfloat> o_r){

-  linAlg_t &linAlg = platform.linAlg();

   // r <= r - alpha*A*p
@@ -150,6 +154,7 @@ dfloat cg::UpdateCG(const dfloat alpha,

   /*Compute all reduce while axpy is running*/
   dfloat rdotr1 = h_tmprdotr[0];
+  comm.Allreduce(rdotr1);

   return rdotr1;

diff --git a/libs/makefile b/libs/makefile
index 1bacc36..8f92f17 100644
--- a/libs/makefile
+++ b/libs/makefile
@@ -45,6 +45,11 @@ LIBCORE_DEFINES=${HIPBONE_DEFINES}
 LIBMESH_DEFINES=${HIPBONE_DEFINES}
 LIBOGS_DEFINES=${HIPBONE_DEFINES}

+ifeq (true,${with-roctx})
+  LIBCORE_DEFINES+= -DWITH_ROCTX
+  LIBCORE_INCLUDES= -I${ROCM_PATH}/include
+endif
+
 ifeq (true,${gpu-aware-mpi})
   LIBOGS_DEFINES+= -DGPU_AWARE_MPI
 endif
@@ -123,10 +128,10 @@ endif

 core/%.o: core/%.cpp $(LIB_DEPS)
 ifneq (,${verbose})
-	$(HIPBONE_CXX) -o $@ -c $< ${LIBCORE_DEFINES} $(LIB_CXXFLAGS)
+	$(HIPBONE_CXX) -o $@ -c $< ${LIBCORE_DEFINES} $(LIB_CXXFLAGS) $(LIBCORE_INCLUDES)
 else
 	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(HIPBONE_CXX) -o $@ -c $< ${LIBCORE_DEFINES} $(LIB_CXXFLAGS)
+	@$(HIPBONE_CXX) -o $@ -c $< ${LIBCORE_DEFINES} $(LIB_CXXFLAGS) $(LIBCORE_INCLUDES)
 endif

 mesh/%.o: mesh/%.cpp $(LIB_DEPS) | libogs.a
diff --git a/makefile b/makefile
index 4f40db6..3705fb7 100644
--- a/makefile
+++ b/makefile
@@ -91,6 +91,10 @@ HB_CXXFLAGS=${HIPBONE_CXXFLAGS} ${DEFINES} ${INCLUDES}
 LIBS=-L${HIPBONE_LIBS_DIR} -lmesh -logs -lcore \
      ${HIPBONE_LIBS}

+ifeq (true,${with-roctx})
+  LIBS+= -L${ROCM_PATH}/lib -lroctx64
+endif
+
 #link flags
 LFLAGS=${HB_CXXFLAGS} ${LIBS}

From 71e039f9bd0c78e1d1c53a58327addfd6ba6202b Mon Sep 17 00:00:00 2001
From: Damon McDougall
Date: Wed, 26 Apr 2023 12:56:21 -0700
Subject: [PATCH 2/2] Update README with roctx and gpu-aware MPI builds

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index aba661f..6b12003 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,12 @@ To build `hipBone` manually:
     $ export OPENBLAS_DIR=/path/to/openblas
     $ make -j `nproc`

+To build with `roctx` support, make sure that the `ROCM_PATH` environment
+variable is set and build with `make with-roctx=true`.
+
+To build with support for GPU-aware MPI, make sure your MPI stack supports
+handling device buffers and build with `make gpu-aware-mpi=true`.
+
 How to run `hipBone`
 --------------------
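Usage sketch (not part of the patches above): one way to see the ranges these patches add is to build with the new toggle and collect a trace with `rocprof`. This assumes a ROCm install that provides `rocprof` with rocTX tracing; the exact flag names vary between ROCm versions, so check `rocprof --help`, and replace the hipBone arguments with whatever problem size you normally run.

    $ export ROCM_PATH=/opt/rocm
    $ make -j `nproc` with-roctx=true
    $ rocprof --hip-trace --roctx-trace ./hipBone <usual hipBone arguments>

The `comm_t::*` ranges then appear in the trace alongside HIP API and kernel activity. Note that the `cg::Solve` and `cg::UpdateCG` ranges are only pushed when `MAXIT <= 100`, per the guard added in `linearSolverCG.cpp`.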