Skip to content

Commit

Permalink
Merge branch 'master' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
paboyle committed Nov 16, 2020
2 parents a0ccbb3 + a1063dd commit 9c4dcc5
Show file tree
Hide file tree
Showing 13 changed files with 683 additions and 87 deletions.
3 changes: 3 additions & 0 deletions Grid/GridStd.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,7 @@
///////////////////
#include "Config.h"

#ifdef TOFU
#undef GRID_COMMS_THREADS
#endif
#endif /* GRID_STD_H */
10 changes: 9 additions & 1 deletion Grid/allocator/AlignedAllocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,17 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
////////////////////////////////////////////////////////////////////////////////
// Template typedefs
////////////////////////////////////////////////////////////////////////////////
//template<class T> using commAllocator = devAllocator<T>;
#ifdef ACCELERATOR_CSHIFT
// Cshift on device
template<class T> using cshiftAllocator = devAllocator<T>;
#else
// Cshift on host
template<class T> using cshiftAllocator = std::allocator<T>;
#endif

template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
template<class T> using commVector = std::vector<T,devAllocator<T> >;
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;

NAMESPACE_END(Grid);

Expand Down
9 changes: 6 additions & 3 deletions Grid/communicator/Communicator_mpi3.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
MPI_Initialized(&flag); // needed to coexist with other libs apparently
if ( !flag ) {

#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
#ifndef GRID_COMMS_THREADS
nCommThreads=1;
// wrong results here too
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
Expand Down Expand Up @@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
assert(from != _processor);
assert(gme == ShmRank);
double off_node_bytes=0.0;
int tag;

if ( gfrom ==MPI_UNDEFINED) {
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
tag= dir+from*32;
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
assert(ierr==0);
list.push_back(rrq);
off_node_bytes+=bytes;
}

if ( gdest == MPI_UNDEFINED ) {
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
tag= dir+_processor*32;
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
assert(ierr==0);
list.push_back(xrq);
off_node_bytes+=bytes;
Expand Down
22 changes: 7 additions & 15 deletions Grid/communicator/SharedMemoryMPI.cc
Original file line number Diff line number Diff line change
Expand Up @@ -457,8 +457,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
exit(EXIT_FAILURE);
}
if ( WorldRank == 0 ){
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
// if ( WorldRank == 0 ){
if ( 1 ){
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
}
SharedMemoryZero(ShmCommBuf,bytes);
Expand Down Expand Up @@ -771,20 +772,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);

#ifdef GRID_IBM_SUMMIT
// Hide the shared memory path between sockets
// if even number of nodes
if ( (ShmSize & 0x1)==0 ) {
int SocketSize = ShmSize/2;
int mySocket = ShmRank/SocketSize;
#ifdef GRID_SHM_DISABLE
// Hide the shared memory path between ranks
{
for(int r=0;r<size;r++){
int hisRank=ShmRanks[r];
if ( hisRank!= MPI_UNDEFINED ) {
int hisSocket=hisRank/SocketSize;
if ( hisSocket != mySocket ) {
ShmRanks[r] = MPI_UNDEFINED;
}
}
ShmRanks[r] = MPI_UNDEFINED;
}
}
#endif
Expand Down
88 changes: 79 additions & 9 deletions Grid/cshift/Cshift_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
// Gather for when there is no need to SIMD split
///////////////////////////////////////////////////////////////////
template<class vobj> void
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
{
int rd = rhs.Grid()->_rdimensions[dimension];

Expand Down Expand Up @@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
}
}
{
autoView(rhs_v , rhs, AcceleratorRead);
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for(i,ent,{
buffer_p[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}

Expand All @@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
int n1=rhs.Grid()->_slice_stride[dimension];

if ( cbmask ==0x3){
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for2d(n,e1,b,e2,1,{
int o = n*n1;
Expand All @@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{
int o = n*n1;
int offset = b+n*e2;

vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
});
#endif
} else {
autoView(rhs_v , rhs, AcceleratorRead);

Coordinate rdim=rhs.Grid()->_rdimensions;
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
accelerator_for2d(n,e1,b,e2,1,{

Coordinate coor;
Expand All @@ -134,13 +152,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
extract<vobj>(temp,pointers,offset);
}
});
#else
autoView(rhs_v , rhs, CpuRead);
thread_for2d(n,e1,b,e2,{

Coordinate coor;

int o=n*n1;
int oindex = o+b;

int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);

int ocb=1<<cb;
int offset = b+n*e2;

if ( ocb & cbmask ) {
vobj temp =rhs_v[so+o+b];
extract<vobj>(temp,pointers,offset);
}
});
#endif
}
}

//////////////////////////////////////////////////////
// Scatter for when there is no need to SIMD split
//////////////////////////////////////////////////////
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
{
int rd = rhs.Grid()->_rdimensions[dimension];

Expand Down Expand Up @@ -182,12 +220,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
}

{
autoView( rhs_v, rhs, AcceleratorWrite);
auto buffer_p = & buffer[0];
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorWrite);
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
});
#else
autoView( rhs_v, rhs, CpuWrite);
thread_for(i,ent,{
rhs_v[table[i].first]=buffer_p[table[i].second];
});
#endif
}
}

Expand All @@ -208,14 +253,23 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
int e2=rhs.Grid()->_slice_block[dimension];

if(cbmask ==0x3 ) {
autoView( rhs_v , rhs, AcceleratorWrite);
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
int _slice_block = rhs.Grid()->_slice_block[dimension];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v , rhs, AcceleratorWrite);
accelerator_for2d(n,e1,b,e2,1,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#else
autoView( rhs_v , rhs, CpuWrite);
thread_for2d(n,e1,b,e2,{
int o = n*_slice_stride;
int offset = b+n*_slice_block;
merge(rhs_v[so+o+b],pointers,offset);
});
#endif
} else {

// Case of SIMD split AND checker dim cannot currently be hit, except in
Expand Down Expand Up @@ -280,12 +334,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
}

{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView(rhs_v , rhs, AcceleratorRead);
autoView(lhs_v , lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,vobj::Nsimd(),{
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
});
#else
autoView(rhs_v , rhs, CpuRead);
autoView(lhs_v , lhs, CpuWrite);
thread_for(i,ent,{
lhs_v[table[i].first]=rhs_v[table[i].second];
});
#endif
}
}

Expand Down Expand Up @@ -324,12 +386,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
}

{
auto table = &Cshift_table[0];
#ifdef ACCELERATOR_CSHIFT
autoView( rhs_v, rhs, AcceleratorRead);
autoView( lhs_v, lhs, AcceleratorWrite);
auto table = &Cshift_table[0];
accelerator_for(i,ent,1,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#else
autoView( rhs_v, rhs, CpuRead);
autoView( lhs_v, lhs, CpuWrite);
thread_for(i,ent,{
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
});
#endif
}
}

Expand Down
Loading

0 comments on commit 9c4dcc5

Please sign in to comment.