Skip to content

Commit

Permalink
Now, in the case of MPI, only the first process will run on the GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
Iximiel committed Nov 11, 2024
1 parent c71066c commit 874e88c
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 145 deletions.
311 changes: 170 additions & 141 deletions plugins/cudaCoord/Coordination.cu
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "plumed/core/Colvar.h"
#include "plumed/tools/NeighborList.h"
#include "plumed/tools/SwitchingFunction.h"
#include "plumed/tools/Communicator.h"

#include "cudaHelpers.cuh"
// #include "ndReduction.h"
Expand All @@ -46,9 +47,11 @@

using std::cerr;

// #define vdbg(...) std::cerr << __LINE__ << ":" << #__VA_ARGS__ << " " <<
// (__VA_ARGS__) << '\n'
#define vdbg(...)

#define hdbg(...) __LINE__ << ":" #__VA_ARGS__ " = " << (__VA_ARGS__) << '\n'
// #define vdbg(...) std::cerr << __LINE__ << ":" << #__VA_ARGS__ << " " << (__VA_ARGS__) << '\n'
#define vdbg(...) std::cerr << hdbg(__VA_ARGS__)
// #define vdbg(...)

namespace PLMD {
namespace colvar {
Expand Down Expand Up @@ -110,6 +113,8 @@ template <typename calculateFloat> class CudaCoordination : public Colvar {
PLMD::GPU::ortoPBCs<calculateFloat> myPBC;

bool pbc{true};
/// This is used to avoid launching the same computation 3 times with MPI
bool const mpiActive{true};
void setUpPermanentGPUMemory();

enum class calculationMode { self, dual, pair, none };
Expand Down Expand Up @@ -168,111 +173,118 @@ void CudaCoordination<calculateFloat>::registerKeywords (Keywords &keys) {

template <typename calculateFloat>
CudaCoordination<calculateFloat>::~CudaCoordination() {
cudaStreamDestroy (streamDerivatives);
cudaStreamDestroy (streamVirial);
cudaStreamDestroy (streamCoordination);
if(mpiActive) {
cudaStreamDestroy (streamDerivatives);
cudaStreamDestroy (streamVirial);
cudaStreamDestroy (streamCoordination);
}
}

template <typename calculateFloat>
void CudaCoordination<calculateFloat>::calculate() {
constexpr unsigned dataperthread = 4;
if (pbc) {
//myPBC is used in the three calculation mode functions
// Only ortho as now
auto box = getBox();

myPBC.X = box (0, 0);
myPBC.Y = box (1, 1);
myPBC.Z = box (2, 2);
makeWhole();
}
auto positions = getPositions();
/***************************copying data on the GPU**************************/
CUDAHELPERS::plmdDataToGPU (cudaPositions, positions, streamDerivatives);
/***************************copying data on the GPU**************************/

// number of things to be reduced
size_t t2br = 0;

switch (mode) {
case calculationMode::self:
t2br = doSelf();
break;
case calculationMode::dual:
t2br = doDual();
break;
case calculationMode::pair:
t2br = doPair();
break;
case calculationMode::none:
// throw"this should not have been happened"
break;
}

/**************************accumulating the results**************************/

cudaDeviceSynchronize();
Tensor virial;
double coordination;
auto deriv = std::vector<Vector> (positions.size());
CUDAHELPERS::plmdDataFromGPU (cudaDerivatives, deriv, streamDerivatives);

auto N = t2br;
// if (N>1){
//maybe if N=1 it is more efficient to not do the reduction
//or to sneakily invert the two groups
do {
size_t runningThreads = CUDAHELPERS::threadsPerBlock (
ceil (double (N) / dataperthread), maxReductionNumThreads);

unsigned nGroups = ceil (double (N) / (runningThreads * dataperthread));

reductionMemoryVirial.resize (9 * nGroups);
reductionMemoryCoord.resize (nGroups);

dim3 ngroupsVirial (nGroups, 9);
CUDAHELPERS::doReductionND<dataperthread> (
thrust::raw_pointer_cast (cudaVirial.data()),
thrust::raw_pointer_cast (reductionMemoryVirial.data()),
N,
ngroupsVirial,
runningThreads,
streamVirial);

CUDAHELPERS::doReduction1D<dataperthread> (
thrust::raw_pointer_cast (cudaCoordination.data()),
thrust::raw_pointer_cast (reductionMemoryCoord.data()),
N,
nGroups,
runningThreads,
streamCoordination);

if (nGroups == 1) {
CUDAHELPERS::plmdDataFromGPU (
reductionMemoryVirial, virial, streamVirial);
// TODO:find a way to stream this
coordination = reductionMemoryCoord[0];
} else {
reductionMemoryVirial.swap (cudaVirial);
reductionMemoryCoord.swap (cudaCoordination);
auto deriv = std::vector<Vector> (getPositions().size());
if(mpiActive) {
constexpr unsigned dataperthread = 4;
if (pbc) {
//myPBC is used in the three calculation mode functions
// Only ortho as now
auto box = getBox();

myPBC.X = box (0, 0);
myPBC.Y = box (1, 1);
myPBC.Z = box (2, 2);
makeWhole();
}
auto positions = getPositions();
/***************************copying data on the GPU**************************/
CUDAHELPERS::plmdDataToGPU (cudaPositions, positions, streamDerivatives);
/***************************copying data on the GPU**************************/

N = nGroups;
} while (N > 1);
// }else {
// CUDAHELPERS::plmdDataFromGPU (
// cudaVirial, virial, streamVirial);
// // TODO:find a way to stream this
// coordination = cudaCoordination[0];
// }
// number of things to be reduced
size_t t2br = 0;

// in this way we do not resize with additional memory allocation
if (reductionMemoryCoord.size() > cudaCoordination.size())
reductionMemoryCoord.swap (cudaCoordination);
if (reductionMemoryVirial.size() > cudaVirial.size())
reductionMemoryVirial.swap (cudaVirial);
// this ensures that the memory is fully in the host ram
cudaDeviceSynchronize();
switch (mode) {
case calculationMode::self:
t2br = doSelf();
break;
case calculationMode::dual:
t2br = doDual();
break;
case calculationMode::pair:
t2br = doPair();
break;
case calculationMode::none:
// throw"this should not have been happened"
break;
}

/**************************accumulating the results**************************/

cudaDeviceSynchronize();
CUDAHELPERS::plmdDataFromGPU (cudaDerivatives, deriv, streamDerivatives);

auto N = t2br;
// if (N>1){
//maybe if N=1 it is more efficient to not do the reduction
//or to sneakily invert the two groups
do {
size_t runningThreads = CUDAHELPERS::threadsPerBlock (
ceil (double (N) / dataperthread), maxReductionNumThreads);

unsigned nGroups = ceil (double (N) / (runningThreads * dataperthread));

reductionMemoryVirial.resize (9 * nGroups);
reductionMemoryCoord.resize (nGroups);

dim3 ngroupsVirial (nGroups, 9);
CUDAHELPERS::doReductionND<dataperthread> (
thrust::raw_pointer_cast (cudaVirial.data()),
thrust::raw_pointer_cast (reductionMemoryVirial.data()),
N,
ngroupsVirial,
runningThreads,
streamVirial);

CUDAHELPERS::doReduction1D<dataperthread> (
thrust::raw_pointer_cast (cudaCoordination.data()),
thrust::raw_pointer_cast (reductionMemoryCoord.data()),
N,
nGroups,
runningThreads,
streamCoordination);

if (nGroups == 1) {
CUDAHELPERS::plmdDataFromGPU (
reductionMemoryVirial, virial, streamVirial);
// TODO:find a way to stream this
coordination = reductionMemoryCoord[0];
} else {
reductionMemoryVirial.swap (cudaVirial);
reductionMemoryCoord.swap (cudaCoordination);
}

N = nGroups;
} while (N > 1);
// }else {
// CUDAHELPERS::plmdDataFromGPU (
// cudaVirial, virial, streamVirial);
// // TODO:find a way to stream this
// coordination = cudaCoordination[0];
// }

// in this way we do not resize with additional memory allocation
if (reductionMemoryCoord.size() > cudaCoordination.size())
reductionMemoryCoord.swap (cudaCoordination);
if (reductionMemoryVirial.size() > cudaVirial.size())
reductionMemoryVirial.swap (cudaVirial);
// this ensures that the memory is fully in the host ram
cudaDeviceSynchronize();
}
comm.Bcast (coordination, 0);
comm.Bcast (virial, 0);
comm.Bcast (deriv, 0);
for (unsigned i = 0; i < deriv.size(); ++i)
setAtomsDerivatives (i, deriv[i]);

Expand Down Expand Up @@ -583,7 +595,7 @@ getDerivDual (const unsigned natLoop,
calculateFloat mydevY = 0.0;
calculateFloat mydevZ = 0.0;
calculateFloat mycoord = 0.0;

// local calculation aid
const calculateFloat x = coordActive[X (i)];
const calculateFloat y = coordActive[Y (i)];
Expand Down Expand Up @@ -831,7 +843,10 @@ size_t CudaCoordination<calculateFloat>::doPair() {

template <typename calculateFloat>
CudaCoordination<calculateFloat>::CudaCoordination (const ActionOptions &ao)
: PLUMED_COLVAR_INIT (ao) {
: PLUMED_COLVAR_INIT (ao),
//mpiActive is const
mpiActive( comm.Get_rank()== 0) {

std::vector<AtomNumber> GroupA;
parseAtomList ("GROUPA", GroupA);
std::vector<AtomNumber> GroupB;
Expand Down Expand Up @@ -922,7 +937,7 @@ CudaCoordination<calculateFloat>::CudaCoordination (const ActionOptions &ao)
calculateFloat invr0 = 1.0 / r0_;
switchingParameters.invr0_2 = invr0 * invr0;
constexpr bool dostretch = true;
if (dostretch) {
if (dostretch && mpiActive) {
std::vector<calculateFloat> inputs = {0.0, dmax * invr0};

thrust::device_vector<calculateFloat> inputZeroMax (2);
Expand All @@ -940,50 +955,64 @@ CudaCoordination<calculateFloat>::CudaCoordination (const ActionOptions &ao)
switchingParameters.stretch = 1.0 / (resZeroMax[0] - resZeroMax[1]);
switchingParameters.shift = -resZeroMax[1] * switchingParameters.stretch;
}
comm.Bcast (switchingParameters.dmaxSQ,0);
comm.Bcast (switchingParameters.invr0_2,0);
comm.Bcast (switchingParameters.stretch,0);
comm.Bcast (switchingParameters.shift,0);
comm.Bcast (switchingParameters.nn,0);
comm.Bcast (switchingParameters.mm,0);

}

checkRead();
cudaStreamCreate (&streamDerivatives);
cudaStreamCreate (&streamVirial);
cudaStreamCreate (&streamCoordination);
setUpPermanentGPUMemory();

maxReductionNumThreads = min (1024, maxNumThreads);

cudaFuncAttributes attr;
// the kernels are heavy on registers; this adjusts the maximum number of
// threads accordingly
switch (mode) {
case calculationMode::self:
if (pbc) {
cudaFuncGetAttributes (&attr, &getSelfCoord<true, calculateFloat>);
} else {
cudaFuncGetAttributes (&attr, &getSelfCoord<false, calculateFloat>);
}
break;
case calculationMode::dual:
if (pbc) {
cudaFuncGetAttributes (&attr, &getDerivDual<true, calculateFloat>);
maxNumThreads = min (attr.maxThreadsPerBlock, maxNumThreads);
cudaFuncGetAttributes (&attr, &getCoordDual<true, calculateFloat>);
} else {
cudaFuncGetAttributes (&attr, &getDerivDual<false, calculateFloat>);
maxNumThreads = min (attr.maxThreadsPerBlock, maxNumThreads);
cudaFuncGetAttributes (&attr, &getCoordDual<false, calculateFloat>);
}
break;
case calculationMode::pair:
if (pbc) {
cudaFuncGetAttributes (&attr, &getCoordPair<true, calculateFloat>);
} else {
cudaFuncGetAttributes (&attr, &getCoordPair<false, calculateFloat>);
if (mpiActive) {

cudaStreamCreate (&streamDerivatives);
cudaStreamCreate (&streamVirial);
cudaStreamCreate (&streamCoordination);

setUpPermanentGPUMemory();

maxReductionNumThreads = min (1024, maxNumThreads);

cudaFuncAttributes attr;
// the kernels are heavy on registers; this adjusts the maximum number of
// threads accordingly
switch (mode) {
case calculationMode::self:
if (pbc) {
cudaFuncGetAttributes (&attr, &getSelfCoord<true, calculateFloat>);
} else {
cudaFuncGetAttributes (&attr, &getSelfCoord<false, calculateFloat>);
}
break;
case calculationMode::dual:
if (pbc) {
cudaFuncGetAttributes (&attr, &getDerivDual<true, calculateFloat>);
maxNumThreads = min (attr.maxThreadsPerBlock, maxNumThreads);
cudaFuncGetAttributes (&attr, &getCoordDual<true, calculateFloat>);
} else {
cudaFuncGetAttributes (&attr, &getDerivDual<false, calculateFloat>);
maxNumThreads = min (attr.maxThreadsPerBlock, maxNumThreads);
cudaFuncGetAttributes (&attr, &getCoordDual<false, calculateFloat>);
}
break;
case calculationMode::pair:
if (pbc) {
cudaFuncGetAttributes (&attr, &getCoordPair<true, calculateFloat>);
} else {
cudaFuncGetAttributes (&attr, &getCoordPair<false, calculateFloat>);
}
break;
case calculationMode::none:
// throw"this should not have been happened"
break;
}
break;
case calculationMode::none:
// throw"this should not have been happened"
break;
maxNumThreads = min (attr.maxThreadsPerBlock, maxNumThreads);
}
maxNumThreads = min (attr.maxThreadsPerBlock, maxNumThreads);
comm.Bcast (maxNumThreads, 0);
comm.Bcast (maxReductionNumThreads, 0);


log << " contacts are counted with cutoff (dmax)="
<< sqrt (switchingParameters.dmaxSQ)
Expand Down
Loading

0 comments on commit 874e88c

Please sign in to comment.