diff --git a/3rdParty/gslib/src/gs.c b/3rdParty/gslib/src/gs.c index 7b91607ca..1582f4a11 100644 --- a/3rdParty/gslib/src/gs.c +++ b/3rdParty/gslib/src/gs.c @@ -1114,11 +1114,15 @@ static void auto_setup(struct gs_remote *r, struct gs_topology *top, struct gs_remote r_alt; double time[2][3]; + // #define DRY_RUN(i,gsr,str) do { \ + // if(comm->id==0) printf(" " str ": "); \ + // dry_run_time(time[i],gsr,comm,buf); \ + // if(comm->id==0) \ + // printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \ + // } while(0) + #define DRY_RUN(i,gsr,str) do { \ - if(comm->id==0) printf(" " str ": "); \ dry_run_time(time[i],gsr,comm,buf); \ - if(comm->id==0) \ - printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \ } while(0) #define DRY_RUN_CHECK(str,new_name) do { \ @@ -1143,7 +1147,7 @@ static void auto_setup(struct gs_remote *r, struct gs_topology *top, #undef DRY_RUN_CHECK #undef DRY_RUN - if(comm->id==0) printf(" used all_to_all method: %s\n",name); + // if(comm->id==0) printf(" used all_to_all method: %s\n",name); } } diff --git a/include/ogs_t.h b/include/ogs_t.h deleted file mode 100644 index 2864fc0ed..000000000 --- a/include/ogs_t.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -// OCCA+gslib gather scatter -typedef struct { - - dlong Ngather; // total number of gather nodes - dlong NtotalGather; // total number of gather nodes - dlong NnonHaloGather; // number of local gathered nodes - dlong NhaloGather; // number of gathered nodes on halo - - dlong *nonHaloGatherOffsets; - int *nonHaloGatherHaloFlags; - int *nonHaloGatherBaseRanks; - dlong *nonHaloGatherLocalIds; - hlong *nonHaloGatherBaseIds; - - dlong *haloGatherOffsets; - int *haloGatherHaloFlags; - int *haloGatherBaseRanks; - dlong *haloGatherLocalIds; - hlong *haloGatherBaseIds; - - dlong *ownedHaloGatherIds; - - dfloat * haloGatherTmp; - occa::memory o_nonHaloGatherOffsets; // start of local bases - occa::memory o_nonHaloGatherLocalIds; // base connected nodes - occa::memory o_nonHaloGatherTmp; // DEVICE gather buffer - - occa::memory o_haloGatherOffsets; // start of local bases - occa::memory o_haloGatherLocalIds; // base connected nodes - occa::memory o_haloGatherTmp; // DEVICE gather buffer - - occa::memory o_ownedHaloGatherIds; - - void *haloGsh; // gslib gather - dlong Nhalo; // number of halo nodes - dlong NownedHalo; // number of owned halo nodes - - //degree vectors - dfloat *invDegree, *gatherInvDegree; - occa::memory o_invDegree; - occa::memory o_gatherInvDegree; - -}ogs_t; diff --git a/include/parAlmond.h b/include/parAlmond.h deleted file mode 100644 index 09542cd4c..000000000 --- a/include/parAlmond.h +++ /dev/null @@ -1,327 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#ifndef PARALMOND_H -#define PARALMOND_H 1 - -typedef struct csr_t { - - dlong Nrows; - dlong Ncols; - - dlong NlocalCols; - - //local - dlong diagNNZ; - dlong *diagRowStarts; - dlong *diagCols; - dfloat *diagCoefs; - - //non-local - dlong offdNNZ; - dlong *offdRowStarts; - dlong *offdCols; - dfloat *offdCoefs; - - dfloat *diagInv; - - dfloat *null; - - //storage for smoothing - dfloat *scratch; - - hlong *colMap; - - // MPI halo exchange info - dlong NHalo; - int NrecvTotal; // number of elements to be sent in halo exchange - int NsendTotal; - dlong totalHaloPairs; - dlong *haloElementList; // sorted list of elements to be sent in halo exchange - int *NsendPairs; // number of elements worth of data to send - int *NrecvPairs; // number of elements worth of data to recv - int NsendMessages; // number of messages to send - int NrecvMessages; // number of messages to recv - dfloat *sendBuffer; - - void *haloSendRequests; - void *haloRecvRequests; - -} csr; - - -typedef struct ell_t { - - dlong Nrows; - dlong Ncols; - int nnzPerRow; - dlong strideLength; - dlong actualNNZ; - - occa::memory o_cols; - occa::memory o_coefs; - -} ell; - -typedef struct coo_t { - - dlong Nrows; - dlong Ncols; - dlong nnz; - - // device memory - occa::memory o_offsets; - occa::memory o_cols; - occa::memory o_coefs; - -} coo; - -typedef struct hyb_t { - - dlong Nrows; - dlong Ncols; - - dlong NlocalCols; - - coo *C; - ell *E; - - occa::memory o_diagInv; - - occa::memory o_null; - - // MPI halo exchange info - dlong NHalo; - hlong *colMap; - int NrecvTotal; // number of elements to be sent in halo exchange - int NsendTotal; - dlong *haloElementList; // sorted list of elements to be sent in halo exchange - occa::memory o_haloElementList; - int *NsendPairs; // number of elements worth of data to send - int *NrecvPairs; // number of elements worth of data to recv - int NsendMessages; // number of messages to send - int NrecvMessages; // number of messages to recv - dfloat *sendBuffer; - dfloat *recvBuffer; - occa::memory o_haloBuffer; - - void *haloSendRequests; - void *haloRecvRequests; - -} hyb; - - -typedef struct dcsr_t { - - dlong Nrows; - dlong Ncols; - - dlong NlocalCols; - - //local - dlong diagNNZ; - occa::memory o_diagRows; - occa::memory o_diagCols; - occa::memory o_diagCoefs; - - //non-local - dlong offdNNZ; - occa::memory o_offdRows; - occa::memory o_offdCols; - occa::memory o_offdCoefs; - - // MPI halo exchange info - dlong NHalo; - int NrecvTotal; // number of elements to be sent in halo exchange - int NsendTotal; - dlong totalHaloPairs; - dlong *haloElementList; // sorted list of elements to be sent in halo exchange - int *NsendPairs; // number of elements worth of data to send - int *NrecvPairs; // number of elements worth of data to recv - int NsendMessages; // number of messages to send - int NrecvMessages; // number of messages to recv - dfloat *sendBuffer; - dfloat *recvBuffer; - - occa::memory o_haloElementList; - occa::memory o_haloBuffer; - - void *haloSendRequests; - void *haloRecvRequests; - -} dcoo; - -typedef enum {PCG=0,GMRES=1}KrylovType; -typedef enum {JACOBI=0,DAMPED_JACOBI=1,CHEBYSHEV=2}SmoothType; - -typedef struct agmgLevel_t { - dlong Nrows; - dlong Ncols; - - hlong *globalRowStarts; //global partitioning of fine level - hlong *globalAggStarts; //global partitioning of coarse level - - bool gatherLevel; - bool weightedInnerProds; - - void **AxArgs; - void **smoothArgs; - void **smootherArgs; - void **coarsenArgs; - void **prolongateArgs; - void **gatherArgs; - void **scatterArgs; - - //operator call-backs - void (*device_Ax) (void **args, occa::memory &o_x, occa::memory &o_Ax); - void (*device_smooth) (void **args, occa::memory &o_r, occa::memory &o_x, bool x_is_zero); - void (*device_smoother) (void **args, occa::memory &o_r, occa::memory &o_Sr); - void (*device_coarsen) (void **args, occa::memory &o_x, occa::memory &o_Rx); - void (*device_prolongate)(void **args, occa::memory &o_x, occa::memory &o_Px); - void (*device_gather) (void **args, occa::memory &o_x, occa::memory &o_Gx); - void (*device_scatter) (void **args, occa::memory &o_x, occa::memory &o_Sx); - - //host versions - void (*Ax) (void **args, dfloat *x, dfloat *Ax); - void (*smooth) (void **args, dfloat *r, dfloat *x, bool x_is_zero); - void (*smoother) (void **args, dfloat *r, dfloat *Sr); - void (*coarsen) (void **args, dfloat *x, dfloat *Rx); - void (*prolongate)(void **args, dfloat *x, dfloat *Px); - void (*gather) (void **args, dfloat *x, dfloat *Gx); - void (*scatter) (void **args, dfloat *x, dfloat *Sx); - - //agmg operators - csr *A; - csr *P; - csr *R; - - hyb *deviceA; - dcoo *dcsrP; - hyb *deviceR; - - dfloat *rhs, *res, *x; - - dfloat *Srhs, *Sx; //scatter copies - - dfloat *ckp1, *vkp1, *wkp1; - - dfloat *weight; - - occa::memory o_rhs, o_res, o_x; - occa::memory o_Srhs, o_Sx; - occa::memory o_ckp1, o_vkp1, o_wkp1; - - occa::memory o_weight; - - dfloat *smoother_params; - dfloat *smootherResidual; - dfloat *smootherResidual2; - dfloat *smootherUpdate; - occa::memory o_smootherResidual; - occa::memory o_smootherResidual2; - occa::memory o_smootherUpdate; - int ChebyshevIterations; - - dfloat threshold; - dlong numAggregates; - SmoothType stype; - -} agmgLevel; - -typedef struct { - agmgLevel **levels; - int numLevels; - - KrylovType ktype; - - setupAide options; - - //Matrix Free args - void (*MatFreeAx)(void **args, occa::memory o_q, occa::memory o_Aq,const char* options); - void **MatFreeArgs; - - //Coarse solver - void *ExactSolve; - int coarseTotal; - int coarseOffset; - int *coarseOffsets; - int *coarseCounts; - dfloat *invCoarseA; - dfloat *xCoarse, *rhsCoarse; - - bool nullSpace; - dfloat nullSpacePenalty; - - occa::device device; - occa::stream defaultStream; - occa::stream dataStream; - - occa::memory o_x; - occa::memory o_Ax; - - dfloat *rho; - occa::memory o_rho; - - occa::kernel ellAXPYKernel; - occa::kernel ellZeqAXPYKernel; - occa::kernel ellJacobiKernel; - occa::kernel cooAXKernel; - occa::kernel scaleVectorKernel; - occa::kernel vectorAddKernel; - occa::kernel vectorAddKernel2; - occa::kernel setVectorKernel; - occa::kernel sumVectorKernel; - occa::kernel addScalarKernel; - occa::kernel dotStarKernel; - occa::kernel simpleDotStarKernel; - occa::kernel haloExtract; - occa::kernel agg_interpolateKernel; - occa::kernel innerProdKernel; - occa::kernel vectorAddInnerProdKernel; - occa::kernel kcycleCombinedOp1Kernel; - occa::kernel kcycleCombinedOp2Kernel; - occa::kernel vectorAddWeightedInnerProdKernel; - occa::kernel kcycleWeightedCombinedOp1Kernel; - occa::kernel kcycleWeightedCombinedOp2Kernel; - -} parAlmond_t; - -parAlmond_t *parAlmondInit(mesh_t *mesh, setupAide options); - -void parAlmondAgmgSetup(parAlmond_t* parAlmond, - hlong* rowStarts, - dlong nnz, - hlong* Ai, - hlong* Aj, - dfloat* Avals, - bool nullSpace, - dfloat nullSpacePenalty); - -void parAlmondPrecon(parAlmond_t* parAlmond, occa::memory o_x, occa::memory o_rhs); - -int parAlmondFree(void* A); - -#endif diff --git a/libs/gatherScatter/src/ogsKernels.cpp b/libs/gatherScatter/src/ogsKernels.cpp index 954a5114a..ec0eb20b6 100644 --- a/libs/gatherScatter/src/ogsKernels.cpp +++ b/libs/gatherScatter/src/ogsKernels.cpp @@ -158,7 +158,7 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) { int rank, size; MPI_Comm_rank(comm, &rank); MPI_Comm_size(comm, &size); - + ogs::defaultStream = device.getStream(); ogs::dataStream = device.createStream(); @@ -197,7 +197,7 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) { kernelInfo["compiler_flags"] += "--fmad=true"; // compiler option for cuda } - if (rank==0) printf("Compiling GatherScatter Kernels \n"); + if (rank==0) printf("Compiling GatherScatter Kernels...");fflush(stdout); for (int r=0;rcomm = comm; int rank, size; - MPI_Comm_rank(ogs->comm, &rank); - MPI_Comm_size(ogs->comm, &size); + MPI_Comm_rank(ogs->comm, &rank); + MPI_Comm_size(ogs->comm, &size); //make a host gs handle (calls gslib) ogs->hostGsh = ogsHostSetup(comm, N, ids, 0, 0); @@ -114,10 +114,10 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, //set up the local gatherScatter parallelNode_t *localNodes; - + if (ogs->Nlocal) { localNodes = (parallelNode_t*) calloc(ogs->Nlocal,sizeof(parallelNode_t)); - + dlong cnt=0; for (dlong i=0;iNlocal, sizeof(parallelNode_t), compareLocalId); - //tally up how many nodes are being gathered to each gatherNode and + //tally up how many nodes are being gathered to each gatherNode and // map to a local ordering dlong *localGatherCounts = (dlong*) calloc(ogs->NlocalGather,sizeof(dlong)); dlong *localGatherMap = (dlong*) calloc(ogs->NlocalGather,sizeof(dlong)); cnt = 0; for (dlong i=0;iNlocal;i++) { dlong newId = localNodes[i].newId; //get the ordered id - - if (localNodes[i].owned) + + if (localNodes[i].owned) localGatherMap[newId] = cnt++; //record a new index if this is a new gatherNode - + localNodes[i].newId = localGatherMap[newId]; //reorder localGatherCounts[localGatherMap[newId]]++; //tally } @@ -177,7 +177,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, dlong gatherId = localNodes[i].newId; dlong offset = ogs->localGatherOffsets[gatherId]; int index = localGatherCounts[gatherId]; - + ogs->localGatherIds[offset+index] = localNodes[i].localId; localGatherCounts[gatherId]++; } @@ -235,7 +235,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, // sort based on local ids qsort(haloNodes, ogs->Nhalo, sizeof(parallelNode_t), compareLocalId); - //tally up how many nodes are being gathered to each gatherNode and + //tally up how many nodes are being gathered to each gatherNode and // map to a local ordering dlong *haloGatherCounts = (dlong*) calloc(ogs->NhaloGather,sizeof(dlong)); dlong *haloGatherMap = (dlong*) calloc(ogs->NhaloGather,sizeof(dlong)); @@ -246,19 +246,19 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, dlong cnt2 = ogs->NownedHalo; for (dlong i=0;iNhalo;i++) { dlong newId = haloNodes[i].newId; //get the ordered id - + if (haloNodes[i].owned) { dlong c; - if (haloNodes[i].baseId>0) + if (haloNodes[i].baseId>0) c = cnt++; - else + else c = cnt2++; - symIds[c] = abs(haloNodes[i].baseId); //record the base id - nonSymIds[c] = haloNodes[i].baseId; //record the base id + symIds[c] = abs(haloNodes[i].baseId); //record the base id + nonSymIds[c] = haloNodes[i].baseId; //record the base id haloGatherMap[newId] = c; //record a new index if this is a new gatherNode } - + haloNodes[i].newId = haloGatherMap[newId]; //reorder haloGatherCounts[haloGatherMap[newId]]++; //tally } @@ -275,7 +275,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, dlong gatherId = haloNodes[i].newId; dlong offset = ogs->haloGatherOffsets[gatherId]; int index = haloGatherCounts[gatherId]; - + ogs->haloGatherIds[offset+index] = haloNodes[i].localId; haloGatherCounts[gatherId]++; } @@ -294,7 +294,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, free(minRank); free(maxRank); free(flagIds); //total number of owned gathered nodes - ogs->Ngather = ogs->NlocalGather+ogs->NownedHalo; + ogs->Ngather = ogs->NlocalGather+ogs->NownedHalo; ogs->device = device; @@ -305,7 +305,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, ogs->o_invDegree = device.malloc(N*sizeof(dfloat), ogs->invDegree); ogs->o_gatherInvDegree = device.malloc(ogs->Ngather*sizeof(dfloat), ogs->gatherInvDegree); - + ogsGather(ogs->o_gatherInvDegree, ogs->o_invDegree, ogsDfloat, ogsAdd, ogs); if(ogs->Ngather) @@ -313,11 +313,11 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, ogsScatter(ogs->o_invDegree, ogs->o_gatherInvDegree, ogsDfloat, ogsAdd, ogs); - ogs->o_invDegree.copyTo(ogs->invDegree); - - for(dlong n=0;nN;++n) + if (N) ogs->o_invDegree.copyTo(ogs->invDegree); + + for(dlong n=0;nN;++n) ogs->invDegree[n] = 1./ogs->invDegree[n]; - + for(dlong n=0;nNgather;++n) ogs->gatherInvDegree[n] = 1./ogs->gatherInvDegree[n]; diff --git a/libs/parAlmond/include/agmg.hpp b/libs/parAlmond/include/agmg.hpp new file mode 100644 index 000000000..06e2af214 --- /dev/null +++ b/libs/parAlmond/include/agmg.hpp @@ -0,0 +1,104 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#ifndef PARALMOND_AGMGLEVEL_HPP +#define PARALMOND_AGMGLEVEL_HPP + +namespace parAlmond { + +class agmgLevel: public multigridLevel { + +public: + parCSR *A, *P, *R; + parHYB *o_A, *o_P, *o_R; + + SmoothType stype; + dfloat lambda, lambda1, lambda0; //smoothing params + + int ChebyshevIterations; + + bool gatherLevel; + ogs_t *ogs; + dfloat *Gx, *Sx; + occa::memory o_Sx, o_Gx; + + agmgLevel(parCSR *AA, KrylovType Ktype); + agmgLevel(parCSR *AA, parCSR *PP, parCSR *RR, KrylovType Ktype); + ~agmgLevel(); + + void Ax(dfloat *x, dfloat *Ax); + void Ax(occa::memory o_x, occa::memory o_Ax); + + void smooth(dfloat *rhs, dfloat *x, bool x_is_zero); + void smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero); + + void residual(dfloat *rhs, dfloat *x, dfloat *res); + void residual(occa::memory o_rhs, occa::memory o_x, occa::memory o_res); + + void coarsen(dfloat *x, dfloat *Cx); + void coarsen(occa::memory o_x, occa::memory o_Cx); + + void prolongate(dfloat *x, dfloat *Px); + void prolongate(occa::memory o_x, occa::memory o_Px); + + void smoothJacobi(dfloat *r, dfloat *x, const bool x_is_zero); + void smoothDampedJacobi(dfloat *r, dfloat *x, const bool x_is_zero); + void smoothChebyshev(dfloat *r, dfloat *x, const bool x_is_zero); + + void smoothJacobi(occa::memory o_r, occa::memory o_x, bool x_is_zero); + void smoothDampedJacobi(occa::memory o_r, occa::memory o_x, bool x_is_zero); + void smoothChebyshev(occa::memory o_r, occa::memory o_x, bool x_is_zero); + + void Report(); +}; + + +agmgLevel *coarsenAgmgLevel(agmgLevel *level, KrylovType ktype, setupAide options); + +parCSR* strongGraph(parCSR *A); + +void formAggregates(parCSR *A, parCSR *C, + hlong* FineToCoarse, + hlong* globalAggStarts); + +parCSR *constructProlongation(parCSR *A, hlong *FineToCoarse, + hlong *globalAggStarts, dfloat **nullCoarseA); + +parCSR *transpose(parCSR *A); + +parCSR *galerkinProd(parCSR *A, parCSR *P); + + + +void setupAgmgSmoother(agmgLevel *level, SmoothType s, int ChebIterations); + +void allocateAgmgVectors(agmgLevel *level, int k, int numLevels, CycleType ctype); + +void syncAgmgToDevice(agmgLevel *level, int k, int numLevels, CycleType ctype); + +} + +#endif \ No newline at end of file diff --git a/solvers/parALMOND/okl/cooAX.okl b/libs/parAlmond/include/coarse.hpp similarity index 61% rename from solvers/parALMOND/okl/cooAX.okl rename to libs/parAlmond/include/coarse.hpp index b909676b4..dd0e0aba4 100644 --- a/solvers/parALMOND/okl/cooAX.okl +++ b/libs/parAlmond/include/coarse.hpp @@ -24,29 +24,51 @@ SOFTWARE. */ +#ifndef PARALMOND_COARSESOLVE_HPP +#define PARALMOND_COARSESOLVE_HPP + +namespace parAlmond { + +class coarseSolver { + +public: + int coarseTotal; + int coarseOffset; + int *coarseOffsets=NULL; + int *coarseCounts=NULL; + + int N; + dfloat *invCoarseA=NULL; + + dfloat *xLocal=NULL; + dfloat *rhsLocal=NULL; + + dfloat *xCoarse=NULL; + dfloat *rhsCoarse=NULL; + + bool gatherLevel; + ogs_t *ogs; + dfloat *Gx, *Sx; + occa::memory o_Sx, o_Gx; + + MPI_Comm comm; + occa::device device; + + setupAide options; + + coarseSolver(setupAide options); + ~coarseSolver(); + + int getTargetSize(); + + void setup(parCSR *A); + + void syncToDevice(); + + void solve(dfloat *rhs, dfloat *x); + void solve(occa::memory o_rhs, occa::memory o_x); +}; -// y += alpha*A*x - -@kernel void cooAXKernel(const dlong numRows, - const dfloat alpha, - @restrict const dlong * offsets, - @restrict const dlong * cols, - @restrict const dfloat * coeffs, - @restrict const dfloat * x, - @restrict dfloat * y){ - - for(dlong n=0;n -1) + result += vals[i+c*Nrows]*x[col]; + } + y[i] = alpha*result + betay; + } +} + +@kernel void SpMVell2(const dlong Nrows, + const int nnzPerRow, + const dfloat alpha, + const dfloat beta, + @restrict const dlong * cols, + @restrict const dfloat * vals, + @restrict const dfloat * x, + @restrict const dfloat * y, + @restrict dfloat * z){ + + // z = alpha * A * x + beta * y + for(dlong i=0;i -1) + result += vals[i+c*Nrows]*x[col]; + } + z[i] = alpha*result + beta*y[i]; + } +} diff --git a/libs/parAlmond/okl/SpMVmcsr.okl b/libs/parAlmond/okl/SpMVmcsr.okl new file mode 100644 index 000000000..f75f08878 --- /dev/null +++ b/libs/parAlmond/okl/SpMVmcsr.okl @@ -0,0 +1,83 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +@kernel void SpMVmcsr1(const dlong Nrows, + const dfloat alpha, + const dfloat beta, + @restrict const dlong * rowStarts, + @restrict const dlong * rows, + @restrict const dlong * cols, + @restrict const dfloat * vals, + @restrict const dfloat * x, + @restrict dfloat * y){ + + // y = alpha * A * x + beta * y + for(dlong n=0;n512 + for(int t=0;t256 + for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t512 + for(int t=0;t256 + for(int t=0;t +#include +#include + +#include "mpi.h" +#include "types.h" +#include "ogs.hpp" +#include "setupAide.hpp" + +#include "include/defines.hpp" +#include "include/utils.hpp" +#include "include/kernels.hpp" +#include "include/vector.hpp" +#include "include/matrix.hpp" +#include "include/level.hpp" +#include "include/agmg.hpp" +#include "include/coarse.hpp" +#include "include/solver.hpp" + + +namespace parAlmond { + +solver_t *Init(occa::device device, MPI_Comm comm, setupAide options); + +void AMGSetup(solver_t* M, + hlong* rowStarts, + dlong nnz, + hlong* Ai, + hlong* Aj, + dfloat* Avals, + bool nullSpace, + dfloat nullSpacePenalty); + +void Precon(solver_t* M, occa::memory o_x, occa::memory o_rhs); + +void Report(solver_t *M); + +void Free(solver_t* M); + +} //namespace parAlmond + +#endif diff --git a/libs/parAlmond/src/SpMV.cpp b/libs/parAlmond/src/SpMV.cpp new file mode 100644 index 000000000..053cf8887 --- /dev/null +++ b/libs/parAlmond/src/SpMV.cpp @@ -0,0 +1,398 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +//------------------------------------------------------------------------ +// +// CSR matrix +// +//------------------------------------------------------------------------ +void CSR::SpMV(const dfloat alpha, dfloat *x, + const dfloat beta, dfloat *y) { + // y[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + if (beta) { + // #pragma omp parallel for + for(dlong i=0; i-1) { + result += vals[c+nnzPerRow*i]*x[col]; + } + } + y[i] = alpha*result + beta*y[i]; + } + } else { + // #pragma omp parallel for + for(dlong i=0; i-1) { + result += vals[c+nnzPerRow*i]*x[col]; + } + } + y[i] = alpha*result; + } + } +} + +void ELL::SpMV(const dfloat alpha, dfloat *x, + const dfloat beta, const dfloat *y, dfloat *z) { + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + // #pragma omp parallel for + for(dlong i=0; i-1) { + result += vals[c+nnzPerRow*i]*x[col]; + } + } + z[i] = alpha*result + beta*y[i]; + } +} + +void ELL::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, + occa::memory o_y) { + // y[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + if(nnzPerRow){ + // occaTimerTic(device,"SpMV ELL"); + SpMVellKernel1(Nrows, nnzPerRow, + alpha, beta, o_cols, o_vals, o_x, o_y); + // occaTimerToc(device,"SpMV ELL"); + } +} + +void ELL::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, + occa::memory o_y, occa::memory o_z) { + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + if(nnzPerRow){ + // occaTimerTic(device,"SpMV ELL"); + SpMVellKernel2(Nrows, nnzPerRow, + alpha, beta, o_cols, o_vals, o_x, o_y, o_z); + // occaTimerToc(device,"SpMV ELL"); + } +} + +//------------------------------------------------------------------------ +// +// MCSR matrix +// +//------------------------------------------------------------------------ +void MCSR::SpMV(const dfloat alpha, dfloat *x, + const dfloat beta, dfloat *y){ + // y[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + if (beta) { + // #pragma omp parallel for + for(dlong i=0; ihaloExchangeStart(x); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + diag->SpMV(alpha, x, beta, y); + + this->haloExchangeFinish(x); + + offd->SpMV(alpha, x, 1.0, y); + + //rank 1 correction if there is a nullspace + if (nullSpace) { + dfloat gamma = vectorInnerProd(Nrows, null, x, comm)*nullSpacePenalty; + vectorAdd(Nrows, alpha*gamma, null, 1.0, y); + } +} + +void parCSR::SpMV(const dfloat alpha, dfloat *x, + const dfloat beta, const dfloat *y, dfloat *z) { + + this->haloExchangeStart(x); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + diag->SpMV(alpha, x, beta, y, z); + + this->haloExchangeFinish(x); + + offd->SpMV(alpha, x, 1.0, z); + + //rank 1 correction if there is a nullspace + if (nullSpace) { + dfloat gamma = vectorInnerProd(Nrows, null, x, comm)*nullSpacePenalty; + vectorAdd(Nrows, alpha*gamma, null, 1.0, z); + } +} + +void parCSR::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, + occa::memory o_y) { + + this->haloExchangeStart(o_x); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + diag->SpMV(alpha, o_x, beta, o_y); + + this->haloExchangeFinish(o_x); + + offd->SpMV(alpha, o_x, 1.0, o_y); + + //rank 1 correction if there is a nullspace + if (nullSpace) { + dfloat gamma = vectorInnerProd(Nrows, o_null, o_x, comm)*nullSpacePenalty; + vectorAdd(Nrows, alpha*gamma, o_null, 1.0, o_y); + } +} + +void parCSR::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, + occa::memory o_y, occa::memory o_z) { + + this->haloExchangeStart(o_x); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + diag->SpMV(alpha, o_x, beta, o_y, o_z); + + this->haloExchangeFinish(o_x); + + offd->SpMV(alpha, o_x, 1.0, o_z); + + //rank 1 correction if there is a nullspace + if (nullSpace) { + dfloat gamma = vectorInnerProd(Nrows, o_null, o_x, comm)*nullSpacePenalty; + vectorAdd(Nrows, alpha*gamma, o_null, 1.0, o_z); + } +} + +//------------------------------------------------------------------------ +// +// parHYB matrix +// +//------------------------------------------------------------------------ +void parHYB::SpMV(const dfloat alpha, dfloat *x, + const dfloat beta, dfloat *y) { + + this->haloExchangeStart(x); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + E->SpMV(alpha, x, beta, y); + + this->haloExchangeFinish(x); + + C->SpMV(alpha, x, 1.0, y); + + //rank 1 correction if there is a nullspace + if (nullSpace) { + dfloat gamma = vectorInnerProd(Nrows, null, x, comm)*nullSpacePenalty; + vectorAdd(Nrows, alpha*gamma, null, 1.0, y); + } +} + +void parHYB::SpMV(const dfloat alpha, dfloat *x, + const dfloat beta, const dfloat *y, dfloat *z) { + + this->haloExchangeStart(x); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + E->SpMV(alpha, x, beta, y, z); + + this->haloExchangeFinish(x); + + C->SpMV(alpha, x, 1.0, z); + + //rank 1 correction if there is a nullspace + if (nullSpace) { + dfloat gamma = vectorInnerProd(Nrows, null, x, comm)*nullSpacePenalty; + vectorAdd(Nrows, alpha*gamma, null, 1.0, z); + } +} + +void parHYB::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, + occa::memory o_y) { + + this->haloExchangeStart(o_x); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + E->SpMV(alpha, o_x, beta, o_y); + + this->haloExchangeFinish(o_x); + + C->SpMV(alpha, o_x, 1.0, o_y); + + //rank 1 correction if there is a nullspace + if (nullSpace) { + dfloat gamma = vectorInnerProd(Nrows, o_null, o_x, comm)*nullSpacePenalty; + vectorAdd(Nrows, alpha*gamma, o_null, 1.0, o_y); + } +} + +void parHYB::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, + occa::memory o_y, occa::memory o_z) { + + this->haloExchangeStart(o_x); + + // z[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) + E->SpMV(alpha, o_x, beta, o_y, o_z); + + this->haloExchangeFinish(o_x); + + C->SpMV(alpha, o_x, 1.0, o_z); + + //rank 1 correction if there is a nullspace + if (nullSpace) { + dfloat gamma = vectorInnerProd(Nrows, o_null, o_x, comm)*nullSpacePenalty; + vectorAdd(Nrows, alpha*gamma, o_null, 1.0, o_z); + } +} + + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/agmgLevel.cpp b/libs/parAlmond/src/agmgLevel.cpp new file mode 100644 index 000000000..5e99d5967 --- /dev/null +++ b/libs/parAlmond/src/agmgLevel.cpp @@ -0,0 +1,186 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +agmgLevel::agmgLevel(parCSR *A_, KrylovType ktype_): + multigridLevel(A_->Nrows, A_->Ncols, ktype_, A_->comm) { + + weighted = false; + gatherLevel = false; + + A = A_; +} + +agmgLevel::agmgLevel(parCSR *A_, parCSR *P_, parCSR *R_, KrylovType ktype_): + multigridLevel(A_->Nrows, A_->Ncols, ktype_, A_->comm) { + + //max + Ncols = (A_->Ncols>P_->Ncols) ? A_->Ncols : P_->Ncols; + + weighted = false; + gatherLevel = false; + + A = A_; + P = P_; + R = R_; +} + +agmgLevel::~agmgLevel() { + + delete A; delete P; delete R; + delete o_A; delete o_P; delete o_R; + +} + +void agmgLevel::Ax (dfloat *x, dfloat *Ax){ A->SpMV(1.0, x, 0.0, Ax); } + +void agmgLevel::coarsen (dfloat *r, dfloat *Rr){ + if (gatherLevel) { + ogsGather(Gx, r, ogsDfloat, ogsAdd, ogs); + vectorDotStar(ogs->Ngather, ogs->gatherInvDegree, Gx); + R->SpMV(1.0, Gx, 0.0, Rr); + } else { + R->SpMV(1.0, r, 0.0, Rr); + } +} + +void agmgLevel::prolongate(dfloat *x, dfloat *Px){ + if (gatherLevel) { + P->SpMV(1.0, x, 0.0, Gx); + ogsScatter(Sx, Gx, ogsDfloat, ogsAdd, ogs); + vectorAdd(P->Nrows, 1.0, Sx, 1.0, Px); + } else { + P->SpMV(1.0, x, 1.0, Px); + } +} + +void agmgLevel::residual (dfloat *rhs, dfloat *x, dfloat *res) { A->SpMV(-1.0, x, 1.0, rhs, res); } + +void agmgLevel::Ax (occa::memory o_x, occa::memory o_Ax){ o_A->SpMV(1.0, o_x, 0.0, o_Ax); } + +void agmgLevel::coarsen (occa::memory o_r, occa::memory o_Rr){ + if (gatherLevel) { + ogsGather(o_Gx, o_r, ogsDfloat, ogsAdd, ogs); + vectorDotStar(ogs->Ngather, ogs->o_gatherInvDegree, o_Gx); + o_R->SpMV(1.0, o_Gx, 0.0, o_Rr); + } else { + o_R->SpMV(1.0, o_r, 0.0, o_Rr); + } +} + +void agmgLevel::prolongate(occa::memory o_x, occa::memory o_Px){ + if (gatherLevel) { + o_P->SpMV(1.0, o_x, 0.0, o_Gx); + ogsScatter(o_Sx, o_Gx, ogsDfloat, ogsAdd, ogs); + vectorAdd(ogs->N, 1.0, o_Sx, 1.0, o_Px); + } else { + o_P->SpMV(1.0, o_x, 1.0, o_Px); + } +} + +void agmgLevel::residual (occa::memory o_rhs, occa::memory o_x, occa::memory o_res) { o_A->SpMV(-1.0, o_x, 1.0, o_rhs, o_res); } + +void agmgLevel::smooth(dfloat *rhs, dfloat *x, bool x_is_zero){ + if(stype == JACOBI){ + this->smoothJacobi(rhs, x, x_is_zero); + } else if(stype == DAMPED_JACOBI){ + this->smoothDampedJacobi(rhs, x, x_is_zero); + } else if(stype == CHEBYSHEV){ + this->smoothChebyshev(rhs, x, x_is_zero); + } +} + +void agmgLevel::smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero){ + if(stype == JACOBI){ + this->smoothJacobi(o_rhs, o_x, x_is_zero); + } else if(stype == DAMPED_JACOBI){ + this->smoothDampedJacobi(o_rhs, o_x, x_is_zero); + } else if(stype == CHEBYSHEV){ + this->smoothChebyshev(o_rhs, o_x, x_is_zero); + } +} + +void agmgLevel::Report() { + + int rank, size; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + + hlong hNrows = (hlong) Nrows; + + int active = (Nrows>0) ? 1:0; + int totalActive=0; + MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, comm); + + dlong minNrows=0, maxNrows=0; + hlong totalNrows=0; + dfloat avgNrows; + MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, comm); + MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, comm); + avgNrows = (dfloat) totalNrows/totalActive; + + if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min + MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, comm); + + + long long int nnz; + nnz = A->diag->nnz+A->offd->nnz; + + long long int minNnz=0, maxNnz=0, totalNnz=0; + dfloat avgNnz; + MPI_Allreduce(&nnz, &maxNnz, 1, MPI_LONG_LONG_INT, MPI_MAX, comm); + MPI_Allreduce(&nnz, &totalNnz, 1, MPI_LONG_LONG_INT, MPI_SUM, comm); + avgNnz = (dfloat) totalNnz/totalActive; + + if (nnz==0) nnz = maxNnz; //set this so it's ignored for the global min + MPI_Allreduce(&nnz, &minNnz, 1, MPI_LONG_LONG_INT, MPI_MIN, comm); + + dfloat nnzPerRow = (Nrows==0) ? 0 : (dfloat) nnz/Nrows; + dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0; + MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, comm); + MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, comm); + avgNnzPerRow /= totalActive; + + if (Nrows==0) nnzPerRow = maxNnzPerRow; + MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, comm); + + char smootherString[BUFSIZ]; + if (stype==DAMPED_JACOBI) + strcpy(smootherString, "Damped Jacobi "); + else if (stype==CHEBYSHEV) + strcpy(smootherString, "Chebyshev "); + + if (rank==0){ + printf( "| parAlmond | %12d | %13d | %s|\n", minNrows, (int)minNnzPerRow, smootherString); + printf(" | | %12d | %13d | |\n", maxNrows, (int)maxNnzPerRow); + printf(" | | %12d | %13d | |\n", (int)avgNrows, (int)avgNnzPerRow); + } +} + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/agmgSetup/adjustPartition.cpp b/libs/parAlmond/src/agmgSetup/adjustPartition.cpp new file mode 100644 index 000000000..500234680 --- /dev/null +++ b/libs/parAlmond/src/agmgSetup/adjustPartition.cpp @@ -0,0 +1,277 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +void adjustPartition(agmgLevel *level, hlong* FineToCoarse, setupAide options) { + // MPI info + int rank, size; + rank = agmg::rank; + size = agmg::size; + + dlong N = level->A->Nrows; + + //Need to establish 'ownership' of aggregates + + //Keep the current partitioning for STRONGNODES. + // The rank that had the strong node for each aggregate owns the aggregate + if (options.compareArgs("PARALMOND PARTITION", "STRONGNODES")) return; + + //populate aggregate array + hlong gNumAggs = level->globalAggStarts[size]; //total number of aggregates + + parallelAggregate_t *sendAggs; + if (N) + sendAggs = (parallelAggregate_t *) calloc(N,sizeof(parallelAggregate_t)); + else + sendAggs = (parallelAggregate_t *) calloc(1,sizeof(parallelAggregate_t)); + + for (dlong i=0;imaxEntries) { + ownerRank = r; + maxEntries = rankCounts[r]; + } + } + + //set this aggregate's owner + for (dlong i=aggStarts[n];iglobalAggStarts[0] = 0; + for (int r=0;rglobalAggStarts[r+1] = level->globalAggStarts[r] + lNumAggs[r]; + + //set the new global coarse index + cnt = level->globalAggStarts[rank]; + if (newRecvNtotal) newRecvAggs[0].newCoarseId = cnt; + for (dlong i=1;igetTargetSize(); + + AMGstartLev = numLevels; + + agmgLevel *L = new agmgLevel(A, ktype); + levels[numLevels] = L; + + setupAgmgSmoother((agmgLevel*)(levels[numLevels]), stype, ChebyshevIterations); + + hlong globalSize = L->A->globalRowStarts[size]; + + //if the system if already small, dont create MG levels + bool done = false; + if(globalSize <= gCoarseSize){ + coarseLevel->setup(A); + baseLevel = numLevels; + done = true; + } + numLevels++; + + while(!done){ + L = coarsenAgmgLevel((agmgLevel*)(levels[numLevels-1]), ktype, options); + levels[numLevels] = L; + hlong globalCoarseSize = L->A->globalRowStarts[size]; + numLevels++; + + if(globalCoarseSize <= gCoarseSize || globalSize < 2*globalCoarseSize){ + coarseLevel->setup(L->A); + baseLevel = numLevels-1; + break; + } + globalSize = globalCoarseSize; + } + + size_t requiredBytes = 3*levels[AMGstartLev]->Ncols*sizeof(dfloat); + allocateScratchSpace(requiredBytes, device); + + for (int n=AMGstartLev;nsyncToDevice(); +} + +//create coarsened problem +agmgLevel *coarsenAgmgLevel(agmgLevel *level, KrylovType ktype, setupAide options){ + + int rank, size; + MPI_Comm_rank(level->comm, &rank); + MPI_Comm_size(level->comm, &size); + + parCSR *C = strongGraph(level->A); + + hlong *FineToCoarse = (hlong *) malloc(level->A->Ncols*sizeof(hlong)); + hlong *globalAggStarts = (hlong *) calloc(size+1,sizeof(hlong)); + + formAggregates(level->A, C, FineToCoarse, globalAggStarts); + + // adjustPartition(FineToCoarse, options); + + dfloat *nullCoarseA; + parCSR *P = constructProlongation(level->A, FineToCoarse, globalAggStarts, &nullCoarseA); + parCSR *R = transpose(P); + parCSR *A = galerkinProd(level->A, P); + + A->null = nullCoarseA; + + agmgLevel *coarseLevel = new agmgLevel(A,P,R, ktype); + + //update the number of columns required for this level (from R) + level->Ncols = (level->Ncols > R->Ncols) ? level->Ncols : R->Ncols; + + return coarseLevel; +} + +void setupAgmgSmoother(agmgLevel *level, SmoothType s, int ChebIterations){ + + level->stype = s; + level->ChebyshevIterations = ChebIterations; + + if((s == DAMPED_JACOBI)||(s == CHEBYSHEV)){ + // estimate rho(invD * A) + dfloat rho = level->A->rhoDinvA(); + + if (s == DAMPED_JACOBI) { + level->lambda = (4./3.)/rho; + } else if (s == CHEBYSHEV) { + level->lambda1 = rho; + level->lambda0 = rho/10.; + } + } +} + +void allocateAgmgVectors(agmgLevel *level, int k, int AMGstartLev, CycleType ctype) { + + if (k) level->x = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); + if (k) level->rhs = (dfloat *) calloc(level->Nrows,sizeof(dfloat)); + + level->res = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); + + //kcycle vectors + if (ctype==KCYCLE) { + if ((k>0) && (kck = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); + level->vk = (dfloat *) calloc(level->Nrows,sizeof(dfloat)); + level->wk = (dfloat *) calloc(level->Nrows,sizeof(dfloat)); + } + } +} + +void syncAgmgToDevice(agmgLevel *level, int k, int AMGstartLev, CycleType ctype) { + + occa::device device = level->A->device; + + level->o_A = new parHYB(level->A); + level->o_A->syncToDevice(); + if (k>AMGstartLev) { + level->o_R = new parHYB(level->R); + level->o_P = new parHYB(level->P); + level->o_R->syncToDevice(); + level->o_P->syncToDevice(); + } + + if (level->x ) level->o_x = device.malloc(level->Ncols*sizeof(dfloat),level->x); + if (level->rhs) level->o_rhs = device.malloc(level->Nrows*sizeof(dfloat),level->rhs); + if (level->res) level->o_res = device.malloc(level->Ncols*sizeof(dfloat),level->res); + + if (ctype==KCYCLE) { + if ((k>0) && (ko_ck = device.malloc(level->Ncols*sizeof(dfloat),level->ck); + level->o_vk = device.malloc(level->Nrows*sizeof(dfloat),level->vk); + level->o_wk = device.malloc(level->Nrows*sizeof(dfloat),level->wk); + } + } +} + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/agmgSetup/constructProlongation.cpp b/libs/parAlmond/src/agmgSetup/constructProlongation.cpp new file mode 100644 index 000000000..9887a6af3 --- /dev/null +++ b/libs/parAlmond/src/agmgSetup/constructProlongation.cpp @@ -0,0 +1,124 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +parCSR *constructProlongation(parCSR *A, hlong *FineToCoarse, + hlong *globalAggStarts, dfloat **nullCoarseA){ + // MPI info + int rank, size; + MPI_Comm_rank(A->comm, &rank); + MPI_Comm_size(A->comm, &size); + + const dlong N = A->Nrows; + + const hlong globalAggOffset = globalAggStarts[rank]; + const dlong NCoarse = (dlong) (globalAggStarts[rank+1]-globalAggStarts[rank]); //local num agg + + parCSR* P = new parCSR(N, NCoarse, A->comm, A->device); + + P->globalRowStarts = A->globalRowStarts; + P->globalColStarts = globalAggStarts; + + P->diag->rowStarts = (dlong *) calloc(N+1, sizeof(dlong)); + P->offd->rowStarts = (dlong *) calloc(N+1, sizeof(dlong)); + + // each row has exactly one nonzero + for(dlong i=0; iglobalAggOffset-1)&&(coldiag->rowStarts[i+1]++; + } else { + P->offd->rowStarts[i+1]++; + } + } + for(dlong i=0; idiag->rowStarts[i+1] += P->diag->rowStarts[i]; + P->offd->rowStarts[i+1] += P->offd->rowStarts[i]; + } + P->diag->nnz = P->diag->rowStarts[N]; + P->offd->nnz = P->offd->rowStarts[N]; + + // Halo setup + hlong *colIds = (hlong *) malloc(P->offd->nnz*sizeof(hlong)); + dlong cnt=0; + for (dlong i=0;iglobalAggOffset+NCoarse-1)) + colIds[cnt++] = col; + } + P->haloSetup(colIds); + + P->diag->cols = (dlong *) calloc(P->diag->nnz, sizeof(dlong)); + P->diag->vals = (dfloat *) calloc(P->diag->nnz, sizeof(dfloat)); + P->offd->cols = (dlong *) calloc(P->offd->nnz, sizeof(dlong)); + P->offd->vals = (dfloat *) calloc(P->offd->nnz, sizeof(dfloat)); + + dlong diagCnt = 0; + dlong offdCnt = 0; + for(dlong i=0; iglobalAggStarts[rank]-1)&&(coldiag->cols[diagCnt ] = (dlong) (col - globalAggOffset); //local index + P->diag->vals[diagCnt++] = A->null[i]; + } else { + P->offd->cols[offdCnt ] = colIds[offdCnt]; + P->offd->vals[offdCnt++] = A->null[i]; + } + } + + // normalize the columns of P + *nullCoarseA = (dfloat *) calloc(P->Ncols,sizeof(dfloat)); + + //add local nonzeros + for(dlong i=0; idiag->nnz; i++) + (*nullCoarseA)[P->diag->cols[i]] += P->diag->vals[i] * P->diag->vals[i]; + + //add nonlocal nonzeros + for(dlong i=0; ioffd->nnz; i++) + (*nullCoarseA)[P->offd->cols[i]] += P->offd->vals[i] * P->offd->vals[i]; + + ogsGatherScatter((*nullCoarseA), ogsDfloat, ogsAdd, P->ogs); + + for(dlong i=0; iNcols; i++) + (*nullCoarseA)[i] = 0.; + + ogsGatherScatter((*nullCoarseA), ogsDfloat, ogsAdd, P->ogs); + + for(dlong i=0; idiag->nnz; i++) + P->diag->vals[i] /= (*nullCoarseA)[P->diag->cols[i]]; + for(dlong i=0; ioffd->nnz; i++) + P->offd->vals[i] /= (*nullCoarseA)[P->offd->cols[i]]; + + return P; +} + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/agmgSetup/formAggregates.cpp b/libs/parAlmond/src/agmgSetup/formAggregates.cpp new file mode 100644 index 000000000..12401a579 --- /dev/null +++ b/libs/parAlmond/src/agmgSetup/formAggregates.cpp @@ -0,0 +1,297 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +void formAggregates(parCSR *A, parCSR *C, + hlong* FineToCoarse, + hlong* globalAggStarts){ + + int rank, size; + MPI_Comm_rank(A->comm, &rank); + MPI_Comm_size(A->comm, &size); + + const dlong N = C->Nrows; + const dlong M = C->Ncols; + const dlong diagNNZ = C->diag->nnz; + const dlong offdNNZ = C->offd->nnz; + + dfloat *rands = (dfloat *) calloc(M, sizeof(dfloat)); + int *states = (int *) calloc(M, sizeof(int)); + + dfloat *Tr = (dfloat *) calloc(M, sizeof(dfloat)); + int *Ts = (int *) calloc(M, sizeof(int)); + hlong *Ti = (hlong *) calloc(M, sizeof(hlong)); + hlong *Tc = (hlong *) calloc(M, sizeof(hlong)); + + hlong *globalRowStarts = A->globalRowStarts; + + for(dlong i=0; idiag->cols[i]]++; + + for(dlong i=0; ioffd->cols[i]]++; + + //gs for total column counts + ogsGatherScatter(colCnt, ogsInt, ogsAdd, A->ogs); + + //add random pertubation + for(int i=0;iogs); + + hlong done = 0; + while(!done){ + // first neighbours + // #pragma omp parallel for + for(dlong i=0; idiag->rowStarts[i];jjdiag->rowStarts[i+1];jj++){ + const dlong col = C->diag->cols[jj]; + if (col==i) continue; + if(customLess(smax, rmax, imax, states[col], rands[col], col + globalRowStarts[rank])){ + smax = states[col]; + rmax = rands[col]; + imax = col + globalRowStarts[rank]; + } + } + //nonlocal entries + for(dlong jj=C->offd->rowStarts[i];jjoffd->rowStarts[i+1];jj++){ + const dlong col = C->offd->cols[jj]; + if(customLess(smax, rmax, imax, states[col], rands[col], A->colMap[col])) { + smax = states[col]; + rmax = rands[col]; + imax = A->colMap[col]; + } + } + } + Ts[i] = smax; + Tr[i] = rmax; + Ti[i] = imax; + } + + //share results + for (dlong n=N;nogs); + ogsGatherScatter(Ts, ogsInt, ogsAdd, A->ogs); + ogsGatherScatter(Ti, ogsHlong, ogsAdd, A->ogs); + + // second neighbours + // #pragma omp parallel for + for(dlong i=0; idiag->rowStarts[i];jjdiag->rowStarts[i+1];jj++){ + const dlong col = C->diag->cols[jj]; + if (col==i) continue; + if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){ + smax = Ts[col]; + rmax = Tr[col]; + imax = Ti[col]; + } + } + //nonlocal entries + for(dlong jj=C->offd->rowStarts[i];jjoffd->rowStarts[i+1];jj++){ + const dlong col = C->offd->cols[jj]; + if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){ + smax = Ts[col]; + rmax = Tr[col]; + imax = Ti[col]; + } + } + + // if I am the strongest among all the 1 and 2 ring neighbours + // I am an MIS node + if((states[i] == 0) && (imax == (i + globalRowStarts[rank]))) + states[i] = 1; + + // if there is an MIS node within distance 2, I am removed + if((states[i] == 0) && (smax == 1)) + states[i] = -1; + } + + //share results + for (dlong n=N;nogs); + + // if number of undecided nodes = 0, algorithm terminates + hlong cnt = std::count(states, states+N, 0); + MPI_Allreduce(&cnt,&done,1,MPI_HLONG, MPI_SUM,A->comm); + done = (done == 0) ? 1 : 0; + } + + dlong numAggs = 0; + dlong *gNumAggs = (dlong *) calloc(size,sizeof(dlong)); + + // count the coarse nodes/aggregates + for(dlong i=0; icomm); + + globalAggStarts[0] = 0; + for (int r=0;rogs); + + // form the aggregates + // #pragma omp parallel for + for(dlong i=0; idiag->rowStarts[i];jjdiag->rowStarts[i+1];jj++){ + const dlong col = C->diag->cols[jj]; + if (col==i) continue; + if(customLess(smax, rmax, imax, states[col], rands[col], col + globalRowStarts[rank])){ + smax = states[col]; + rmax = rands[col]; + imax = col + globalRowStarts[rank]; + cmax = FineToCoarse[col]; + } + } + //nonlocal entries + for(dlong jj=C->offd->rowStarts[i];jjoffd->rowStarts[i+1];jj++){ + const dlong col = C->offd->cols[jj]; + if(customLess(smax, rmax, imax, states[col], rands[col], A->colMap[col])){ + smax = states[col]; + rmax = rands[col]; + imax = A->colMap[col]; + cmax = FineToCoarse[col]; + } + } + } + Ts[i] = smax; + Tr[i] = rmax; + Ti[i] = imax; + Tc[i] = cmax; + + if((states[i] == -1) && (smax == 1) && (cmax > -1)) + FineToCoarse[i] = cmax; + } + + //share results + for (dlong n=N;nogs); + ogsGatherScatter(Tr, ogsDfloat, ogsAdd, A->ogs); + ogsGatherScatter(Ts, ogsInt, ogsAdd, A->ogs); + ogsGatherScatter(Ti, ogsHlong, ogsAdd, A->ogs); + ogsGatherScatter(Tc, ogsHlong, ogsAdd, A->ogs); + + // second neighbours + // #pragma omp parallel for + for(dlong i=0; idiag->rowStarts[i];jjdiag->rowStarts[i+1];jj++){ + const dlong col = C->diag->cols[jj]; + if (col==i) continue; + if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){ + smax = Ts[col]; + rmax = Tr[col]; + imax = Ti[col]; + cmax = Tc[col]; + } + } + //nonlocal entries + for(dlong jj=C->offd->rowStarts[i];jjoffd->rowStarts[i+1];jj++){ + const dlong col = C->offd->cols[jj]; + if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){ + smax = Ts[col]; + rmax = Tr[col]; + imax = Ti[col]; + cmax = Tc[col]; + } + } + + if((states[i] == -1) && (smax == 1) && (cmax > -1)) + FineToCoarse[i] = cmax; + } + + //share results + for (dlong n=N;nogs); + + free(rands); + free(states); + free(Tr); + free(Ts); + free(Ti); + free(Tc); + + delete C; +} + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/agmgSetup/galerkinProd.cpp b/libs/parAlmond/src/agmgSetup/galerkinProd.cpp new file mode 100644 index 000000000..1bccf216a --- /dev/null +++ b/libs/parAlmond/src/agmgSetup/galerkinProd.cpp @@ -0,0 +1,273 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +parCSR *galerkinProd(parCSR *A, parCSR *P){ + + // MPI info + int rank, size; + MPI_Comm_rank(A->comm, &rank); + MPI_Comm_size(A->comm, &size); + + hlong *globalAggStarts = P->globalColStarts; + hlong globalAggOffset = globalAggStarts[rank]; + + //The galerkin product can be computed as + // (P^T A P)_IJ = sum_{i in Agg_I} sum_{j in Agg_J} P_iI A_ij P_jJ + // Since each row of P has only one entry, we can share the necessary + // P entries, form the products, and send them to their destination rank + + const dlong N = A->Nrows; + const dlong M = A->Ncols; + + //printf("Level has %d rows, and is making %d aggregates\n", N, globalAggStarts[rank+1]-globalAggStarts[rank]); + + hlong *Pcols = (hlong *) calloc(M,sizeof(hlong)); + dfloat *Pvals = (dfloat *) calloc(M,sizeof(dfloat)); + + //record the entries of P that this rank has + dlong cnt =0; + for (dlong i=0;idiag->rowStarts[i];jdiag->rowStarts[i+1];j++) { + Pcols[cnt] = P->diag->cols[j] + globalAggOffset; //global ID + Pvals[cnt] = P->diag->vals[j]; + cnt++; + } + for (dlong j=P->offd->rowStarts[i];joffd->rowStarts[i+1];j++) { + Pcols[cnt] = P->colMap[P->offd->cols[j]]; //global ID + Pvals[cnt] = P->offd->vals[j]; + cnt++; + } + } + + //fill the halo region + ogsGatherScatter(Pcols, ogsHlong, ogsAdd, A->ogs); + ogsGatherScatter(Pvals, ogsDfloat, ogsAdd, A->ogs); + + + + dlong sendNtotal = A->diag->nnz+A->offd->nnz; + nonzero_t *sendPTAP = (nonzero_t *) calloc(sendNtotal,sizeof(nonzero_t)); + + // Make the MPI_NONZERO_T data type + nonzero_t NZ; + MPI_Datatype MPI_NONZERO_T; + MPI_Datatype dtype[3] = {MPI_HLONG, MPI_HLONG, MPI_DFLOAT}; + int blength[3] = {1, 1, 1}; + MPI_Aint addr[3], displ[3]; + MPI_Get_address ( &(NZ.row), addr+0); + MPI_Get_address ( &(NZ.col), addr+1); + MPI_Get_address ( &(NZ.val), addr+2); + displ[0] = 0; + displ[1] = addr[1] - addr[0]; + displ[2] = addr[2] - addr[0]; + MPI_Type_create_struct (3, blength, displ, dtype, &MPI_NONZERO_T); + MPI_Type_commit (&MPI_NONZERO_T); + + //form the fine PTAP products + cnt =0; + for (dlong i=0;idiag->rowStarts[i]; + dlong end = A->diag->rowStarts[i+1]; + for (dlong j=start;jdiag->cols[j]; + const dfloat val = A->diag->vals[j]; + + sendPTAP[cnt].row = Pcols[i]; + sendPTAP[cnt].col = Pcols[col]; + sendPTAP[cnt].val = val*Pvals[i]*Pvals[col]; + cnt++; + } + start = A->offd->rowStarts[i]; + end = A->offd->rowStarts[i+1]; + for (dlong j=start;joffd->cols[j]; + const dfloat val = A->offd->vals[j]; + + sendPTAP[cnt].row = Pcols[i]; + sendPTAP[cnt].col = Pcols[col]; + sendPTAP[cnt].val = val*Pvals[i]*Pvals[col]; + cnt++; + } + } + + free(Pcols); + free(Pvals); + + //sort entries by the coarse row and col + qsort(sendPTAP, sendNtotal, sizeof(nonzero_t), compareNonZeroByRow); + + //count number of non-zeros we're sending + int *sendCounts = (int *) calloc(size,sizeof(int)); + int *recvCounts = (int *) calloc(size,sizeof(int)); + int *sendOffsets = (int *) calloc(size+1,sizeof(int)); + int *recvOffsets = (int *) calloc(size+1,sizeof(int)); + + int r=0; + for(dlong i=0;i=globalAggStarts[r+1]) r++; + sendCounts[r]++; + } + + // find how many nodes to expect (should use sparse version) + MPI_Alltoall(sendCounts, 1, MPI_INT, + recvCounts, 1, MPI_INT, A->comm); + + // find send and recv offsets for gather + for(int r=0;rcomm); + + //clean up + MPI_Barrier(A->comm); + free(sendPTAP); + free(sendCounts); free(recvCounts); + free(sendOffsets); free(recvOffsets); + + //sort entries by the coarse row and col + qsort(recvPTAP, recvNtotal, sizeof(nonzero_t), compareNonZeroByRow); + + //count total number of nonzeros; + dlong nnz =0; + if (recvNtotal) nnz++; + for (dlong i=1;icomm); + free(recvPTAP); + + dlong numAggs = (dlong) (globalAggStarts[rank+1]-globalAggStarts[rank]); //local number of aggregates + + parCSR *Ac = new parCSR(numAggs, numAggs, A->comm, A->device); + + Ac->globalRowStarts = globalAggStarts; + Ac->globalColStarts = globalAggStarts; + + Ac->diag->rowStarts = (dlong *) calloc(numAggs+1, sizeof(dlong)); + Ac->offd->rowStarts = (dlong *) calloc(numAggs+1, sizeof(dlong)); + + for (dlong n=0;n globalAggStarts[rank]-1)&& + (PTAP[n].col < globalAggStarts[rank+1])) { + Ac->diag->rowStarts[row+1]++; + } else { + Ac->offd->rowStarts[row+1]++; + } + } + + // cumulative sum + for(dlong i=0; idiag->rowStarts[i+1] += Ac->diag->rowStarts[i]; + Ac->offd->rowStarts[i+1] += Ac->offd->rowStarts[i]; + } + Ac->diag->nnz = Ac->diag->rowStarts[numAggs]; + Ac->offd->nnz = Ac->offd->rowStarts[numAggs]; + + // Halo setup + hlong *colIds = (hlong *) malloc(Ac->offd->nnz*sizeof(hlong)); + cnt=0; + for (dlong n=0;n= globalAggStarts[rank+1])) { + colIds[cnt++] = PTAP[n].col; + } + } + Ac->haloSetup(colIds); + + //fill the CSR matrices + Ac->diagA = (dfloat *) calloc(Ac->Ncols, sizeof(dfloat)); + Ac->diagInv = (dfloat *) calloc(Ac->Ncols, sizeof(dfloat)); + Ac->diag->cols = (dlong *) calloc(Ac->diag->nnz, sizeof(dlong)); + Ac->offd->cols = (dlong *) calloc(Ac->offd->nnz, sizeof(dlong)); + Ac->diag->vals = (dfloat *) calloc(Ac->diag->nnz, sizeof(dfloat)); + Ac->offd->vals = (dfloat *) calloc(Ac->offd->nnz, sizeof(dfloat)); + dlong diagCnt = 0; + dlong offdCnt = 0; + for (dlong n=0;n globalAggStarts[rank]-1)&& + (PTAP[n].col < globalAggStarts[rank+1])) { + Ac->diag->cols[diagCnt] = (dlong) (PTAP[n].col - globalAggOffset); + Ac->diag->vals[diagCnt] = PTAP[n].val; + + //record the diagonal + dlong row = (dlong) (PTAP[n].row - globalAggOffset); + if (row==Ac->diag->cols[diagCnt]) + Ac->diagA[row] = Ac->diag->vals[diagCnt]; + + diagCnt++; + } else { + Ac->offd->cols[offdCnt] = colIds[offdCnt]; + Ac->offd->vals[offdCnt] = PTAP[n].val; + offdCnt++; + } + } + + //compute the inverse diagonal + for (dlong n=0;nNrows;n++) Ac->diagInv[n] = 1.0/Ac->diagA[n]; + + //propagate nullspace flag + Ac->nullSpace = A->nullSpace; + Ac->nullSpacePenalty = A->nullSpacePenalty; + + //clean up + MPI_Barrier(A->comm); + MPI_Type_free(&MPI_NONZERO_T); + free(colIds); + free(PTAP); + + return Ac; +} + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/agmgSetup/strongGraph.cpp b/libs/parAlmond/src/agmgSetup/strongGraph.cpp new file mode 100644 index 000000000..89766b67f --- /dev/null +++ b/libs/parAlmond/src/agmgSetup/strongGraph.cpp @@ -0,0 +1,147 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +parCSR* strongGraph(parCSR *A){ + + const dlong N = A->Nrows; + const dlong M = A->Ncols; + + parCSR *C = new parCSR(N, M); + + C->diag->rowStarts = (dlong *) calloc(N+1,sizeof(dlong)); + C->offd->rowStarts = (dlong *) calloc(N+1,sizeof(dlong)); + + dfloat *maxOD; + if (N) maxOD = (dfloat *) calloc(N,sizeof(dfloat)); + + dfloat *diagA = A->diagA; + + // #pragma omp parallel for + for(dlong i=0; i= 0) ? 1:-1; + const dfloat Aii = fabs(diagA[i]); + + //find maxOD + //local entries + dlong Jstart = A->diag->rowStarts[i]; + dlong Jend = A->diag->rowStarts[i+1]; + for(dlong jj= Jstart; jjdiag->cols[jj]; + if (col==i) continue; + dfloat Ajj = fabs(diagA[col]); + dfloat OD = -sign*A->diag->vals[jj]/(sqrt(Aii)*sqrt(Ajj)); + if(OD > maxOD[i]) maxOD[i] = OD; + } + //non-local entries + Jstart = A->offd->rowStarts[i], + Jend = A->offd->rowStarts[i+1]; + for(dlong jj= Jstart; jjoffd->cols[jj]; + dfloat Ajj = fabs(diagA[col]); + dfloat OD = -sign*A->offd->vals[jj]/(sqrt(Aii)*sqrt(Ajj)); + if(OD > maxOD[i]) maxOD[i] = OD; + } + + int diag_strong_per_row = 1; // diagonal entry + //local entries + Jstart = A->diag->rowStarts[i], + Jend = A->diag->rowStarts[i+1]; + for(dlong jj = Jstart; jjdiag->cols[jj]; + if (col==i) continue; + dfloat Ajj = fabs(diagA[col]); + dfloat OD = -sign*A->diag->vals[jj]/(sqrt(Aii)*sqrt(Ajj)); + if(OD > COARSENTHREASHOLD*maxOD[i]) diag_strong_per_row++; + } + int offd_strong_per_row = 0; + //non-local entries + Jstart = A->offd->rowStarts[i], Jend = A->offd->rowStarts[i+1]; + for(dlong jj= Jstart; jjoffd->cols[jj]; + dfloat Ajj = fabs(diagA[col]); + dfloat OD = -sign*A->offd->vals[jj]/(sqrt(Aii)*sqrt(Ajj)); + if(OD > COARSENTHREASHOLD*maxOD[i]) offd_strong_per_row++; + } + + C->diag->rowStarts[i+1] = diag_strong_per_row; + C->offd->rowStarts[i+1] = offd_strong_per_row; + } + + // cumulative sum + for(dlong i=1; idiag->rowStarts[i] += C->diag->rowStarts[i-1]; + C->offd->rowStarts[i] += C->offd->rowStarts[i-1]; + } + C->diag->nnz = C->diag->rowStarts[N]; + C->offd->nnz = C->offd->rowStarts[N]; + + C->diag->cols = (dlong *) malloc(C->diag->nnz*sizeof(dlong)); + C->offd->cols = (dlong *) malloc(C->offd->nnz*sizeof(dlong)); + // C->diag->vals = (dfloat *) malloc(0); + // C->offd->vals = (dfloat *) malloc(0); + + // fill in the columns for strong connections + // #pragma omp parallel for + for(dlong i=0; i= 0) ? 1:-1; + const dfloat Aii = fabs(diagA[i]); + + dlong diagCounter = C->diag->rowStarts[i]; + dlong offdCounter = C->offd->rowStarts[i]; + + //local entries + dlong Jstart = A->diag->rowStarts[i]; + dlong Jend = A->diag->rowStarts[i+1]; + for(dlong jj = Jstart; jjdiag->cols[jj]; + if (col==i) { + C->diag->cols[diagCounter++] = col;// diag entry + continue; + } + dfloat Ajj = fabs(diagA[col]); + dfloat OD = -sign*A->diag->vals[jj]/(sqrt(Aii)*sqrt(Ajj)); + if(OD > COARSENTHREASHOLD*maxOD[i]) + C->diag->cols[diagCounter++] = col; + } + Jstart = A->offd->rowStarts[i], Jend = A->offd->rowStarts[i+1]; + for(dlong jj = Jstart; jjoffd->cols[jj]; + dfloat Ajj = fabs(diagA[col]); + dfloat OD = -sign*A->offd->vals[jj]/(sqrt(Aii)*sqrt(Ajj)); + if(OD > COARSENTHREASHOLD*maxOD[i]) + C->offd->cols[offdCounter++] = col; + } + } + if(N) free(maxOD); + + return C; +} + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/agmgSetup/transpose.cpp b/libs/parAlmond/src/agmgSetup/transpose.cpp new file mode 100644 index 000000000..cc1801808 --- /dev/null +++ b/libs/parAlmond/src/agmgSetup/transpose.cpp @@ -0,0 +1,188 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +parCSR *transpose(parCSR *A){ + + // MPI info + int rank, size; + MPI_Comm_rank(A->comm, &rank); + MPI_Comm_size(A->comm, &size); + + hlong *globalRowStarts = A->globalRowStarts; + hlong *globalColStarts = A->globalColStarts; + + dlong Nrows = (dlong) (globalColStarts[rank+1]-globalColStarts[rank]); + dlong Ncols = (dlong) (globalRowStarts[rank+1]-globalRowStarts[rank]); + + parCSR *At = new parCSR(Nrows, Ncols, A->comm, A->device); + + At->globalRowStarts = globalColStarts; + At->globalColStarts = globalRowStarts; + + At->diag = new CSR(At->Nrows, At->Ncols); + At->offd = new CSR(At->Nrows, At->Ncols); + + At->diag->nnz = A->diag->nnz; //local entries remain local + At->diag->rowStarts = (dlong *) calloc(At->Nrows+1, sizeof(dlong)); + + //start with local entries + At->diag->cols = (dlong *) calloc(At->diag->nnz, sizeof(dlong)); + At->diag->vals = (dfloat *) calloc(At->diag->nnz, sizeof(dfloat)); + + // count the num of nonzeros per row for transpose + for(dlong i=0; idiag->nnz; i++){ + dlong row = A->diag->cols[i]; + At->diag->rowStarts[row+1]++; + } + + // cumulative sum for rows + for(dlong i=1; i<=At->Nrows; i++) + At->diag->rowStarts[i] += At->diag->rowStarts[i-1]; + + int *counter = (int *) calloc(At->Nrows+1,sizeof(int)); + for (dlong i=0; iNrows+1; i++) + counter[i] = At->diag->rowStarts[i]; + + for(dlong i=0; iNrows; i++){ + const dlong Jstart = A->diag->rowStarts[i]; + const dlong Jend = A->diag->rowStarts[i+1]; + + for(dlong jj=Jstart; jjdiag->cols[jj]; + At->diag->cols[counter[row]] = i; + At->diag->vals[counter[row]] = A->diag->vals[jj]; + + counter[row]++; + } + } + free(counter); + + + // Make the MPI_NONZERO_T data type + nonzero_t NZ; + MPI_Datatype MPI_NONZERO_T; + MPI_Datatype dtype[3] = {MPI_HLONG, MPI_HLONG, MPI_DFLOAT}; + int blength[3] = {1, 1, 1}; + MPI_Aint addr[3], displ[3]; + MPI_Get_address ( &(NZ.row), addr+0); + MPI_Get_address ( &(NZ.col), addr+1); + MPI_Get_address ( &(NZ.val), addr+2); + displ[0] = 0; + displ[1] = addr[1] - addr[0]; + displ[2] = addr[2] - addr[0]; + MPI_Type_create_struct (3, blength, displ, dtype, &MPI_NONZERO_T); + MPI_Type_commit (&MPI_NONZERO_T); + + nonzero_t *sendNonZeros = (nonzero_t *) calloc(A->offd->nnz, sizeof(nonzero_t)); + + // copy data from nonlocal entries into send buffer + for(dlong i=0;iNrows;++i){ + for (dlong j=A->offd->rowStarts[i];joffd->rowStarts[i+1];j++) { + hlong col = A->colMap[A->offd->cols[j]]; //global ids + sendNonZeros[j].row = col; + sendNonZeros[j].col = i + globalRowStarts[rank]; //global ids + sendNonZeros[j].val = A->offd->vals[j]; + } + } + + //sort by destination row + qsort(sendNonZeros, A->offd->nnz, sizeof(nonzero_t), compareNonZeroByRow); + + //count number of non-zeros we're sending + int *sendCounts = (int*) calloc(size, sizeof(int)); + int *recvCounts = (int*) calloc(size, sizeof(int)); + int *sendOffsets = (int*) calloc(size+1, sizeof(int)); + int *recvOffsets = (int*) calloc(size+1, sizeof(int)); + + int r=0; + for (dlong n=0;noffd->nnz;n++) { + dlong row = sendNonZeros[n].row; + while(row>=globalColStarts[r+1]) r++; + sendCounts[r]++; + } + + MPI_Alltoall(sendCounts, 1, MPI_INT, + recvCounts, 1, MPI_INT, A->comm); + + for (r=0;roffd->nnz = recvOffsets[size]; //total nonzeros + + nonzero_t *recvNonZeros = (nonzero_t *) calloc(At->offd->nnz, sizeof(nonzero_t)); + + MPI_Alltoallv(sendNonZeros, sendCounts, sendOffsets, MPI_NONZERO_T, + recvNonZeros, recvCounts, recvOffsets, MPI_NONZERO_T, + A->comm); + + //clean up + MPI_Barrier(A->comm); + free(sendNonZeros); + free(sendCounts); + free(recvCounts); + free(sendOffsets); + free(recvOffsets); + + //sort by row + qsort(recvNonZeros, At->offd->nnz, sizeof(nonzero_t), compareNonZeroByRow); + + hlong globalRowOffset = At->globalRowStarts[rank]; + + hlong *colIds = (hlong *) malloc(At->offd->nnz*sizeof(hlong)); + dlong cnt=0; + for (dlong n=0;noffd->nnz;n++) { + colIds[n] = recvNonZeros[n].col; + } + At->haloSetup(colIds); + + //fill the CSR matrix + At->offd->rowStarts = (dlong *) calloc(At->Nrows+1, sizeof(dlong)); + At->offd->cols = (dlong *) calloc(At->offd->nnz, sizeof(dlong)); + At->offd->vals = (dfloat *) calloc(At->offd->nnz, sizeof(dfloat)); + for (dlong n=0;noffd->nnz;n++) { + dlong row = (dlong) (recvNonZeros[n].row - globalRowOffset); + At->offd->rowStarts[row+1]++; + At->offd->cols[n] = colIds[n]; + At->offd->vals[n] = recvNonZeros[n].val; + } + + // cumulative sum for rows + for(dlong i=1; i<=At->Nrows; i++) + At->offd->rowStarts[i] += At->offd->rowStarts[i-1]; + + MPI_Barrier(A->comm); + free(recvNonZeros); + free(colIds); + + return At; +} + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/agmgSmoother.cpp b/libs/parAlmond/src/agmgSmoother.cpp new file mode 100644 index 000000000..4f1e3df62 --- /dev/null +++ b/libs/parAlmond/src/agmgSmoother.cpp @@ -0,0 +1,200 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +void agmgLevel::smoothJacobi(dfloat *r, dfloat *x, + const bool x_is_zero) { + + // x = x + inv(D)*(b-A*x) + if(x_is_zero){ + vectorDotStar(Nrows,1.0,A->diagInv,r,0.0,x); + return; + } + + static dfloat *res = (dfloat *) scratch; + + A->SpMV(-1.0, x, 1.0, r, res); + vectorDotStar(Nrows, 1.0, A->diagInv, res, 1.0, x); +} + + +void agmgLevel::smoothDampedJacobi(dfloat *r, dfloat *x, + const bool x_is_zero) { + + // x = x + alpha*inv(D)*(b-A*x) + if(x_is_zero){ + vectorDotStar(Nrows,lambda,A->diagInv,r,0.0,x); + return; + } + + static dfloat *res = (dfloat *) scratch; + + A->SpMV(-1.0, x, 1.0, r, res); + vectorDotStar(Nrows, lambda, A->diagInv, res, 1.0, x); +} + +void agmgLevel::smoothChebyshev(dfloat *r, dfloat *x, + const bool x_is_zero) { + + const dfloat theta = 0.5*(lambda1+lambda0); + const dfloat delta = 0.5*(lambda1-lambda0); + const dfloat invTheta = 1.0/theta; + const dfloat sigma = theta/delta; + dfloat rho_n = 1./sigma; + dfloat rho_np1; + + static dfloat *res = ((dfloat*) scratch) + 0*Ncols; + static dfloat *Ad = ((dfloat*) scratch) + 1*Ncols; + static dfloat *d = ((dfloat*) scratch) + 2*Ncols; + + if(x_is_zero){ //skip the Ax if x is zero + //res = D^{-1}r + vectorDotStar(Nrows, 1.0, A->diagInv, r, 0.0, res); + vectorSet(Nrows, 0.0, x); + //d = invTheta*res + vectorAdd(Nrows, invTheta, res, 0.0, d); + } else { + //res = D^{-1}(r-Ax) + A->SpMV(-1.0, x, 1.0, r, res); + vectorDotStar(Nrows, A->diagInv, res); + + //d = invTheta*res + vectorAdd(Nrows, invTheta, res, 0.0, d); + } + + for (int k=0;kSpMV(1.0, d, 0.0, Ad); + vectorDotStar(Nrows, -1.0, A->diagInv, Ad, 1.0, res); + + rho_np1 = 1.0/(2.*sigma-rho_n); + + //d_k+1 = rho_k+1*rho_k*d_k + 2*rho_k+1*r_k+1/delta + vectorAdd(Nrows, 2.0*rho_np1/delta, res, rho_np1*rho_n, d); + rho_n = rho_np1; + } + //x_k+1 = x_k + d_k + vectorAdd(Nrows, 1.0, d, 1.0, x); +} + +void agmgLevel::smoothJacobi(occa::memory o_r, occa::memory o_x, + bool x_is_zero) { + + // occaTimerTic(parAlmond->device,"device smoothJacobi"); + if(x_is_zero){ + vectorDotStar(Nrows, 1.0, o_A->o_diagInv, o_r, 0.0, o_x); + // occaTimerToc(parAlmond->device,"device smoothJacobi"); + return; + } + + static occa::memory o_res = o_scratch; + + // res = r-A*x + o_A->SpMV(-1.0, o_x, 1.0, o_r, o_res); + + // x = x + alpha*inv(D)*res + vectorDotStar(Nrows, 1.0, o_A->o_diagInv, o_res, 1.0, o_x); + // occaTimerToc(parAlmond->device,"hyb smoothJacobi"); +} + +void agmgLevel::smoothDampedJacobi(occa::memory o_r, occa::memory o_x, + bool x_is_zero){ + + // occaTimerTic(parAlmond->device,"device smoothDampedJacobi"); + if(x_is_zero){ + vectorDotStar(Nrows, lambda, o_A->o_diagInv, o_r, 0.0, o_x); + // occaTimerToc(parAlmond->device,"device smoothDampedJacobi"); + return; + } + + static occa::memory o_res = o_scratch; + + // res = r-A*x + o_A->SpMV(-1.0, o_x, 1.0, o_r, o_res); + + // x = x + alpha*inv(D)*res + vectorDotStar(Nrows, lambda, o_A->o_diagInv, o_res, 1.0, o_x); + // occaTimerToc(parAlmond->device,"device smoothDampedJacobi"); +} + +void agmgLevel::smoothChebyshev(occa::memory o_r, occa::memory o_x, + bool x_is_zero) { + + const dfloat theta = 0.5*(lambda1+lambda0); + const dfloat delta = 0.5*(lambda1-lambda0); + const dfloat invTheta = 1.0/theta; + const dfloat sigma = theta/delta; + dfloat rho_n = 1./sigma; + dfloat rho_np1; + + static occa::memory o_res = o_scratch + 0*Ncols*sizeof(dfloat); + static occa::memory o_Ad = o_scratch + 1*Ncols*sizeof(dfloat); + static occa::memory o_d = o_scratch + 2*Ncols*sizeof(dfloat); + + // occaTimerTic(parAlmond->device,"device smoothChebyshev"); + + if(x_is_zero){ //skip the Ax if x is zero + //res = D^{-1}r + vectorDotStar(Nrows, 1.0, o_A->o_diagInv, o_r, 0.0, o_res); + vectorSet(Nrows, 0.0, o_x); + //d = invTheta*res + vectorAdd(Nrows, invTheta, o_res, 0.0, o_d); + } else { + //res = D^{-1}(r-Ax) + o_A->SpMV(-1.0, o_x, 1.0, o_r, o_res); + vectorDotStar(Nrows, o_A->o_diagInv, o_res); + + //d = invTheta*res + vectorAdd(Nrows, invTheta, o_res, 0.0, o_d); + } + + for (int k=0;kSpMV(1.0, o_d, 0.0, o_Ad); + vectorDotStar(Nrows, -1.0, o_A->o_diagInv, o_Ad, 1.0, o_res); + + rho_np1 = 1.0/(2.*sigma-rho_n); + + //d_k+1 = rho_k+1*rho_k*d_k + 2*rho_k+1*r_k+1/delta + vectorAdd(Nrows, 2.0*rho_np1/delta, o_res, rho_np1*rho_n, o_d); + rho_n = rho_np1; + } + //x_k+1 = x_k + d_k + vectorAdd(Nrows, 1.0, o_d, 1.0, o_x); + + // occaTimerToc(parAlmond->device,"device smoothChebyshev"); +} + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/coarseSolver.cpp b/libs/parAlmond/src/coarseSolver.cpp new file mode 100644 index 000000000..90630f04f --- /dev/null +++ b/libs/parAlmond/src/coarseSolver.cpp @@ -0,0 +1,389 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +coarseSolver::coarseSolver(setupAide options_) { + gatherLevel = false; + options = options_; +} + +int coarseSolver::getTargetSize() { + return 1000; +} + +//set up exact solver using xxt +void coarseSolver::setup(parCSR *A) { + + comm = A->comm; + + int rank, size; + MPI_Comm_rank(comm,&rank); + MPI_Comm_size(comm,&size); + + //copy the global coarse partition as ints + coarseOffsets = (int* ) calloc(size+1,sizeof(int)); + for (int r=0;rglobalRowStarts[r]; + + coarseTotal = coarseOffsets[size]; + coarseOffset = coarseOffsets[rank]; + + N = (int) A->Nrows; + + int sendNNZ = (int) (A->diag->nnz+A->offd->nnz); + int *rows; + int *cols; + dfloat *vals; + + // if((rank==0)&&(options.compareArgs("VERBOSE","TRUE"))) + // printf("Setting up coarse solver...");fflush(stdout); + + // Make the MPI_NONZERO_T data type + nonzero_t NZ; + MPI_Datatype MPI_NONZERO_T; + MPI_Datatype dtype[3] = {MPI_HLONG, MPI_HLONG, MPI_DFLOAT}; + int blength[3] = {1, 1, 1}; + MPI_Aint addr[3], displ[3]; + MPI_Get_address ( &(NZ.row), addr+0); + MPI_Get_address ( &(NZ.col), addr+1); + MPI_Get_address ( &(NZ.val), addr+2); + displ[0] = 0; + displ[1] = addr[1] - addr[0]; + displ[2] = addr[2] - addr[0]; + MPI_Type_create_struct (3, blength, displ, dtype, &MPI_NONZERO_T); + MPI_Type_commit (&MPI_NONZERO_T); + + nonzero_t *sendNonZeros = (nonzero_t *) calloc(sendNNZ, sizeof(nonzero_t)); + + //populate matrix + int cnt = 0; + for (int n=0;ndiag->rowStarts[n]; + int end = (int) A->diag->rowStarts[n+1]; + for (int m=start;mdiag->cols[m] + coarseOffset; + sendNonZeros[cnt].val = A->diag->vals[m]; + cnt++; + } + start = (int) A->offd->rowStarts[n]; + end = (int) A->offd->rowStarts[n+1]; + for (dlong m=start;mcolMap[A->offd->cols[m]]; + sendNonZeros[cnt].val = A->offd->vals[m]; + cnt++; + } + } + + //get the nonzero counts from all ranks + int *recvNNZ = (int*) calloc(size,sizeof(int)); + int *NNZoffsets = (int*) calloc(size+1,sizeof(int)); + MPI_Allgather(&sendNNZ, 1, MPI_INT, + recvNNZ, 1, MPI_INT, comm); + + int totalNNZ = 0; + for (int r=0;rnull, N, MPI_DFLOAT, + nullTotal, coarseCounts, coarseOffsets, MPI_DFLOAT, + comm); + + //clean up + MPI_Barrier(comm); + MPI_Type_free(&MPI_NONZERO_T); + free(sendNonZeros); + free(NNZoffsets); + free(recvNNZ); + + + //assemble the full matrix + dfloat *coarseA = (dfloat *) calloc(coarseTotal*coarseTotal,sizeof(dfloat)); + for (int i=0;inullSpace) { //A is dense due to nullspace augmentation + for (int n=0;nnullSpacePenalty*nullTotal[n]*nullTotal[m]; + } + } + } + + free(recvNonZeros); + free(nullTotal); + + matrixInverse(coarseTotal, coarseA); + + //store only the local rows of the full inverse + invCoarseA = (dfloat *) calloc(N*coarseTotal,sizeof(dfloat)); + for (int n=0;nglobalRowStarts; + int coarseTotal = coarseOffsets[size]; + int coarseOffset = coarseOffsets[rank]; + + int *globalNumbering = (int *) calloc(coarseTotal,sizeof(int)); + for (int n=0;nA; + int N = level->Nrows; + + int totalNNZ; + int *rows; + int *cols; + dfloat *vals; + + if(!nullSpace) { + //if no nullspace, use sparse A + totalNNZ = A->diagNNZ+A->offdNNZ; + if (totalNNZ) { + rows = (int *) calloc(totalNNZ,sizeof(int)); + cols = (int *) calloc(totalNNZ,sizeof(int)); + vals = (dfloat *) calloc(totalNNZ,sizeof(dfloat)); + } + + //populate matrix + int cnt = 0; + for (int n=0;ndiagRowStarts[n];mdiagRowStarts[n+1];m++) { + rows[cnt] = n + coarseOffset; + cols[cnt] = A->diagCols[m] + coarseOffset; + vals[cnt] = A->diagCoefs[m]; + cnt++; + } + for (int m=A->offdRowStarts[n];moffdRowStarts[n+1];m++) { + rows[cnt] = n + coarseOffset; + cols[cnt] = A->colMap[A->offdCols[m]]; + vals[cnt] = A->offdCoefs[m]; + cnt++; + } + } + } else { + totalNNZ = A->Nrows*coarseTotal; //A is dense due to nullspace augmentation + if (totalNNZ) { + rows = (int *) calloc(totalNNZ,sizeof(int)); + cols = (int *) calloc(totalNNZ,sizeof(int)); + vals = (dfloat *) calloc(totalNNZ,sizeof(dfloat)); + } + + //gather null vector + dfloat *nullTotal = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); + int *nullCounts = (int*) calloc(size,sizeof(int)); + for (int r=0;rnull, A->Nrows, MPI_DFLOAT, nullTotal, nullCounts, coarseOffsets, MPI_DFLOAT, agmg::comm); + + //populate matrix + for (int n=0;ndiagRowStarts[n];mdiagRowStarts[n+1];m++) { + int col = A->diagCols[m] + coarseOffset; + vals[n*coarseTotal+col] += A->diagCoefs[m]; + } + for (int m=A->offdRowStarts[n];moffdRowStarts[n+1];m++) { + int col = A->colMap[A->offdCols[m]]; + vals[n*coarseTotal+col] += A->offdCoefs[m]; + } + } + } + + parAlmond->ExactSolve = xxtSetup(A->Nrows, + globalNumbering, + totalNNZ, + rows, + cols, + vals, + 0, + "int", + dfloatString); + + parAlmond->coarseTotal = coarseTotal; + parAlmond->coarseOffset = coarseOffset; + + parAlmond->xCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); + parAlmond->rhsCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); + + free(globalNumbering); + if (totalNNZ) { + free(rows); + free(cols); + free(vals); + } + + printf("Done UberCoarse setup\n"); +} + + +void exactCoarseSolve(parAlmond_t *parAlmond, int N, dfloat *rhs, dfloat *x) { + + //use coarse solver + for (int n=0;ncoarseTotal;n++) + parAlmond->rhsCoarse[n] =0.; + + for (int n=0;nrhsCoarse[n+parAlmond->coarseOffset] = rhs[n]; + + xxtSolve(parAlmond->xCoarse, parAlmond->ExactSolve, parAlmond->rhsCoarse); + + for (int n=0;nxCoarse[n+parAlmond->coarseOffset]; + +} + +void device_exactCoarseSolve(parAlmond_t *parAlmond, int N, occa::memory o_rhs, occa::memory o_x) { + + //use coarse solver + for (int n=0;ncoarseTotal;n++) + parAlmond->rhsCoarse[n] =0.; + + o_rhs.copyTo(parAlmond->rhsCoarse+parAlmond->coarseOffset); + xxtSolve(parAlmond->xCoarse, parAlmond->ExactSolve, parAlmond->rhsCoarse); + o_x.copyFrom(parAlmond->xCoarse+parAlmond->coarseOffset,N*sizeof(dfloat)); +} +#endif + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/kernels.cpp b/libs/parAlmond/src/kernels.cpp new file mode 100644 index 000000000..9beff4710 --- /dev/null +++ b/libs/parAlmond/src/kernels.cpp @@ -0,0 +1,166 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +int Nrefs = 0; + +occa::kernel haloExtractKernel; + +occa::kernel SpMVcsrKernel1; +occa::kernel SpMVcsrKernel2; +occa::kernel SpMVellKernel1; +occa::kernel SpMVellKernel2; +occa::kernel SpMVmcsrKernel1; +occa::kernel SpMVmcsrKernel2; + +occa::kernel vectorSetKernel; +occa::kernel vectorScaleKernel; +occa::kernel vectorAddScalarKernel; +occa::kernel vectorAddKernel1; +occa::kernel vectorAddKernel2; +occa::kernel vectorDotStarKernel1; +occa::kernel vectorDotStarKernel2; +occa::kernel vectorInnerProdKernel; +occa::kernel kcycleCombinedOp1Kernel; +occa::kernel kcycleCombinedOp2Kernel; +occa::kernel kcycleWeightedCombinedOp1Kernel; +occa::kernel kcycleWeightedCombinedOp2Kernel; +occa::kernel vectorAddInnerProdKernel; +occa::kernel vectorAddWeightedInnerProdKernel; + +void buildParAlmondKernels(MPI_Comm comm, occa::device device){ + + int rank, size; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + + double seed = (double) rank; + srand48(seed); + + occa::properties kernelInfo; + kernelInfo["defines"].asObject(); + kernelInfo["includes"].asArray(); + kernelInfo["header"].asArray(); + kernelInfo["flags"].asObject(); + + if(sizeof(dlong)==4){ + kernelInfo["defines/" "dlong"]="int"; + } + if(sizeof(dlong)==8){ + kernelInfo["defines/" "dlong"]="long long int"; + } + + if(sizeof(dfloat) == sizeof(double)){ + kernelInfo["defines/" "dfloat"]= "double"; + kernelInfo["defines/" "dfloat4"]= "double4"; + } + else if(sizeof(dfloat) == sizeof(float)){ + kernelInfo["defines/" "dfloat"]= "float"; + kernelInfo["defines/" "dfloat4"]= "float4"; + } + + kernelInfo["defines/" "p_BLOCKSIZE"]= BLOCKSIZE; + + if(device.mode()=="OpenCL"){ + //kernelInfo["compiler_flags"] += "-cl-opt-disable"; + } + + if(device.mode()=="CUDA"){ // add backend compiler optimization for CUDA + kernelInfo["compiler_flags"] += "--ftz=true"; + kernelInfo["compiler_flags"] += "--prec-div=false"; + kernelInfo["compiler_flags"] += "--prec-sqrt=false"; + kernelInfo["compiler_flags"] += "--use_fast_math"; + kernelInfo["compiler_flags"] += "--fmad=true"; // compiler option for cuda + } + + if (rank==0) printf("Compiling parALMOND Kernels...");fflush(stdout); + + for (int r=0;rAx(ck,vk); + + dfloat rho[3]; + + if(ktype == PCG) + kcycleCombinedOp1(Nrows, rho, ck, rhs, vk, weight, weighted, comm); + + if(ktype == GMRES) + kcycleCombinedOp1(Nrows, rho, vk, rhs, vk, weight, weighted, comm); + + *alpha1 = rho[0]; + *rho1 = rho[1]; + *norm_rhs = sqrt(rho[2]); + + const dfloat a = -(*alpha1)/(*rho1); + + // rhs = rhs - (alpha1/rho1)*vk + *norm_rhstilde = sqrt(vectorAddInnerProd(Nrows, a, vk, 1.0, rhs, o_weight, weighted,comm)); +} + +void multigridLevel::kcycleOp2(const dfloat alpha1, const dfloat rho1) { + + // w = A*x + this->Ax(x,wk); + + dfloat rho[3]; + + if(ktype == PCG) + kcycleCombinedOp2(Nrows,rho, x, vk, wk, rhs, weight, weighted, comm); + + if(ktype == GMRES) + kcycleCombinedOp2(Nrows,rho, wk, vk, wk, rhs, weight, weighted, comm); + + const dfloat gamma = rho[0]; + const dfloat beta = rho[1]; + const dfloat alpha2 = rho[2]; + + if(fabs(rho1) > (dfloat) 1e-20){ + + const dfloat rho2 = beta - gamma*gamma/rho1; + + if(fabs(rho2) > (dfloat) 1e-20){ + // x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*dk + const dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2); + const dfloat b = alpha2/rho2; + + vectorAdd(Nrows, a, ck, b, x); + } + } +} + +void multigridLevel::device_kcycleOp1(dfloat *alpha1, dfloat *rho1, + dfloat *norm_rhs, dfloat *norm_rhstilde) { + + //ck = x + o_ck.copyFrom(o_x, Nrows*sizeof(dfloat)); + + // vk = A*ck + this->Ax(o_ck,o_vk); + + dfloat rho[3]; + + if(ktype == PCG) + kcycleCombinedOp1(Nrows, rho, o_ck, o_rhs, o_vk, o_weight, weighted, comm); + + if(ktype == GMRES) + kcycleCombinedOp1(Nrows, rho, o_vk, o_rhs, o_vk, o_weight, weighted, comm); + + *alpha1 = rho[0]; + *rho1 = rho[1]; + *norm_rhs = sqrt(rho[2]); + + const dfloat a = -(*alpha1)/(*rho1); + + // rhs = rhs - (alpha1/rho1)*vk + *norm_rhstilde = sqrt(vectorAddInnerProd(Nrows, a, o_vk, 1.0, o_rhs, o_weight, weighted,comm)); +} + +void multigridLevel::device_kcycleOp2(const dfloat alpha1, const dfloat rho1) { + + // w = A*x + this->Ax(o_x,o_wk); + + dfloat rho[3]; + + if(ktype == PCG) + kcycleCombinedOp2(Nrows,rho, o_x, o_vk, o_wk, o_rhs, o_weight, weighted, comm); + + if(ktype == GMRES) + kcycleCombinedOp2(Nrows,rho, o_wk, o_vk, o_wk, o_rhs, o_weight, weighted, comm); + + const dfloat gamma = rho[0]; + const dfloat beta = rho[1]; + const dfloat alpha2 = rho[2]; + + if(fabs(rho1) > (dfloat) 1e-20){ + + const dfloat rho2 = beta - gamma*gamma/rho1; + + if(fabs(rho2) > (dfloat) 1e-20){ + // x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*dk + const dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2); + const dfloat b = alpha2/rho2; + + vectorAdd(Nrows, a, o_ck, b, o_x); + } + } +} + +} \ No newline at end of file diff --git a/libs/parAlmond/src/matrix.cpp b/libs/parAlmond/src/matrix.cpp new file mode 100644 index 000000000..0fde8d16c --- /dev/null +++ b/libs/parAlmond/src/matrix.cpp @@ -0,0 +1,739 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +matrix_t::matrix_t(dlong N, dlong M): Nrows(N), Ncols(M) {} + +//------------------------------------------------------------------------ +// +// CSR matrix +// +//------------------------------------------------------------------------ + +CSR::CSR(dlong N, dlong M): matrix_t(N,M) {} + +CSR::~CSR() { + free(rowStarts); + free(cols); + free(vals); + + if (o_rowStarts.size()) o_rowStarts.free(); + if (o_cols.size()) o_cols.free(); + if (o_vals.size()) o_vals.free(); +} + +//------------------------------------------------------------------------ +// +// ELL matrix +// +//------------------------------------------------------------------------ + +ELL::ELL(dlong N, dlong M): matrix_t(N,M) {} + + +ELL::~ELL() { + free(cols); + free(vals); + + if (o_cols.size()) o_cols.free(); + if (o_vals.size()) o_vals.free(); +} + +void ELL::syncToDevice(occa::device device) { + + dlong *colsT = (dlong *) malloc(Nrows*nnzPerRow*sizeof(dlong)); + dfloat *valsT = (dfloat *) malloc(Nrows*nnzPerRow*sizeof(dfloat)); + for (dlong n=0;nrowStarts = (dlong *) calloc(Nrows+1, sizeof(dlong)); + offd->rowStarts = (dlong *) calloc(Nrows+1, sizeof(dlong)); + + //count the entries in each row + for (dlong n=0;nglobalOffset+Nrows-1)) + offd->rowStarts[row+1]++; + else + diag->rowStarts[row+1]++; + } + + // cumulative sum + for(dlong i=0; irowStarts[i+1] += diag->rowStarts[i]; + offd->rowStarts[i+1] += offd->rowStarts[i]; + } + diag->nnz = diag->rowStarts[Nrows]; + offd->nnz = offd->rowStarts[Nrows]; + + // Halo setup + hlong *colIds = (hlong *) malloc(offd->nnz*sizeof(hlong)); + dlong cnt=0; + for (dlong n=0;nglobalOffset+N-1)) + colIds[cnt++] = Aj[n]; + } + this->haloSetup(colIds); + + //fill the CSR matrices + diagA = (dfloat *) calloc(Ncols, sizeof(dfloat)); + diagInv = (dfloat *) calloc(Ncols, sizeof(dfloat)); + diag->cols = (dlong *) calloc(diag->nnz, sizeof(dlong)); + offd->cols = (dlong *) calloc(offd->nnz, sizeof(dlong)); + diag->vals = (dfloat *) calloc(diag->nnz, sizeof(dfloat)); + offd->vals = (dfloat *) calloc(offd->nnz, sizeof(dfloat)); + dlong diagCnt = 0; + dlong offdCnt = 0; + for (dlong n=0;nglobalOffset+Nrows-1)) { + offd->cols[offdCnt] = colIds[offdCnt]; + offd->vals[offdCnt] = Avals[n]; + offdCnt++; + } else { + diag->cols[diagCnt] = (dlong) (Aj[n] - globalOffset); + diag->vals[diagCnt] = Avals[n]; + + //record the diagonal + dlong row = (dlong) (Ai[n] - globalOffset); + if (row==diag->cols[diagCnt]) + diagA[row] = diag->vals[diagCnt]; + + diagCnt++; + } + } + + //fill the halo region + ogsGatherScatter(diagA, ogsDfloat, ogsAdd, ogs); + + //compute the inverse diagonal + for (dlong n=0;nnnz*sizeof(parallelId_t)); + + for (dlong n=0;nnnz;n++) { + parIds[n].localId = n; + parIds[n].globalId = colIds[n]; + } + + //sort by global index + qsort(parIds, offd->nnz, sizeof(parallelId_t), CompareGlobalId); + + //count unique nonlocal column ids + dlong Noffdcols = 0; //number of unique columns + if(offd->nnz) parIds[0].newId = Noffdcols; + for (dlong n=1;nnnz;n++) { + if (parIds[n].globalId != parIds[n-1].globalId) + Noffdcols++; + + parIds[n].newId = Noffdcols; + } + if(offd->nnz) Noffdcols++; + + //record the global ids of the unique columns + hlong *offdcols = (hlong *) malloc(Noffdcols*sizeof(hlong)); + Noffdcols = 0; + if(offd->nnz) offdcols[Noffdcols++] = parIds[0].globalId; + for (dlong n=1;nnnz;n++) + if (parIds[n].globalId != parIds[n-1].globalId) + offdcols[Noffdcols++] = parIds[n].globalId; + + //sort back to local order + qsort(parIds, offd->nnz, sizeof(parallelId_t), CompareLocalId); + + // be careful to make sure Ncols is set at this point + NlocalCols = Ncols; + Ncols += Noffdcols; + + //make an array of all the column ids required on this rank (local first) + colMap = (hlong*) malloc(Ncols*sizeof(hlong)); + for (dlong n=0; n does not participate in sum + } + + //construct the parCSR ogs object for comms + ogsHalo = ogsSetup(Nhalo, ghaloIds, comm, verbose, device); + + MPI_Barrier(comm); + free(ghaloIds); + free(offdcols); + free(minRank); + free(maxRank); + + //update column numbering + for (dlong n=0;nnnz;n++) + colIds[n] = NlocalCols + parIds[n].newId; + + size_t requiredBytes = Nhalo*sizeof(dfloat); + allocatePinnedScratchSpace(requiredBytes, device); + + free(parIds); +} + +void parCSR::haloExchangeStart(dfloat *x) { + // copy data from outgoing elements into temporary send buffer + for(int i=0;i Ntotal) k = (int) Ntotal; + + // do an arnoldi + + // allocate memory for Hessenberg matrix + double *H = (double *) calloc(k*k,sizeof(double)); + + // allocate memory for basis + dfloat **V = (dfloat **) calloc(k+1, sizeof(dfloat *)); + dfloat *Vx = (dfloat *) calloc(Ncols, sizeof(dfloat)); + + for(int i=0; i<=k; i++) + V[i] = (dfloat *) calloc(Nrows, sizeof(dfloat)); + + // generate a random vector for initial basis vector + vectorRandomize(Nrows, Vx); + + dfloat norm_vo = vectorNorm(Nrows,Vx, comm); + vectorScale(Nrows, 1.0/norm_vo, Vx); + + memcpy(V[0], Vx, Nrows*sizeof(dfloat)); + + for(int j=0; jSpMV(1.0, Vx, 0., V[j+1]); + vectorDotStar(Nrows, diagInv, V[j+1]); + + // modified Gram-Schmidth + for(int i=0; i<=j; i++){ + // H(i,j) = v[i]'*A*v[j] + dfloat hij = vectorInnerProd(Nrows, V[i], V[j+1],comm); + + // v[j+1] = v[j+1] - hij*v[i] + vectorAdd(Nrows,-hij, V[i], 1.0, V[j+1]); + + H[i + j*k] = (double) hij; + } + + if(j+1 < k){ + + dfloat norm_vj = vectorNorm(Nrows,V[j+1],comm); + + H[j+1+ j*k] = (double) norm_vj; + + vectorScale(Nrows, 1./H[j+1 + j*k], V[j+1]); + } + } + + double *WR = (double *) calloc(k,sizeof(double)); + double *WI = (double *) calloc(k,sizeof(double)); + + eig(k, H, WR, WI); + + double rho = 0.; + + for(int i=0; iNrows, A->Ncols) { + + int *rowCounters = (int*) calloc(A->Nrows, sizeof(int)); + + int maxNnzPerRow = 0; + int minNnzPerRow = 0; + if (A->Nrows) + minNnzPerRow = (int) A->diag->rowStarts[1] - A->diag->rowStarts[0]; + + for(dlong i=0; iNrows; i++) { + int rowNnz = (int) A->diag->rowStarts[i+1] - A->diag->rowStarts[i]; + rowCounters[i] = rowNnz; + + maxNnzPerRow = (rowNnz > maxNnzPerRow) ? rowNnz : maxNnzPerRow; + minNnzPerRow = (rowNnz < minNnzPerRow) ? rowNnz : minNnzPerRow; + } + + // This chooses the nnzPerRow by binning. Just pack all the local nonzeros in ELL + /* + // create bins + int numBins = maxNnzPerRow - minNnzPerRow + 1; + + //zero row check + if (numBins<0) numBins =0; + + int *bins; + if (numBins) + bins = (int *) calloc(numBins, sizeof(int)); + + for(dlong i=0; iNrows; i++) + bins[rowCounters[i]-minNnzPerRow]++; + + dfloat threshold = 2.0/3.0; + dlong totalNNZ = csrA->diagNNZ+csrA->offdNNZ; + int nnzPerRow = 0; + dlong nnz = 0; + + //increase the nnz per row in E until it holds threshold*totalnnz nonzeros + for(int i=0; i threshold*totalNNZ)||(i==numBins-1)){ + nnzPerRow = i+minNnzPerRow; + break; + } + } + */ + if(Nrows) { + free(rowCounters); + // free(bins); + } + + int nnzPerRow = maxNnzPerRow; + + //build the ELL matrix from the local CSR + E = new ELL(Nrows, Ncols); + C = new MCSR(Nrows, Ncols); + + E->nnzPerRow = nnzPerRow; + + E->cols = (dlong *) calloc(Nrows*E->nnzPerRow, sizeof(dlong)); + E->vals = (dfloat *) calloc(Nrows*E->nnzPerRow, sizeof(dfloat)); + + C->nnz = 0; + C->actualRows = 0; + + for(dlong i=0; idiag->rowStarts[i]; + dlong Jend = A->diag->rowStarts[i+1]; + int rowNnz = (int) (Jend - Jstart); + + // store only min of nnzPerRow and rowNnz + int maxNnz = (nnzPerRow >= rowNnz) ? rowNnz : nnzPerRow; + + for(int c=0; ccols[i*nnzPerRow+c] = A->diag->cols[Jstart+c]; + E->vals[i*nnzPerRow+c] = A->diag->vals[Jstart+c]; + } + + for(int c=maxNnz; ccols[i*nnzPerRow+c] = -1; //ignore this column + } + + // count the number of nonzeros to be stored in MCSR format + + //all of offd + int cnt= (int) (A->offd->rowStarts[i+1]-A->offd->rowStarts[i]); + if (rowNnz>nnzPerRow) + cnt += rowNnz-nnzPerRow; //excess of diag + + if (cnt) { + C->nnz += cnt; + C->actualRows++; + } + } + + C->rowStarts = (dlong *) calloc(C->actualRows+1, sizeof(dlong)); + C->rows = (dlong *) calloc(C->actualRows, sizeof(dlong)); + C->cols = (dlong *) calloc(C->nnz, sizeof(dlong)); + C->vals = (dfloat *) calloc(C->nnz, sizeof(dfloat)); + + dlong row = 0; + dlong cnt = 0; + for(dlong i=0; idiag->rowStarts[i]; + dlong Jend = A->diag->rowStarts[i+1]; + int rowNnz = (int) (Jend - Jstart); + int rowCnt =0; + + // store the remaining row in MCSR format + if(rowNnz > nnzPerRow){ + rowCnt += rowNnz-nnzPerRow; + for(int c=nnzPerRow; ccols[cnt] = A->diag->cols[Jstart+c]; + C->vals[cnt] = A->diag->vals[Jstart+c]; + cnt++; + } + } + + //add the offd non-zeros + Jstart = A->offd->rowStarts[i]; + Jend = A->offd->rowStarts[i+1]; + rowCnt += (int) (Jend-Jstart); + for (dlong j=Jstart;jcols[cnt] = A->offd->cols[j]; + C->vals[cnt] = A->offd->vals[j]; + cnt++; + } + + if (rowCnt) { + C->rows[row++] = i; + C->rowStarts[row] = cnt; + } + } + + nullSpace = A->nullSpace; + nullSpacePenalty = A->nullSpacePenalty; + + null = A->null; + o_null = A->o_null; + + diagA = A->diagA; + o_diagA = A->o_diagA; + + diagInv = A->diagInv; + o_diagInv = A->o_diagInv; + + comm = A->comm; + globalRowStarts = A->globalRowStarts; + globalColStarts = A->globalColStarts; + colMap = A->colMap; + + ogs = A->ogs; + ogsHalo = A->ogsHalo; + + Nhalo = A->Nhalo; + Nshared = A->Nshared; + NlocalCols = A->NlocalCols; + + haloIds = A->haloIds; + o_haloIds = A->o_haloIds; + + device = A->device; +} + +parHYB::~parHYB() { + delete E; + delete C; + + free(diagA); + free(diagInv); + + if (o_diagA.size()) o_diagA.free(); + if (o_diagInv.size()) o_diagInv.free(); + + free(null); + if (o_null.size()) o_null.free(); + + free(globalRowStarts); + free(globalColStarts); + + free(colMap); + free(haloIds); + + if (ogs) ogsFree(ogs); + if (ogsHalo) ogsFree(ogsHalo); +}; + +void parHYB::syncToDevice() { + + E->syncToDevice(device); + C->syncToDevice(device); + + if (Nrows) { + o_diagA = device.malloc(Nrows*sizeof(dfloat), diagA); + o_diagInv = device.malloc(Nrows*sizeof(dfloat), diagInv); + + if(nullSpace) + o_null = device.malloc(Nrows*sizeof(dfloat), null); + } + + if (Nshared) + o_haloIds = device.malloc(Nshared*sizeof(dlong), haloIds); +} + +void parHYB::haloExchangeStart(dfloat *x) { + // copy data from outgoing elements into temporary send buffer + for(int i=0;iNrows; + + dfloat* rhs = level->rhs; + dfloat* x = level->x; + dfloat* res = level->res; + + //check for base level + if(k==baseLevel) { + coarseLevel->solve(rhs, x); + return; + } + + multigridLevel *levelC = levels[k+1]; + dlong mCoarse = levelC->Nrows; + dfloat* rhsC = levelC->rhs; + dfloat* xC = levelC->x; + + //apply smoother to x and then return res = rhs-Ax + level->smooth(rhs, x, true); + level->residual(rhs, x, res); + + // rhsC = P^T res + levelC->coarsen(res, rhsC); + + if(k+1>NUMKCYCLES) { + this->vcycle(k+1); + } else{ + // first inner krylov iteration + this->kcycle(k+1); + + // ck = x + // alpha1=ck*rhsC, rho1=ck*Ack, norm_rhs=sqrt(rhsC*rhsC) + // rhsC = rhsC - (alpha1/rho1)*vkp1 + // norm_rtilde = sqrt(rhsC*rhsC) + dfloat rho1, alpha1, norm_rhs, norm_rhstilde; + levelC->kcycleOp1(&alpha1, &rho1, &norm_rhs, &norm_rhstilde); + + if(norm_rhstilde < KCYCLETOL*norm_rhs){ + // xC = (alpha1/rho1)*xC + vectorScale(mCoarse, alpha1/rho1, xC); + } else{ + + // second inner krylov iteration + this->kcycle(k+1); + + // gamma=xC*Ack, beta=xC*AxC, alpha2=xC*rhsC + // rho2=beta - gamma*gamma/rho1 + // xC = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*xC + levelC->kcycleOp2(alpha1, rho1); + } + } + + // x = x + P xC + levelC->prolongate(xC, x); + + level->smooth(rhs, x, false); +} + + +void solver_t::device_kcycle(int k){ + + multigridLevel *level = levels[k]; + + dlong m = level->Nrows; + + occa::memory o_rhs = level->o_rhs; + occa::memory o_x = level->o_x; + occa::memory o_res = level->o_res; + + //check for device<->host handoff + if(m < GPU_CPU_SWITCH_SIZE){ + o_rhs.copyTo(level->rhs, m*sizeof(dfloat)); + this->kcycle(k); + o_x.copyFrom(level->x, m*sizeof(dfloat)); + return; + } + + //check for base level + if(k==baseLevel) { + coarseLevel->solve(o_rhs, o_x); + return; + } + + multigridLevel *levelC = levels[k+1]; + dlong mCoarse = levelC->Nrows; + occa::memory o_rhsC = levelC->o_rhs; + occa::memory o_xC = levelC->o_x; + + //apply smoother to x and then compute res = rhs-Ax + level->smooth(o_rhs, o_x, true); + level->residual(o_rhs, o_x, o_res); + + // rhsC = P^T res + levelC->coarsen(o_res, o_rhsC); + + if(k+1>NUMKCYCLES) { + this->device_vcycle(k+1); + } else{ + // first inner krylov iteration + this->device_kcycle(k+1); + + // alpha1=ck*rhsC, rho1=ck*Ack, norm_rhs=sqrt(rhsC*rhsC) + // rhsC = rhsC - (alpha1/rho1)*vkp1 + // norm_rtilde = sqrt(rhsC*rhsC) + dfloat rho1, alpha1, norm_rhs, norm_rhstilde; + levelC->device_kcycleOp1(&alpha1, &rho1, &norm_rhs, &norm_rhstilde); + + if(norm_rhstilde < KCYCLETOL*norm_rhs){ + // xC = (alpha1/rho1)*xC + vectorScale(mCoarse, alpha1/rho1, o_xC); + } else{ + + // second inner krylov iteration + this->device_kcycle(k+1); + + // gamma=xC*Ack, beta=xC*AxC, alpha2=xC*rhsC + // rho2=beta - gamma*gamma/rho1 + // xC = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*xC + levelC->device_kcycleOp2(alpha1, rho1); + } + } + + // x = x + P xC + levelC->prolongate(o_xC, o_x); + level->smooth(o_rhs, o_x, false); +} + + + +void solver_t::vcycle(int k) { + + multigridLevel *level = levels[k]; + + dlong m = level->Nrows; + + dfloat* rhs = level->rhs; + dfloat* x = level->x; + dfloat* res = level->res; + + //check for base level + if(k==baseLevel) { + coarseLevel->solve(rhs, x); + return; + } + + multigridLevel *levelC = levels[k+1]; + dlong mCoarse = levelC->Nrows; + dfloat* rhsC = levelC->rhs; + dfloat* xC = levelC->x; + + //apply smoother to x and then return res = rhs-Ax + level->smooth(rhs, x, true); + level->residual(rhs, x, res); + + // rhsC = P^T res + levelC->coarsen(res, rhsC); + + this->vcycle(k+1); + + // x = x + P xC + levelC->prolongate(xC, x); + + level->smooth(rhs, x, false); +} + + +void solver_t::device_vcycle(int k){ + + multigridLevel *level = levels[k]; + + dlong m = level->Nrows; + + occa::memory o_rhs = level->o_rhs; + occa::memory o_x = level->o_x; + occa::memory o_res = level->o_res; + + //check for device<->host handoff + if(m < GPU_CPU_SWITCH_SIZE){ + o_rhs.copyTo(level->rhs, m*sizeof(dfloat)); + vcycle(k); + o_x.copyFrom(level->x, m*sizeof(dfloat)); + return; + } + + //check for base level + if(k==baseLevel) { + coarseLevel->solve(o_rhs, o_x); + return; + } + + multigridLevel *levelC = levels[k+1]; + dlong mCoarse = levelC->Nrows; + occa::memory o_rhsC = levelC->o_rhs; + occa::memory o_xC = levelC->o_x; + + //apply smoother to x and then compute res = rhs-Ax + level->smooth(o_rhs, o_x, true); + level->residual(o_rhs, o_x, o_res); + + // rhsC = P^T res + levelC->coarsen(o_res, o_rhsC); + + this->device_vcycle(k+1); + + // x = x + P xC + levelC->prolongate(o_xC, o_x); + + level->smooth(o_rhs, o_x, false); +} + +} //hamespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/parAlmond.cpp b/libs/parAlmond/src/parAlmond.cpp new file mode 100644 index 000000000..7259d0792 --- /dev/null +++ b/libs/parAlmond/src/parAlmond.cpp @@ -0,0 +1,107 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + + +solver_t *Init(occa::device device, MPI_Comm comm, setupAide options) { + solver_t *M = new solver_t(device, comm, options); + + if (Nrefs==0) buildParAlmondKernels(comm, device); + Nrefs++; + + return M; +} + +void AMGSetup(solver_t *MM, + hlong* globalRowStarts, //global partition + dlong nnz, //-- + hlong* Ai, //-- Local A matrix data (globally indexed, COO storage, row sorted) + hlong* Aj, //-- + dfloat* Avals, //-- + bool nullSpace, + dfloat nullSpacePenalty){ + + solver_t *M = (solver_t *) MM; + + int rank, size; + MPI_Comm_rank(M->comm, &rank); + MPI_Comm_size(M->comm, &size); + + hlong TotalRows = globalRowStarts[M->size]; + dlong numLocalRows = (dlong) (globalRowStarts[M->rank+1]-globalRowStarts[M->rank]); + + if(rank==0) printf("Setting up AMG...");fflush(stdout); + + //populate null space vector + dfloat *null = (dfloat *) calloc(numLocalRows, sizeof(dfloat)); + for (dlong i=0;icomm, M->device); + free(null); + + M->AMGSetup(A); + + if(rank==0) printf("done.\n"); +} + +void Precon(solver_t *M, occa::memory o_x, occa::memory o_rhs) { + + M->levels[0]->o_x = o_x; + M->levels[0]->o_rhs = o_rhs; + + if ((M->exact)&&(M->ktype==PCG)){ + M->device_pcg(1000,1e-8); + } else if((M->exact)&&(M->ktype==GMRES)){ + M->device_pgmres(1000,1e-8); + } else if(M->ctype==KCYCLE) { + M->device_kcycle(0); + } else if(M->ctype==VCYCLE) { + M->device_vcycle(0); + } +} + +void Report(solver_t *M) { + M->Report(); +} + +void Free(solver_t* M) { + Nrefs--; + if (Nrefs==0) { + freeParAlmondKernels(); + freeScratchSpace(); + freePinnedScratchSpace(); + } + + delete M; +} + +} //namespace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/pcg.cpp b/libs/parAlmond/src/pcg.cpp new file mode 100644 index 000000000..0bf5e7492 --- /dev/null +++ b/libs/parAlmond/src/pcg.cpp @@ -0,0 +1,240 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +void solver_t::pcg(const int maxIt, const dfloat tol){ + + const dlong m = levels[0]->Nrows; + const dlong n = levels[0]->Ncols; + + ktype = PCG; + + // use parAlmond's buffers + dfloat *r = levels[0]->rhs; + dfloat *z = levels[0]->x; + + // initial residual + dfloat rdotr0 = vectorInnerProd(m, r, r, levels[0]->comm); + + dfloat *x = (dfloat *) calloc(n,sizeof(dfloat)); + dfloat *Ap = (dfloat *) calloc(n,sizeof(dfloat)); + dfloat *p = (dfloat *) calloc(n,sizeof(dfloat)); + + //sanity check + if (rdotr0<=(tol*tol)) { + memcpy(levels[0]->x, x, m*sizeof(dfloat)); + free(x); free(p); free(Ap); + return; + } + + // Precondition, z = M^{-1}*r + if(ctype==KCYCLE) { + this->kcycle(0); + } else if(ctype==VCYCLE) { + this->vcycle(0); + } + memcpy(p, z, m*sizeof(dfloat)); + + dfloat rdotz0 = vectorInnerProd(m, r, z, levels[0]->comm); + + dfloat rdotr1 = 0; + dfloat rdotz1 = 0; + dfloat alpha, beta, pAp; + + int Niter = 0; + while(rdotr0>(tol*tol)){ + // Ap = A*p; + levels[0]->Ax(p, Ap); + + dfloat pAp = vectorInnerProd(m, p, Ap, levels[0]->comm); + + alpha = rdotz0/pAp; + + // update solution + // x = x + alpha * p; + vectorAdd(m, alpha, p, 1.0, x); + + // update residual + // r = r - alpha * Ap; + vectorAdd(m, -alpha, Ap, 1.0, r); + + dfloat rdotr1 = vectorInnerProd(m, r, r, levels[0]->comm); + + if(rdotr1 < tol*tol) { + rdotr0 = rdotr1; + break; + } + + // Precondition, z = M^{-1}*r + if(ctype==KCYCLE) { + this->kcycle(0); + } else if(ctype==VCYCLE) { + this->vcycle(0); + } + + dfloat rdotz1 = vectorInnerProd(m, r, z, levels[0]->comm); + + if(ctype==KCYCLE) { + // flexible pcg beta = (z.(-alpha*Ap))/zdotz0 + dfloat zdotAp = vectorInnerProd(m, z, Ap, levels[0]->comm); + beta = -alpha*zdotAp/rdotz0; + } else { + beta = rdotz1/rdotz0; + } + + // p = z + beta*p + vectorAdd(m, 1.0, z, beta, p); + + // switch rdotz0 <= rdotz1 + rdotz0 = rdotz1; + + // switch rdotz0,rdotr0 <= rdotz1,rdotr1 + rdotr0 = rdotr1; + + Niter++; + + printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0)); + + if(Niter==maxIt) break; + } + + //copy result back to parAlmond's x storage + memcpy(levels[0]->x, x, m*sizeof(dfloat)); + free(x); free(p); free(Ap); +} + +void solver_t::device_pcg(const int maxIt, const dfloat tol){ + + const dlong m = levels[0]->Nrows; + const dlong n = levels[0]->Ncols; + + ktype = PCG; + + // use parAlmond's buffers + occa::memory &o_r = levels[0]->o_rhs; + occa::memory &o_z = levels[0]->o_x; + + // initial residual + dfloat rdotr0 = vectorInnerProd(m, o_r, o_r, levels[0]->comm); + + occa::memory o_x = device.malloc(n*sizeof(dfloat),levels[0]->x); + occa::memory o_Ap = device.malloc(n*sizeof(dfloat),levels[0]->x); + occa::memory o_p = device.malloc(n*sizeof(dfloat),levels[0]->x); + + // x = 0; + vectorSet(m, 0.0, o_x); + + //sanity check + if (rdotr0<=(tol*tol)) { + levels[0]->o_x.copyFrom(o_x); + printf("Almond PCG iter %d, res = %g\n", 0, sqrt(rdotr0)); + o_x.free(); o_p.free(); o_Ap.free(); + return; + } + + // Precondition, z = M^{-1}*r + if(ctype==KCYCLE) { + this->device_kcycle(0); + } else if(ctype==VCYCLE) { + this->device_vcycle(0); + } + o_p.copyFrom(o_z); + + dfloat rdotz0 = vectorInnerProd(m, o_r, o_z, levels[0]->comm); + + dfloat rdotr1 = 0; + dfloat rdotz1 = 0; + dfloat alpha, beta, pAp; + + int Niter = 0; + while(rdotr0>(tol*tol)){ + // Ap = A*p; + levels[0]->Ax(o_p, o_Ap); + + dfloat pAp = vectorInnerProd(m, o_p, o_Ap, levels[0]->comm); + + alpha = rdotz0/pAp; + + // update solution + // x = x + alpha * p; + vectorAdd(m, alpha, o_p, 1.0, o_x); + + // update residual + // r = r - alpha * Ap; + vectorAdd(m, -alpha, o_Ap, 1.0, o_r); + + dfloat rdotr1 = vectorInnerProd(m, o_r, o_r, levels[0]->comm); + + if(rdotr1 < tol*tol) { + rdotr0 = rdotr1; + break; + } + + // Precondition, z = M^{-1}*r + if(ctype==KCYCLE) { + this->device_kcycle(0); + } else if(ctype==VCYCLE) { + this->device_vcycle(0); + } + + dfloat rdotz1 = vectorInnerProd(m, o_r, o_z, levels[0]->comm); + + if(ctype==KCYCLE) { + // flexible pcg beta = (z.(-alpha*Ap))/zdotz0 + dfloat zdotAp = vectorInnerProd(m, o_z, o_Ap, levels[0]->comm); + beta = -alpha*zdotAp/rdotz0; + } else if(ctype==VCYCLE) { + beta = rdotz1/rdotz0; + } + + // p = z + beta*p + vectorAdd(m, 1.0, o_z, beta, o_p); + + // switch rdotz0 <= rdotz1 + rdotz0 = rdotz1; + + // switch rdotz0,rdotr0 <= rdotz1,rdotr1 + rdotr0 = rdotr1; + + Niter++; + + //printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0)); + + if(Niter==maxIt) break; + } + + //copy result back to parAlmond's x storage + levels[0]->o_x.copyFrom(o_x); + + printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0)); + + o_x.free(); o_p.free(); o_Ap.free(); +} + +} //namespace parAlmond \ No newline at end of file diff --git a/solvers/parALMOND/src/gmres.c b/libs/parAlmond/src/pgmres.cpp similarity index 58% rename from solvers/parALMOND/src/gmres.c rename to libs/parAlmond/src/pgmres.cpp index 7d91efc48..35671912a 100644 --- a/solvers/parALMOND/src/gmres.c +++ b/libs/parAlmond/src/pgmres.cpp @@ -24,7 +24,9 @@ SOFTWARE. */ -#include "agmg.h" +#include "parAlmond.hpp" + +namespace parAlmond { void gmresUpdate(dlong Nrows, dfloat *x, @@ -46,14 +48,13 @@ void gmresUpdate(dlong Nrows, } for(int j=0; j=0; --k){ y[k] = s[k]; @@ -73,60 +74,47 @@ void gmresUpdate(parAlmond_t *parAlmond, dlong Nrows, } for(int j=0; jlevels[0]->A; + const dlong m = levels[0]->Nrows; + const dlong n = levels[0]->Ncols; - const dlong m = A->Nrows; - // const dlong n = A->Ncols; - - parAlmond->ktype = GMRES; + ktype = GMRES; // use parAlmond's buffers - dfloat *r = parAlmond->levels[0]->rhs; - dfloat *z = parAlmond->levels[0]->x; + dfloat *r = levels[0]->rhs; + dfloat *z = levels[0]->x; // initial residual - dfloat nbLocal = innerProd(m, r, r); - dfloat nb = 0; - MPI_Allreduce(&nbLocal,&nb,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - nb = sqrt(nb); + dfloat nb = sqrt(vectorInnerProd(m, r, r, levels[0]->comm)); // x = 0; - dfloat *x = (dfloat *) calloc(m,sizeof(dfloat)); - setVector(m, x, 0.0); + dfloat *x = (dfloat *) calloc(n,sizeof(dfloat)); + vectorSet(m, 0.0, x); //sanity check if (nb<=tol) { - for (dlong i=0;ilevels[0]->x[i] = x[i]; - - free(x); + memcpy(levels[0]->x, x, m*sizeof(dfloat)); + free(x); return; } // M r = b - A*x0 - if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - kcycle(parAlmond, 0); - } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - vcycle(parAlmond, 0); - } else { - for (dlong k=0;kkcycle(0); + } else if(ctype==VCYCLE) { + this->vcycle(0); } - for (dlong k=0;kcomm)); dfloat *s = (dfloat *) calloc(maxIt+1, sizeof(dfloat)); s[0] = nr; @@ -152,26 +140,20 @@ void pgmres(parAlmond_t *parAlmond, Niter = i+1; // Av = A*V(:.i) - axpy(A, 1.0, V[i], 0.0, Av,parAlmond->nullSpace,parAlmond->nullSpacePenalty); + levels[0]->Ax(V[i], Av); // M w = A vi - for (dlong k=0;koptions.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - kcycle(parAlmond, 0); - } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - vcycle(parAlmond, 0); - } else { - for (dlong k=0;kkcycle(0); + } else if(ctype==VCYCLE) { + this->vcycle(0); } - for (dlong k=0;kcomm); // w = w - hki*V[k] vectorAdd(m, -hki, V[k], 1.0, w); @@ -180,9 +162,7 @@ void pgmres(parAlmond_t *parAlmond, H[k + i*(maxIt+1)] = hki; } - dfloat wdotwLocal = innerProd(m, w, w); - dfloat wdotw = 0.; - MPI_Allreduce(&wdotwLocal,&wdotw,1,MPI_DFLOAT,MPI_SUM,agmg::comm); + dfloat wdotw = vectorInnerProd(m, w, w, levels[0]->comm); H[i+1 + i*(maxIt+1)] = sqrt(wdotw); @@ -215,10 +195,7 @@ void pgmres(parAlmond_t *parAlmond, if(fabs(s[i+1]) < tol) break; if(i < maxIt-1){ - dfloat wdotwLocal = innerProd(m, w, w); - dfloat wdotw = 0.; - MPI_Allreduce(&wdotwLocal,&wdotw,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - + dfloat wdotw = vectorInnerProd(m, w, w, levels[0]->comm); dfloat nw = sqrt(wdotw); // V(:,i+1) = w/nw @@ -229,10 +206,9 @@ void pgmres(parAlmond_t *parAlmond, gmresUpdate(m, x, V, H, s, Niter, maxIt); //copy result back to parAlmond's x storage - for (dlong i=0;ilevels[0]->x[i] = x[i]; + memcpy(levels[0]->x, x, m*sizeof(dfloat)); - free(x); + free(x); free(s); free(V); free(H); free(J); free(Av); free(w); @@ -241,65 +217,52 @@ void pgmres(parAlmond_t *parAlmond, printf("gmres did not converge in given number of iterations\n"); } -//TODO need to link this with MPI -void device_pgmres(parAlmond_t *parAlmond, - int maxIt, - dfloat tol){ - - hyb* A = parAlmond->levels[0]->deviceA; +void solver_t::device_pgmres(const int maxIt, + const dfloat tol){ - const dlong m = A->Nrows; - // const dlong n = A->Ncols; + const dlong m = levels[0]->Nrows; + const dlong n = levels[0]->Ncols; // use parAlmond's buffers - occa::memory &o_r = parAlmond->levels[0]->o_rhs; - occa::memory &o_z = parAlmond->levels[0]->o_x; + occa::memory &o_r = levels[0]->o_rhs; + occa::memory &o_z = levels[0]->o_x; // initial residual - dfloat nbLocal = innerProd(parAlmond, m, o_r, o_r); - dfloat nb = 0; - MPI_Allreduce(&nbLocal,&nb,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - nb = sqrt(nb); + dfloat nb = sqrt(vectorInnerProd(m, o_r, o_r, levels[0]->comm)); - dfloat *dummy = (dfloat*) calloc(m, sizeof(dfloat)); - occa::memory o_x = parAlmond->device.malloc(m*sizeof(dfloat), dummy); - occa::memory o_Av= parAlmond->device.malloc(m*sizeof(dfloat), dummy); - occa::memory o_w = parAlmond->device.malloc(m*sizeof(dfloat), dummy); + occa::memory o_x = device.malloc(n*sizeof(dfloat), levels[0]->x); + occa::memory o_Av= device.malloc(n*sizeof(dfloat), levels[0]->x); + occa::memory o_w = device.malloc(n*sizeof(dfloat), levels[0]->x); //sanity check if (nb<=tol) { - parAlmond->levels[0]->o_x.copyFrom(o_x); + levels[0]->o_x.copyFrom(o_x); printf("Almond PGMRES iter %d, res = %g\n", 0, nb); o_x.free(); o_Av.free(); o_w.free(); return; } // M r = b - A*x0 - if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - device_kcycle(parAlmond, 0); - } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - device_vcycle(parAlmond, 0); - } else { - o_z.copyFrom(o_r); + if(ctype==KCYCLE) { + this->device_kcycle(0); + } else if(ctype==VCYCLE) { + this->device_vcycle(0); } o_r.copyFrom(o_z); - dfloat nrLocal = innerProd(parAlmond, m, o_r, o_r); - dfloat nr = 0; - MPI_Allreduce(&nrLocal,&nr,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - nr = sqrt(nr); + + dfloat nr = sqrt(vectorInnerProd(m, o_r, o_r, levels[0]->comm)); dfloat *s = (dfloat *) calloc(maxIt+1, sizeof(dfloat)); s[0] = nr; occa::memory *o_V = (occa::memory *) calloc(maxIt, sizeof(occa::memory)); for(int i=0; idevice.malloc(m*sizeof(dfloat), dummy); + o_V[i] = device.malloc(n*sizeof(dfloat), levels[0]->x); } - free(dummy); // V(:,0) = r/nr - vectorAdd(parAlmond, m, (1./nr), o_r, 0., o_V[0]); + vectorAdd(m, (1./nr), o_r, 0., o_V[0]); dfloat *H = (dfloat *) calloc((maxIt+1)*(maxIt+1), sizeof(dfloat)); dfloat *J = (dfloat *) calloc(4*maxIt, sizeof(dfloat)); @@ -312,33 +275,26 @@ void device_pgmres(parAlmond_t *parAlmond, Niter = i+1; // r = A*V(:.i) - axpy(parAlmond, A, 1.0, o_V[i], 0.0, o_r,parAlmond->nullSpace,parAlmond->nullSpacePenalty); + levels[0]->Ax(o_V[i], o_r); // M w = A vi - if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - device_kcycle(parAlmond, 0); - } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - device_vcycle(parAlmond, 0); - } else { - o_z.copyFrom(o_r); + if(ctype==KCYCLE) { + this->device_kcycle(0); + } else if(ctype==VCYCLE) { + this->device_vcycle(0); } for(int k=0; k<=i; ++k){ - dfloat hkiLocal = innerProd(parAlmond, m, o_z, o_V[k]); - dfloat hki = 0.; - MPI_Allreduce(&hkiLocal,&hki,1,MPI_DFLOAT,MPI_SUM,agmg::comm); + dfloat hki = vectorInnerProd(m, o_z, o_V[k], levels[0]->comm); // w = w - hki*V[k] - vectorAdd(parAlmond, m, -hki, o_V[k], 1.0, o_z); + vectorAdd(m, -hki, o_V[k], 1.0, o_z); // H(k,i) = hki H[k + i*(maxIt+1)] = hki; } - dfloat nwLocal = innerProd(parAlmond, m, o_z, o_z); - dfloat nw = 0.; - MPI_Allreduce(&nwLocal,&nw,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - nw = sqrt(nw); + dfloat nw = sqrt(vectorInnerProd(m, o_z, o_z, levels[0]->comm)); H[i+1 + i*(maxIt+1)] = nw; for(int k=0; klevels[0]->o_x.copyFrom(o_x); + levels[0]->o_x.copyFrom(o_x); printf("Almond PGMRES iter %d, res = %g\n", Niter, fabs(s[i+1])); @@ -389,10 +345,12 @@ void device_pgmres(parAlmond_t *parAlmond, o_V[i].free(); free((void*)o_V); - free(s); + free(s); free(H); free(J); o_Av.free(); o_w.free(); o_x.free(); } + +} //namepsace parAlmond \ No newline at end of file diff --git a/libs/parAlmond/src/solver.cpp b/libs/parAlmond/src/solver.cpp new file mode 100644 index 000000000..b80705dce --- /dev/null +++ b/libs/parAlmond/src/solver.cpp @@ -0,0 +1,97 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +solver_t::solver_t(occa::device device_, MPI_Comm comm_, + setupAide options_) { + + device = device_; + + comm = comm_; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + + levels = (multigridLevel **) calloc(MAX_LEVELS,sizeof(multigridLevel *)); + numLevels = 0; + + options = options_; + + if (options.compareArgs("PARALMOND CYCLE", "NONSYM")) { + ktype = GMRES; + } else { + ktype = PCG; + } + + if(options.compareArgs("PARALMOND CYCLE", "EXACT")) + exact = true; + else + exact = false; + + if(options.compareArgs("PARALMOND CYCLE", "VCYCLE")) + ctype = VCYCLE; + else + ctype = KCYCLE; + + if (options.compareArgs("PARALMOND SMOOTHER", "CHEBYSHEV")) { + stype = CHEBYSHEV; + options.getArgs("PARALMOND CHEBYSHEV DEGREE", ChebyshevIterations); + if (!ChebyshevIterations) ChebyshevIterations=2; //default to 2 + } else { //default to DAMPED_JACOBI + stype = DAMPED_JACOBI; + } +} + +solver_t::~solver_t() { + + for (int n=0;nReport(); + } + + if(rank==0) + printf("--------------------------------------------------------------------------\n"); +} + +} \ No newline at end of file diff --git a/solvers/parALMOND/src/timer.c b/libs/parAlmond/src/timer.cpp similarity index 98% rename from solvers/parALMOND/src/timer.c rename to libs/parAlmond/src/timer.cpp index e98545ebd..45ea1a1c2 100644 --- a/solvers/parALMOND/src/timer.c +++ b/libs/parAlmond/src/timer.cpp @@ -24,7 +24,7 @@ SOFTWARE. */ -#include "agmg.h" +#include "parAlmond.hpp" void occaTimerTic(occa::device device,std::string name) { std::string profilerOn = occa::env::var("OCCA_PROFILE"); diff --git a/libs/parAlmond/src/utils.cpp b/libs/parAlmond/src/utils.cpp new file mode 100644 index 000000000..1d1196a8f --- /dev/null +++ b/libs/parAlmond/src/utils.cpp @@ -0,0 +1,244 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "parAlmond.hpp" + +namespace parAlmond { + +//scratch space +size_t scratchSpaceBytes=0; +void *scratch=NULL; +occa::memory o_scratch; + +size_t pinnedScratchSpaceBytes=0; +void *pinnedScratch=NULL; +occa::memory o_pinnedScratch; + +size_t reductionScratchBytes=0; +void *reductionScratch=NULL; +occa::memory o_reductionScratch; + +void allocateScratchSpace(size_t requiredBytes, occa::device device) { + + if (scratchSpaceBytesglobalId < fb->globalId) return -1; + if(fa->globalId > fb->globalId) return +1; + + if(fa->localId < fb->localId) return -1; + if(fa->localId > fb->localId) return +1; + + return 0; +} + +// compare on local indices +int CompareLocalId(const void *a, const void *b){ + + parallelId_t *fa = (parallelId_t*) a; + parallelId_t *fb = (parallelId_t*) b; + + if(fa->localId < fb->localId) return -1; + if(fa->localId > fb->localId) return +1; + + if(fa->globalId < fb->globalId) return -1; + if(fa->globalId > fb->globalId) return +1; + + return 0; +} + +bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong i){ + + if(s > smax) return true; + if(smax > s) return false; + + if(r > rmax) return true; + if(rmax > r) return false; + + if(i > imax) return true; + if(i < imax) return false; + + return false; +} + +int compareOwner(const void *a, const void *b){ + parallelAggregate_t *pa = (parallelAggregate_t *) a; + parallelAggregate_t *pb = (parallelAggregate_t *) b; + + if (pa->ownerRank < pb->ownerRank) return -1; + if (pa->ownerRank > pb->ownerRank) return +1; + + return 0; +} + +int compareAgg(const void *a, const void *b){ + parallelAggregate_t *pa = (parallelAggregate_t *) a; + parallelAggregate_t *pb = (parallelAggregate_t *) b; + + if (pa->coarseId < pb->coarseId) return -1; + if (pa->coarseId > pb->coarseId) return +1; + + if (pa->originRank < pb->originRank) return -1; + if (pa->originRank > pb->originRank) return +1; + + return 0; +} + +int compareOrigin(const void *a, const void *b){ + parallelAggregate_t *pa = (parallelAggregate_t *) a; + parallelAggregate_t *pb = (parallelAggregate_t *) b; + + if (pa->originRank < pb->originRank) return -1; + if (pa->originRank > pb->originRank) return +1; + + return 0; +} + +int compareNonZeroByRow(const void *a, const void *b){ + nonzero_t *pa = (nonzero_t *) a; + nonzero_t *pb = (nonzero_t *) b; + + if (pa->row < pb->row) return -1; + if (pa->row > pb->row) return +1; + + if (pa->col < pb->col) return -1; + if (pa->col > pb->col) return +1; + + return 0; +}; + + +void matrixInverse(int N, dfloat *A){ + int lwork = N*N; + int info; + + // compute inverse mass matrix + double *tmpInvA = (double*) calloc(N*N, sizeof(double)); + + int *ipiv = (int*) calloc(N, sizeof(int)); + double *work = (double*) calloc(lwork, sizeof(double)); + + for(int n=0;n 2 then should load input data from argv setupAide options(argv[1]); - + // set up mesh stuff string fileName; int N, dim, elementType; @@ -65,7 +65,7 @@ int main(int argc, char **argv){ if(mesh->Nelements<10) meshPrint3D(mesh); - + // parameter for elliptic problem (-laplacian + lambda)*q = f dfloat lambda; options.getArgs("LAMBDA", lambda); @@ -81,12 +81,12 @@ int main(int argc, char **argv){ if(options.compareArgs("BENCHMARK", "BK5") || options.compareArgs("BENCHMARK", "BP5")){ - + // test Ax throughput occa::streamTag startAx = mesh->device.tagStream(); - + int NAx = 1; - + for(int it=0;itpartialAxKernel(mesh->NlocalGatherElements, + elliptic->partialAxKernel(mesh->NlocalGatherElements, mesh->o_localGatherElementList, mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, elliptic->o_x, elliptic->o_Ax); } else{ - elliptic->partialAxKernel(mesh->NlocalGatherElements, + elliptic->partialAxKernel(mesh->NlocalGatherElements, mesh->o_localGatherElementList, elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM, lambda, elliptic->o_x, elliptic->o_Ax); } } } - + occa::streamTag stopAx = mesh->device.tagStream(); - + mesh->device.finish(); - + double elapsedAx = mesh->device.timeBetween(startAx, stopAx); elapsedAx /= NAx; - - - printf("%d, %d, %g, %d, %g, %g; \%\%elemental: N, dofs, elapsed, dummy, time per node, nodes/time %s\n", + + + if (mesh->rank==0) + printf("%d, %d, %g, %d, %g, %g; \%\%elemental: N, dofs, elapsed, dummy, time per node, nodes/time %s\n", mesh->N, mesh->NlocalGatherElements*mesh->Np, - 0, elapsedAx, + 0, elapsedAx/(mesh->Np*mesh->Nelements), mesh->Nelements*mesh->Np/elapsedAx, - options.getArgs("DISCRETIZATION").c_str()); - + (char*) options.getArgs("DISCRETIZATION").c_str()); + } else{ - + // convergence tolerance dfloat tol = 1e-8; - + occa::streamTag startTag = mesh->device.tagStream(); - + int it = ellipticSolve(elliptic, lambda, tol, elliptic->o_r, elliptic->o_x); occa::streamTag stopTag = mesh->device.tagStream(); mesh->device.finish(); - + double elapsed = mesh->device.timeBetween(startTag, stopTag); - printf("%d, %d, %g, %d, %g, %g; \%\%global: N, dofs, elapsed, iterations, time per node, nodes*iterations/time %s\n", + if (mesh->rank==0) + printf("%d, %d, %g, %d, %g, %g; \%\%global: N, dofs, elapsed, iterations, time per node, nodes*iterations/time %s\n", mesh->N, mesh->Nelements*mesh->Np, elapsed, it, elapsed/(mesh->Np*mesh->Nelements), mesh->Nelements*(it*mesh->Np/elapsed), - options.getArgs("PRECONDITIONER").c_str()); + (char*) options.getArgs("PRECONDITIONER").c_str()); if(options.compareArgs("DISCRETIZATION","CONTINUOUS")){ dfloat zero = 0.; @@ -159,13 +161,13 @@ int main(int argc, char **argv){ elliptic->o_mapB, elliptic->o_x); } - + // copy solution from DEVICE to HOST elliptic->o_x.copyTo(mesh->q); - + if (options.compareArgs("BASIS","BERN")) meshApplyElementMatrix(mesh,mesh->VB,mesh->q,mesh->q); - + dfloat maxError = 0; for(dlong e=0;eNelements;++e){ for(int n=0;nNp;++n){ @@ -173,23 +175,23 @@ int main(int argc, char **argv){ dfloat xn = mesh->x[id]; dfloat yn = mesh->y[id]; dfloat zn = mesh->z[id]; - + dfloat exact; if (elliptic->dim==2) exact = sin(M_PI*xn)*sin(M_PI*yn); - else + else exact = cos(M_PI*xn)*cos(M_PI*yn)*cos(M_PI*zn); dfloat error = fabs(exact-mesh->q[id]); - + maxError = mymax(maxError, error); } } - + dfloat globalMaxError = 0; MPI_Allreduce(&maxError, &globalMaxError, 1, MPI_DFLOAT, MPI_MAX, mesh->comm); if(mesh->rank==0) - fprintf(stderr,"globalMaxError = %g\n", globalMaxError); - + printf("globalMaxError = %g\n", globalMaxError); + #if 0 char fname[BUFSIZ]; string outName; @@ -197,7 +199,7 @@ int main(int argc, char **argv){ sprintf(fname, "%s_%04d.vtu",(char*)outName.c_str(), rank); if(elliptic->dim==3) meshPlotVTU3D(mesh, fname, 0); - else + else meshPlotVTU2D(mesh, fname, 0); #endif } @@ -217,10 +219,9 @@ int main(int argc, char **argv){ ellipticPlotVTUHex3D(mesh, "bah", 0); } #endif - + // close down MPI MPI_Finalize(); - - exit(0); + return 0; } diff --git a/solvers/elliptic/src/ellipticMultiGridLevel.c b/solvers/elliptic/src/ellipticMultiGridLevel.c new file mode 100644 index 000000000..6838a1318 --- /dev/null +++ b/solvers/elliptic/src/ellipticMultiGridLevel.c @@ -0,0 +1,164 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "elliptic.h" + +void MGLevel::Ax(occa::memory o_x, occa::memory o_Ax) { + ellipticOperator(elliptic,lambda, + o_x,o_Ax, dfloatString); // "float" ); // hard coded for testing (should make an option) +} + +void MGLevel::residual(occa::memory o_rhs, occa::memory o_x, occa::memory o_res) { + ellipticOperator(elliptic,lambda, + o_x,o_res, dfloatString); // "float" ); // hard coded for testing (should make an option) + + // subtract r = b - A*x + ellipticScaledAdd(elliptic, 1.f, o_rhs, -1.f, o_res); +} + +void MGLevel::coarsen(occa::memory o_x, occa::memory o_Rx) { + if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) + elliptic->dotMultiplyKernel(mesh->Nelements*NpF, o_invDegree, o_x, o_x); + + elliptic->precon->coarsenKernel(mesh->Nelements, o_R, o_x, o_Rx); + + if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { + ogsGatherScatter(o_Rx, ogsDfloat, ogsAdd, elliptic->ogs); + if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Rx); + } +} + +void MGLevel::prolongate(occa::memory o_x, occa::memory o_Px) { + elliptic->precon->prolongateKernel(mesh->Nelements, o_R, o_x, o_Px); +} + +void MGLevel::smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero) { + if (stype==RICHARDSON) { + this->smoothRichardson(o_rhs, o_x, x_is_zero); + } else if (stype==CHEBYSHEV) { + this->smoothChebyshev(o_rhs, o_x, x_is_zero); + } +} + +void MGLevel::smoother(occa::memory o_x, occa::memory o_Sx) { + if (smtype==JACOBI) { + this->smootherJacobi(o_x, o_Sx); + } else if (smtype==LOCALPATCH) { + this->smootherLocalPatch(o_x, o_Sx); + } +} + +void MGLevel::smoothRichardson(occa::memory &o_r, occa::memory &o_x, bool xIsZero) { + + occa::memory o_res = o_smootherResidual; + + if (xIsZero) { + this->smoother(o_r, o_x); + return; + } + + dfloat one = 1.; dfloat mone = -1.; + + //res = r-Ax + this->Ax(o_x,o_res); + elliptic->scaledAddKernel(Nrows, one, o_r, mone, o_res); + + //smooth the fine problem x = x + S(r-Ax) + this->smoother(o_res, o_res); + elliptic->scaledAddKernel(Nrows, one, o_res, one, o_x); +} + +void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZero) { + + const dfloat theta = 0.5*(lambda1+lambda0); + const dfloat delta = 0.5*(lambda1-lambda0); + const dfloat invTheta = 1.0/theta; + const dfloat sigma = theta/delta; + dfloat rho_n = 1./sigma; + dfloat rho_np1; + + dfloat one = 1., mone = -1., zero = 0.0; + + occa::memory o_res = o_smootherResidual; + occa::memory o_Ad = o_smootherResidual2; + occa::memory o_d = o_smootherUpdate; + + if(xIsZero){ //skip the Ax if x is zero + //res = Sr + this->smoother(o_r, o_res); + + //d = invTheta*res + elliptic->scaledAddKernel(Nrows, invTheta, o_res, zero, o_d); + } else { + //res = S(r-Ax) + this->Ax(o_x,o_res); + elliptic->scaledAddKernel(Nrows, one, o_r, mone, o_res); + this->smoother(o_res, o_res); + + //d = invTheta*res + elliptic->scaledAddKernel(Nrows, invTheta, o_res, zero, o_d); + } + + for (int k=0;kscaledAddKernel(Nrows, one, o_d, zero, o_x); + else + elliptic->scaledAddKernel(Nrows, one, o_d, one, o_x); + + //r_k+1 = r_k - SAd_k + this->Ax(o_d,o_Ad); + this->smoother(o_Ad, o_Ad); + elliptic->scaledAddKernel(Nrows, mone, o_Ad, one, o_res); + + rho_np1 = 1.0/(2.*sigma-rho_n); + dfloat rhoDivDelta = 2.0*rho_np1/delta; + + //d_k+1 = rho_k+1*rho_k*d_k + 2*rho_k+1*r_k+1/delta + elliptic->scaledAddKernel(Nrows, rhoDivDelta, o_res, rho_np1*rho_n, o_d); + + rho_n = rho_np1; + } + //x_k+1 = x_k + d_k + elliptic->scaledAddKernel(Nrows, one, o_d, one, o_x); +} + +void MGLevel::smootherLocalPatch(occa::memory &o_r, occa::memory &o_Sr) { + + // occaTimerTic(mesh->device,"approxBlockJacobiSolveKernel"); + elliptic->precon->approxBlockJacobiSolverKernel(mesh->Nelements, + elliptic->precon->o_patchesIndex, + elliptic->precon->o_invAP, + elliptic->precon->o_invDegreeAP, + o_r, + o_Sr); + // occaTimerToc(mesh->device,"approxBlockJacobiSolveKernel"); +} + +void MGLevel::smootherJacobi(occa::memory &o_r, occa::memory &o_Sr) { + elliptic->dotMultiplyKernel(mesh->Np*mesh->Nelements,o_invDiagA,o_r,o_Sr); +} + diff --git a/solvers/elliptic/src/ellipticMultiGridLevelSetup.c b/solvers/elliptic/src/ellipticMultiGridLevelSetup.c new file mode 100644 index 000000000..183a2b87a --- /dev/null +++ b/solvers/elliptic/src/ellipticMultiGridLevelSetup.c @@ -0,0 +1,453 @@ +/* + +The MIT License (MIT) + +Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +*/ + +#include "elliptic.h" + +size_t MGLevel::smootherResidualBytes; +dfloat* MGLevel::smootherResidual; +occa::memory MGLevel::o_smootherResidual; +occa::memory MGLevel::o_smootherResidual2; +occa::memory MGLevel::o_smootherUpdate; + +//build a single level +MGLevel::MGLevel(elliptic_t *ellipticBase, dfloat lambda_, int Nc, + setupAide options_, parAlmond::KrylovType ktype_, MPI_Comm comm_): + multigridLevel(ellipticBase->mesh->Nelements*ellipticBase->mesh->Np, + (ellipticBase->mesh->Nelements+ellipticBase->mesh->totalHaloPairs)*ellipticBase->mesh->Np, + ktype_, + comm_) { + + elliptic = ellipticBase; + mesh = elliptic->mesh; + options = options_; + lambda = lambda_; + degree = Nc; + weighted = false; + + //use weighted inner products + if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { + weighted = true; + o_weight = elliptic->o_invDegree; + weight = elliptic->invDegree; + } + + this->setupSmoother(); +} + +//build a level and connect it to the previous one +MGLevel::MGLevel(elliptic_t *ellipticBase, //finest level + mesh_t **meshLevels, + elliptic_t *ellipticFine, //previous level + elliptic_t *ellipticCoarse, //current level + dfloat lambda_, + int Nf, int Nc, + setupAide options_, + parAlmond::KrylovType ktype_, + MPI_Comm comm_): + multigridLevel(ellipticCoarse->mesh->Nelements*ellipticCoarse->mesh->Np, + (ellipticCoarse->mesh->Nelements+ellipticCoarse->mesh->totalHaloPairs)*ellipticCoarse->mesh->Np, + ktype_, + comm_) { + + elliptic = ellipticCoarse; + mesh = elliptic->mesh; + options = options_; + lambda = lambda_; + degree = Nc; + weighted = false; + + //use weighted inner products + if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { + weighted = true; + o_weight = elliptic->o_invDegree; + weight = elliptic->invDegree; + + NpF = ellipticFine->mesh->Np; + o_invDegree = ellipticFine->ogs->o_invDegree; + } + + this->setupSmoother(); + + /* build coarsening and prologation operators to connect levels */ + if (elliptic->elementType==TRIANGLES||elliptic->elementType==TETRAHEDRA){ + this->buildCoarsenerTriTet(meshLevels, Nf, Nc); + } else { + this->buildCoarsenerQuadHex(meshLevels, Nf, Nc); + } +} + +void MGLevel::setupSmoother() { + + //set up the fine problem smoothing + if(options.compareArgs("MULTIGRID SMOOTHER","LOCALPATCH")){ + smtype = LOCALPATCH; + + dfloat *invAP; + dlong Npatches; + dlong *patchesIndex; + + dfloat rateTolerance; // 0 - accept no approximate patches, 1 - accept all approximate patches + if(options.compareArgs("MULTIGRID SMOOTHER","EXACT")){ + rateTolerance = 0.0; + } else { + rateTolerance = 1.0; + } + + //initialize the full inverse operators on each 4 element patch + ellipticBuildLocalPatches(elliptic, lambda, rateTolerance, &Npatches, &patchesIndex, &invAP); + + o_invAP = mesh->device.malloc(Npatches*mesh->Np*mesh->Np*sizeof(dfloat),invAP); + o_patchesIndex = mesh->device.malloc(mesh->Nelements*sizeof(dlong), patchesIndex); + + dfloat *invDegree = (dfloat*) calloc(mesh->Nelements,sizeof(dfloat)); + for (dlong e=0;eNelements;e++) invDegree[e] = 1.0; + + o_invDegreeAP = mesh->device.malloc(mesh->Nelements*sizeof(dfloat),invDegree); + + if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) { + stype = CHEBYSHEV; + + if (!options.getArgs("MULTIGRID CHEBYSHEV DEGREE", ChebyshevIterations)) + ChebyshevIterations = 2; //default to degree 2 + + //estimate the max eigenvalue of S*A + dfloat rho = this->maxEigSmoothAx(); + + lambda1 = rho; + lambda0 = rho/10.; + } else { + stype = RICHARDSON; + + //estimate the max eigenvalue of S*A + dfloat rho = this->maxEigSmoothAx(); + + //set the stabilty weight (jacobi-type interation) + lambda0 = (4./3.)/rho; + + for (dlong n=0;nNelements;n++) + invDegree[n] *= lambda0; + + //update diagonal with weight + o_invDegreeAP.copyFrom(invDegree); + } + free(invDegree); free(invAP); free(patchesIndex); + + } else if (options.compareArgs("MULTIGRID SMOOTHER","DAMPEDJACOBI")) { //default to damped jacobi + smtype = JACOBI; + dfloat *invDiagA; + ellipticBuildJacobi(elliptic,lambda, &invDiagA); + + o_invDiagA = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat), invDiagA); + + if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) { + stype = CHEBYSHEV; + + if (!options.getArgs("MULTIGRID CHEBYSHEV DEGREE", ChebyshevIterations)) + ChebyshevIterations = 2; //default to degree 2 + + //estimate the max eigenvalue of S*A + dfloat rho = this->maxEigSmoothAx(); + + lambda1 = rho; + lambda0 = rho/10.; + } else { + stype = RICHARDSON; + + //estimate the max eigenvalue of S*A + dfloat rho = this->maxEigSmoothAx(); + + //set the stabilty weight (jacobi-type interation) + lambda0 = (4./3.)/rho; + + for (dlong n=0;nNp*mesh->Nelements;n++) + invDiagA[n] *= lambda0; + + //update diagonal with weight + o_invDiagA.copyFrom(invDiagA); + } + free(invDiagA); + } +} + +void MGLevel::Report() { + + hlong hNrows = (hlong) Nrows; + + dlong minNrows=0, maxNrows=0; + hlong totalNrows=0; + dfloat avgNrows; + + MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, mesh->comm); + MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, mesh->comm); + avgNrows = (dfloat) totalNrows/mesh->size; + + if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min + MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, mesh->comm); + + char smootherString[BUFSIZ]; + if (stype==RICHARDSON&&smtype==JACOBI) + strcpy(smootherString, "Damped Jacobi "); + else if (stype==CHEBYSHEV&&smtype==JACOBI) + strcpy(smootherString, "Chebyshev "); + else if (stype==RICHARDSON&&smtype==LOCALPATCH) + strcpy(smootherString, "Local Patch "); + else if (stype==RICHARDSON&&smtype==LOCALPATCH) + strcpy(smootherString, "Local Patch+Cheb"); + + if (mesh->rank==0){ + printf( "| pMG | %10d | Matrix-free | %s|\n",minNrows, smootherString); + printf(" | | %10d | Degree %2d | |\n", maxNrows, degree); + printf(" | | %10d | | |\n", (int) avgNrows); + } +} + + +void MGLevel::buildCoarsenerTriTet(mesh_t **meshLevels, int Nf, int Nc) { + + int NpFine = meshLevels[Nf]->Np; + int NpCoarse = meshLevels[Nc]->Np; + dfloat *P = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); + dfloat *Ptmp = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); + + //initialize P as identity (which it is for SPARSE) + for (int i=0;iNp; + int Np = meshLevels[n ]->Np; + + //copy P + for (int i=0;iinterpRaise[i*Np+k]*Ptmp[k*NpCoarse + j]; + } + } + } + } + + if (elliptic->options.compareArgs("BASIS","BERN")) { + dfloat* BBP = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); + for (int j=0;jinvVB[l+j*NpFine]*P[k+l*NpCoarse]*meshLevels[Nc]->VB[i+k*NpCoarse]; + } + } + } + } + for (int j=0;jmesh->device.malloc(NpFine*NpCoarse*sizeof(dfloat), R); + + free(P); free(Ptmp); +} + +void MGLevel::buildCoarsenerQuadHex(mesh_t **meshLevels, int Nf, int Nc) { + + int NqFine = Nf+1; + int NqCoarse = Nc+1; + dfloat *P = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat)); + dfloat *Ptmp = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat)); + + //initialize P as identity + for (int i=0;iinterpRaise[i*Nq+k]*Ptmp[k*NqCoarse + j]; + } + } + } + } + + //the coarsen matrix is P^T + R = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat)); + for (int i=0;imesh->device.malloc(NqFine*NqCoarse*sizeof(dfloat), R); + + free(P); free(Ptmp); +} + + +static void eig(const int Nrows, double *A, double *WR, double *WI){ + + int NB = 256; + char JOBVL = 'V'; + char JOBVR = 'V'; + int N = Nrows; + int LDA = Nrows; + int LWORK = (NB+2)*N; + + double *WORK = new double[LWORK]; + double *VL = new double[Nrows*Nrows]; + double *VR = new double[Nrows*Nrows]; + + int INFO = -999; + + dgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI, + VL, &LDA, VR, &LDA, WORK, &LWORK, &INFO); + + + assert(INFO == 0); + + delete [] VL; + delete [] VR; + delete [] WORK; +} + +dfloat MGLevel::maxEigSmoothAx(){ + + const dlong N = Nrows; + const dlong M = Ncols; + + int k = 10; + + hlong Nlocal = (hlong) Nrows; + hlong Ntotal = 0; + MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, mesh->comm); + if(k > Ntotal) k = (int) Ntotal; + + // do an arnoldi + + // allocate memory for Hessenberg matrix + double *H = (double *) calloc(k*k,sizeof(double)); + + // allocate memory for basis + dfloat *Vx = (dfloat*) calloc(M, sizeof(dfloat)); + occa::memory *o_V = (occa::memory *) calloc(k+1, sizeof(occa::memory)); + + occa::memory o_Vx = mesh->device.malloc(M*sizeof(dfloat),Vx); + occa::memory o_AVx = mesh->device.malloc(M*sizeof(dfloat),Vx); + + for(int i=0; i<=k; i++) + o_V[i] = mesh->device.malloc(M*sizeof(dfloat),Vx); + + // generate a random vector for initial basis vector + for (dlong i=0;iogs); + + for (dlong i=0;iNmasked;i++) Vx[elliptic->maskIds[i]] = 0.; + } + + o_Vx.copyFrom(Vx); //copy to device + dfloat norm_vo = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_Vx, o_Vx); + norm_vo = sqrt(norm_vo); + + ellipticScaledAdd(elliptic, 1./norm_vo, o_Vx, 0. , o_V[0]); + + for(int j=0; jAx(o_V[j],o_AVx); + this->smoother(o_AVx, o_V[j+1]); + + // modified Gram-Schmidth + for(int i=0; i<=j; i++){ + // H(i,j) = v[i]'*A*v[j] + dfloat hij = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_V[i], o_V[j+1]); + + // v[j+1] = v[j+1] - hij*v[i] + ellipticScaledAdd(elliptic, -hij, o_V[i], 1., o_V[j+1]); + + H[i + j*k] = (double) hij; + } + + if(j+1 < k){ + // v[j+1] = v[j+1]/||v[j+1]|| + dfloat norm_vj = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_V[j+1], o_V[j+1]); + norm_vj = sqrt(norm_vj); + ellipticScaledAdd(elliptic, 1/norm_vj, o_V[j+1], 0., o_V[j+1]); + + H[j+1+ j*k] = (double) norm_vj; + } + } + + double *WR = (double *) calloc(k,sizeof(double)); + double *WI = (double *) calloc(k,sizeof(double)); + + eig(k, H, WR, WI); + + double rho = 0.; + + for(int i=0; irank==0)&&(options.compareArgs("VERBOSE","TRUE"))) printf("weight = %g \n", rho); + + return rho; +} diff --git a/solvers/elliptic/src/ellipticMultiGridSetup.c b/solvers/elliptic/src/ellipticMultiGridSetup.c index 72db6933c..7534a7018 100644 --- a/solvers/elliptic/src/ellipticMultiGridSetup.c +++ b/solvers/elliptic/src/ellipticMultiGridSetup.c @@ -26,74 +26,6 @@ SOFTWARE. #include "elliptic.h" -void ellipticMultigridAx(void **args, occa::memory &o_x, occa::memory &o_Ax) { - - elliptic_t *elliptic = (elliptic_t *) args[0]; - dfloat *lambda = (dfloat *) args[1]; - - ellipticOperator(elliptic,*lambda,o_x,o_Ax, dfloatString); // "float" ); // hard coded for testing (should make an option) -} - -void ellipticMultigridCoarsen(void **args, occa::memory &o_x, occa::memory &o_Rx) { - - elliptic_t *elliptic = (elliptic_t *) args[0]; - elliptic_t *Felliptic = (elliptic_t *) args[1]; - setupAide options = elliptic->options; - - mesh_t *mesh = elliptic->mesh; - mesh_t *Fmesh = Felliptic->mesh; - precon_t *precon = elliptic->precon; - occa::memory o_R = elliptic->o_R; - - if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) - Felliptic->dotMultiplyKernel(Fmesh->Nelements*Fmesh->Np, Fmesh->ogs->o_invDegree, o_x, o_x); - - precon->coarsenKernel(mesh->Nelements, o_R, o_x, o_Rx); - - if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { - ogsGatherScatter(o_Rx, ogsDfloat, ogsAdd, mesh->ogs); - if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Rx); - } -} - -void ellipticMultigridProlongate(void **args, occa::memory &o_x, occa::memory &o_Px) { - - elliptic_t *elliptic = (elliptic_t *) args[0]; - mesh_t *mesh = elliptic->mesh; - precon_t *precon = elliptic->precon; - occa::memory o_R = elliptic->o_R; - - precon->prolongateKernel(mesh->Nelements, o_R, o_x, o_Px); -} - -void ellipticGather(void **args, occa::memory &o_x, occa::memory &o_Gx) { - - elliptic_t *elliptic = (elliptic_t *) args[0]; - ogs_t *ogs = (ogs_t *) args[1]; - occa::memory *o_s= (occa::memory *) args[2]; - - mesh_t *mesh = elliptic->mesh; - setupAide options = elliptic->options; - - ogsGather(o_Gx, o_x, ogsDfloat, ogsAdd, ogs); - elliptic->dotMultiplyKernel(ogs->Ngather, ogs->o_gatherInvDegree, o_Gx, o_Gx); -} - -void ellipticScatter(void **args, occa::memory &o_x, occa::memory &o_Sx) { - - elliptic_t *elliptic = (elliptic_t *) args[0]; - ogs_t *ogs = (ogs_t *) args[1]; - occa::memory *o_s= (occa::memory *) args[2]; - - mesh_t *mesh = elliptic->mesh; - setupAide options = elliptic->options; - - ogsScatter(o_Sx, o_x, ogsDfloat, ogsAdd, ogs); -} - -void buildCoarsenerTriTet(elliptic_t* elliptic, mesh_t **meshLevels, int Nf, int Nc); -void buildCoarsenerQuadHex(elliptic_t* elliptic, mesh_t **meshLevels, int Nf, int Nc); - void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) { mesh_t *mesh = elliptic->mesh; @@ -105,7 +37,7 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd meshLevels[n] = (mesh_t *) calloc(1,sizeof(mesh_t)); meshLevels[n]->Nverts = mesh->Nverts; meshLevels[n]->Nfaces = mesh->Nfaces; - + switch(elliptic->elementType){ case TRIANGLES: meshLoadReferenceNodesTri2D(meshLevels[n], n); break; @@ -119,35 +51,35 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd } //set the number of MG levels and their degree - int numLevels; + int numMGLevels; int *levelDegree; if (options.compareArgs("MULTIGRID COARSENING","ALLDEGREES")) { - numLevels = mesh->N; - levelDegree= (int *) calloc(numLevels,sizeof(int)); - for (int n=0;nN - n; //all degrees + numMGLevels = mesh->N; + levelDegree= (int *) calloc(numMGLevels,sizeof(int)); + for (int n=0;nN - n; //all degrees } else if (options.compareArgs("MULTIGRID COARSENING","HALFDEGREES")) { - numLevels = floor(mesh->N/2.)+1; - levelDegree= (int *) calloc(numLevels,sizeof(int)); - for (int n=0;nN - 2*n; //decrease by two - levelDegree[numLevels-1] = 1; //ensure the last level is degree 1 + numMGLevels = floor(mesh->N/2.)+1; + levelDegree= (int *) calloc(numMGLevels,sizeof(int)); + for (int n=0;nN - 2*n; //decrease by two + levelDegree[numMGLevels-1] = 1; //ensure the last level is degree 1 } else { //default "HALFDOFS" // pick the degrees so the dofs of each level halfs (roughly) //start by counting the number of levels neccessary - numLevels = 1; + numMGLevels = 1; int degree = mesh->N; int dofs = meshLevels[degree]->Np; int basedofs = mesh->Nverts; while (dofs>basedofs) { - numLevels++; + numMGLevels++; for (;degree>0;degree--) if (meshLevels[degree]->Np<=dofs/2) break; dofs = meshLevels[degree]->Np; } - levelDegree= (int *) calloc(numLevels,sizeof(int)); + levelDegree= (int *) calloc(numMGLevels,sizeof(int)); degree = mesh->N; - numLevels = 1; + numMGLevels = 1; levelDegree[0] = degree; dofs = meshLevels[degree]->Np; while (dofs>basedofs) { @@ -155,133 +87,48 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd if (meshLevels[degree]->Np<=dofs/2) break; dofs = meshLevels[degree]->Np; - levelDegree[numLevels] = degree; - numLevels++; + levelDegree[numMGLevels] = degree; + numMGLevels++; } } - //storage for lambda parameter - dfloat *vlambda = (dfloat *) calloc(1,sizeof(dfloat)); - *vlambda = lambda; + int Nmax = levelDegree[0]; + int Nmin = levelDegree[numMGLevels-1]; //initialize parAlmond - precon->parAlmond = parAlmondInit(mesh, elliptic->options); - agmgLevel **levels = precon->parAlmond->levels; + precon->parAlmond = parAlmond::Init(mesh->device, mesh->comm, options); + parAlmond::multigridLevel **levels = precon->parAlmond->levels; + + //set up the finest level + if (Nmax>Nmin) { + levels[0] = new MGLevel(elliptic, lambda, Nmax, options, + precon->parAlmond->ktype, mesh->comm); + MGLevelAllocateStorage((MGLevel*) levels[0], 0, + precon->parAlmond->ctype); + precon->parAlmond->numLevels++; + } - //build a elliptic struct for every degree - elliptic_t **ellipticsN = (elliptic_t**) calloc(mesh->N+1,sizeof(elliptic_t*)); - ellipticsN[mesh->N] = elliptic; //top level - for (int n=1;nelliptic, + ellipticC, + lambda, + Nf, Nc, + options, + precon->parAlmond->ktype, mesh->comm); + MGLevelAllocateStorage((MGLevel*) levels[n], n, + precon->parAlmond->ctype); precon->parAlmond->numLevels++; - levels[n] = (agmgLevel *) calloc(1,sizeof(agmgLevel)); - levels[n]->gatherLevel = false; //dont gather this level - if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {//use weighted inner products - precon->parAlmond->levels[n]->weightedInnerProds = true; - precon->parAlmond->levels[n]->o_weight = ellipticL->o_invDegree; - precon->parAlmond->levels[n]->weight = ellipticL->invDegree; - } - - //use the matrix free Ax - levels[n]->AxArgs = (void **) calloc(2,sizeof(void*)); - levels[n]->AxArgs[0] = (void *) ellipticL; - levels[n]->AxArgs[1] = (void *) vlambda; - levels[n]->device_Ax = ellipticMultigridAx; - - levels[n]->smoothArgs = (void **) calloc(2,sizeof(void*)); - levels[n]->smoothArgs[0] = (void *) ellipticL; - levels[n]->smoothArgs[1] = (void *) levels[n]; - - levels[n]->Nrows = mesh->Nelements*ellipticL->mesh->Np; - levels[n]->Ncols = (mesh->Nelements+mesh->totalHaloPairs)*ellipticL->mesh->Np; - - if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) { - if (!options.getArgs("MULTIGRID CHEBYSHEV DEGREE", levels[n]->ChebyshevIterations)) - levels[n]->ChebyshevIterations = 2; //default to degree 2 - - levels[n]->device_smooth = ellipticMultigridSmoothChebyshev; - - levels[n]->smootherResidual = (dfloat *) calloc(levels[n]->Ncols,sizeof(dfloat)); - - // extra storage for smoothing op - levels[n]->o_smootherResidual = mesh->device.malloc(levels[n]->Ncols*sizeof(dfloat),levels[n]->smootherResidual); - levels[n]->o_smootherResidual2 = mesh->device.malloc(levels[n]->Ncols*sizeof(dfloat),levels[n]->smootherResidual); - levels[n]->o_smootherUpdate = mesh->device.malloc(levels[n]->Ncols*sizeof(dfloat),levels[n]->smootherResidual); - } else { - levels[n]->device_smooth = ellipticMultigridSmooth; - - // extra storage for smoothing op - levels[n]->o_smootherResidual = mesh->device.malloc(levels[n]->Ncols*sizeof(dfloat)); - } - - levels[n]->smootherArgs = (void **) calloc(2,sizeof(void*)); - levels[n]->smootherArgs[0] = (void *) ellipticL; - levels[n]->smootherArgs[1] = (void *) vlambda; - - dfloat rateTolerance; // 0 - accept not approximate patches, 1 - accept all approximate patches - if(options.compareArgs("MULTIGRID SMOOTHER","EXACT")){ - rateTolerance = 0.0; - } else { - rateTolerance = 1.0; - } - - //set up the fine problem smoothing - if(options.compareArgs("MULTIGRID SMOOTHER","LOCALPATCH")){ - ellipticSetupSmootherLocalPatch(ellipticL, ellipticL->precon, levels[n], lambda, rateTolerance); - } else { //default to damped jacobi - ellipticSetupSmootherDampedJacobi(ellipticL, ellipticL->precon, levels[n], lambda); - } - } - - //report top levels - if (options.compareArgs("VERBOSE","TRUE")) { - if((mesh->rank==0)&&(numLevels>0)) { //report the upper multigrid levels - printf("------------------Multigrid Report---------------------\n"); - printf("-------------------------------------------------------\n"); - printf("level| Degree | dimension | Smoother \n"); - printf(" | Degree | (min,max,avg) | Smoother \n"); - printf("-------------------------------------------------------\n"); - } - - for(int lev=0; levNverts*mesh->Nelements: levels[lev]->Nrows; - hlong hNrows = (hlong) Nrows; - - dlong minNrows=0, maxNrows=0; - hlong totalNrows=0; - dfloat avgNrows; - - MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, mesh->comm); - MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, mesh->comm); - avgNrows = (dfloat) totalNrows/mesh->size; - - if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min - MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, mesh->comm); - - char smootherString[BUFSIZ]; - strcpy(smootherString, (char*) (options.getArgs("MULTIGRID SMOOTHER")).c_str()); - - if (mesh->rank==0){ - printf(" %3d | %3d | %10.2f | %s \n", - lev, levelDegree[lev], (dfloat)minNrows, smootherString); - printf(" | | %10.2f | \n", (dfloat)maxNrows); - printf(" | | %10.2f | \n", avgNrows); - } - } - if((mesh->rank==0)&&(numLevels>0)) - printf("-------------------------------------------------------\n"); } /* build degree 1 problem and pass to AMG */ @@ -289,18 +136,27 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd dlong nnzCoarseA; ogs_t *coarseogs; - elliptic_t* ellipticL = ellipticsN[1]; - int basisNp = ellipticL->mesh->Np; + //set up the base level + elliptic_t* ellipticCoarse; + if (Nmax>Nmin) { + int Nc = levelDegree[numMGLevels-1]; + int Nf = levelDegree[numMGLevels-2]; + printf("=============BUILDING MULTIGRID LEVEL OF DEGREE %d==================\n", Nmin); + ellipticCoarse = ellipticBuildMultigridLevel(elliptic,Nc,Nf); + } else { + ellipticCoarse = elliptic; + } + int basisNp = ellipticCoarse->mesh->Np; dfloat *basis = NULL; - if (options.compareArgs("BASIS","BERN")) basis = ellipticL->mesh->VB; + if (options.compareArgs("BASIS","BERN")) basis = ellipticCoarse->mesh->VB; hlong *coarseGlobalStarts = (hlong*) calloc(mesh->size+1, sizeof(hlong)); if (options.compareArgs("DISCRETIZATION","IPDG")) { - ellipticBuildIpdg(ellipticL, basisNp, basis, lambda, &coarseA, &nnzCoarseA,coarseGlobalStarts); + ellipticBuildIpdg(ellipticCoarse, basisNp, basis, lambda, &coarseA, &nnzCoarseA,coarseGlobalStarts); } else if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { - ellipticBuildContinuous(ellipticL,lambda,&coarseA,&nnzCoarseA,&coarseogs,coarseGlobalStarts); + ellipticBuildContinuous(ellipticCoarse,lambda,&coarseA,&nnzCoarseA,&coarseogs,coarseGlobalStarts); } hlong *Rows = (hlong *) calloc(nnzCoarseA, sizeof(hlong)); @@ -312,168 +168,124 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd Cols[i] = coarseA[i].col; Vals[i] = coarseA[i].val; } + free(coarseA); // build amg starting at level N=1 - parAlmondAgmgSetup(precon->parAlmond, - coarseGlobalStarts, - nnzCoarseA, - Rows, - Cols, - Vals, - elliptic->allNeumann, - elliptic->allNeumannPenalty); - free(coarseA); free(Rows); free(Cols); free(Vals); - - //tell parAlmond to gather this level - agmgLevel *coarseLevel = precon->parAlmond->levels[numLevels-1]; - if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { - coarseLevel->gatherLevel = true; - coarseLevel->weightedInnerProds = false; - - coarseLevel->Srhs = (dfloat*) calloc(ellipticL->mesh->Np*ellipticL->mesh->Nelements,sizeof(dfloat)); - coarseLevel->Sx = (dfloat*) calloc(ellipticL->mesh->Np*ellipticL->mesh->Nelements,sizeof(dfloat)); - coarseLevel->o_Srhs = ellipticL->mesh->device.malloc(ellipticL->mesh->Np*ellipticL->mesh->Nelements*sizeof(dfloat),coarseLevel->Srhs); - coarseLevel->o_Sx = ellipticL->mesh->device.malloc(ellipticL->mesh->Np*ellipticL->mesh->Nelements*sizeof(dfloat),coarseLevel->Sx); - - coarseLevel->gatherArgs = (void **) calloc(3,sizeof(void*)); - coarseLevel->gatherArgs[0] = (void *) ellipticL; - coarseLevel->gatherArgs[1] = (void *) ellipticL->ogs; - coarseLevel->gatherArgs[2] = (void *) &(coarseLevel->o_Sx); - coarseLevel->scatterArgs = coarseLevel->gatherArgs; - - coarseLevel->device_gather = ellipticGather; - coarseLevel->device_scatter = ellipticScatter; + parAlmond::AMGSetup(precon->parAlmond, + coarseGlobalStarts, + nnzCoarseA, + Rows, + Cols, + Vals, + elliptic->allNeumann, + elliptic->allNeumannPenalty); + free(Rows); free(Cols); free(Vals); + + //overwrite the finest AMG level with the degree 1 matrix free level + // delete levels[numMGLevels-1]; + if (Nmax>Nmin) { + int Nc = levelDegree[numMGLevels-1]; + int Nf = levelDegree[numMGLevels-2]; + elliptic_t *ellipticFine = ((MGLevel*) levels[numMGLevels-2])->elliptic; + levels[numMGLevels-1] = new MGLevel(elliptic, + meshLevels, + ellipticFine, + ellipticCoarse, + lambda, + Nf, Nc, + options, + precon->parAlmond->ktype, mesh->comm); + } else { + levels[numMGLevels-1] = new MGLevel(ellipticCoarse, lambda, Nmin, options, + precon->parAlmond->ktype, mesh->comm); } + MGLevelAllocateStorage((MGLevel*) levels[numMGLevels-1], numMGLevels-1, + precon->parAlmond->ctype); - /* build coarsening and prologation operators to connect levels */ - for(int n=1; nelementType==TRIANGLES||elliptic->elementType==TETRAHEDRA){ - buildCoarsenerTriTet(ellipticL, meshLevels, Nf, Nc); + //tell parAlmond to gather when going to the next level + if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { + if (precon->parAlmond->numLevels > numMGLevels) { + parAlmond::agmgLevel *nextLevel + = (parAlmond::agmgLevel*)precon->parAlmond->levels[numMGLevels]; + + nextLevel->gatherLevel = true; + nextLevel->ogs = ellipticCoarse->ogs; + nextLevel->Gx = (dfloat*) calloc(levels[numMGLevels-1]->Ncols,sizeof(dfloat)); + nextLevel->Sx = (dfloat*) calloc(ellipticCoarse->mesh->Np*ellipticCoarse->mesh->Nelements,sizeof(dfloat)); + nextLevel->o_Gx = ellipticCoarse->mesh->device.malloc(levels[numMGLevels-1]->Ncols*sizeof(dfloat),nextLevel->Gx); + nextLevel->o_Sx = ellipticCoarse->mesh->device.malloc(ellipticCoarse->mesh->Np*ellipticCoarse->mesh->Nelements*sizeof(dfloat),nextLevel->Sx); } else { - buildCoarsenerQuadHex(ellipticL, meshLevels, Nf, Nc); + //this level is the base + parAlmond::coarseSolver *coarseLevel = precon->parAlmond->coarseLevel; + + coarseLevel->gatherLevel = true; + coarseLevel->ogs = ellipticCoarse->ogs; + coarseLevel->Gx = (dfloat*) calloc(coarseLevel->ogs->Ngather,sizeof(dfloat)); + coarseLevel->Sx = (dfloat*) calloc(ellipticCoarse->mesh->Np*ellipticCoarse->mesh->Nelements,sizeof(dfloat)); + coarseLevel->o_Gx = ellipticCoarse->mesh->device.malloc(coarseLevel->ogs->Ngather*sizeof(dfloat),coarseLevel->Gx); + coarseLevel->o_Sx = ellipticCoarse->mesh->device.malloc(ellipticCoarse->mesh->Np*ellipticCoarse->mesh->Nelements*sizeof(dfloat),coarseLevel->Sx); } - - levels[n]->coarsenArgs = (void **) calloc(2,sizeof(void*)); - levels[n]->coarsenArgs[0] = (void *) ellipticL; - levels[n]->coarsenArgs[1] = (void *) ellipticF; - - levels[n]->prolongateArgs = levels[n]->coarsenArgs; - - levels[n]->device_coarsen = ellipticMultigridCoarsen; - levels[n]->device_prolongate = ellipticMultigridProlongate; } for (int n=1;nN+1;n++) free(meshLevels[n]); free(meshLevels); -} - - - -void buildCoarsenerTriTet(elliptic_t* elliptic, mesh_t **meshLevels, int Nf, int Nc) { - - int NpFine = meshLevels[Nf]->Np; - int NpCoarse = meshLevels[Nc]->Np; - dfloat *P = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); - dfloat *Ptmp = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); - - //initialize P as identity (which it is for SPARSE) - for (int i=0;iNp; - int Np = meshLevels[n ]->Np; - - //copy P - for (int i=0;iinterpRaise[i*Np+k]*Ptmp[k*NpCoarse + j]; - } - } + //report top levels + if (options.compareArgs("VERBOSE","TRUE")) { + if (mesh->rank==0) { //report the upper multigrid levels + printf("------------------Multigrid Report----------------------------------------\n"); + printf("--------------------------------------------------------------------------\n"); + printf("level| Type | dimension | nnz per row | Smoother |\n"); + printf(" | | (min,max,avg) | (min,max,avg) | |\n"); + printf("--------------------------------------------------------------------------\n"); } - } - if (elliptic->options.compareArgs("BASIS","BERN")) { - dfloat* BBP = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); - for (int j=0;jinvVB[l+j*NpFine]*P[k+l*NpCoarse]*meshLevels[Nc]->VB[i+k*NpCoarse]; - } - } - } - } - for (int j=0;jparAlmond->numLevels; lev++) { + if(mesh->rank==0) {printf(" %3d ", lev);fflush(stdout);} + levels[lev]->Report(); } - free(BBP); - } - //the coarsen matrix is P^T - elliptic->R = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); - for (int i=0;iR[i*NpFine+j] = P[j*NpCoarse+i]; - } + if (mesh->rank==0) + printf("--------------------------------------------------------------------------\n"); } - elliptic->o_R = elliptic->mesh->device.malloc(NpFine*NpCoarse*sizeof(dfloat), elliptic->R); - - free(P); free(Ptmp); } -void buildCoarsenerQuadHex(elliptic_t* elliptic, mesh_t **meshLevels, int Nf, int Nc) { - - int NqFine = Nf+1; - int NqCoarse = Nc+1; - dfloat *P = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat)); - dfloat *Ptmp = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat)); - //initialize P as identity - for (int i=0;iinterpRaise[i*Nq+k]*Ptmp[k*NqCoarse + j]; - } - } +void MGLevelAllocateStorage(MGLevel *level, int k, parAlmond::CycleType ctype) { + // extra storage for smoothing op + size_t Nbytes = level->Ncols*sizeof(dfloat); + if (MGLevel::smootherResidualBytes < Nbytes) { + if (MGLevel::o_smootherResidual.size()) { + free(MGLevel::smootherResidual); + MGLevel::o_smootherResidual.free(); + MGLevel::o_smootherResidual2.free(); + MGLevel::o_smootherUpdate.free(); } + + MGLevel::smootherResidual = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); + MGLevel::o_smootherResidual = level->mesh->device.malloc(Nbytes,MGLevel::smootherResidual); + MGLevel::o_smootherResidual2 = level->mesh->device.malloc(Nbytes,MGLevel::smootherResidual); + MGLevel::o_smootherUpdate = level->mesh->device.malloc(Nbytes,MGLevel::smootherResidual); + MGLevel::smootherResidualBytes = Nbytes; } - //the coarsen matrix is P^T - elliptic->R = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat)); - for (int i=0;iR[i*NqFine+j] = P[j*NqCoarse+i]; + if (k) level->x = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); + if (k) level->rhs = (dfloat *) calloc(level->Nrows,sizeof(dfloat)); + if (k) level->o_x = level->mesh->device.malloc(level->Ncols*sizeof(dfloat),level->x); + if (k) level->o_rhs = level->mesh->device.malloc(level->Nrows*sizeof(dfloat),level->rhs); + + level->res = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); + level->o_res = level->mesh->device.malloc(level->Ncols*sizeof(dfloat),level->res); + + //kcycle vectors + if (ctype==parAlmond::KCYCLE) { + if ((k>0) && (kck = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); + level->vk = (dfloat *) calloc(level->Nrows,sizeof(dfloat)); + level->wk = (dfloat *) calloc(level->Nrows,sizeof(dfloat)); + level->o_ck = level->mesh->device.malloc(level->Ncols*sizeof(dfloat),level->ck); + level->o_vk = level->mesh->device.malloc(level->Nrows*sizeof(dfloat),level->vk); + level->o_wk = level->mesh->device.malloc(level->Nrows*sizeof(dfloat),level->wk); } } - elliptic->o_R = elliptic->mesh->device.malloc(NqFine*NqCoarse*sizeof(dfloat), elliptic->R); - - free(P); free(Ptmp); } diff --git a/solvers/elliptic/src/ellipticPreconditioner.c b/solvers/elliptic/src/ellipticPreconditioner.c index c824aa7ec..7a3a3f1fd 100644 --- a/solvers/elliptic/src/ellipticPreconditioner.c +++ b/solvers/elliptic/src/ellipticPreconditioner.c @@ -32,14 +32,29 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda, mesh_t *mesh = elliptic->mesh; precon_t *precon = elliptic->precon; setupAide options = elliptic->options; - - if ( options.compareArgs("PRECONDITIONER", "FULLALMOND") - || options.compareArgs("PRECONDITIONER", "MULTIGRID")) { + + if (options.compareArgs("PRECONDITIONER", "MULTIGRID")) { occaTimerTic(mesh->device,"parALMOND"); - parAlmondPrecon(precon->parAlmond, o_z, o_r); + parAlmond::Precon(precon->parAlmond, o_z, o_r); occaTimerToc(mesh->device,"parALMOND"); + } else if (options.compareArgs("PRECONDITIONER", "FULLALMOND")) { + + if (options.compareArgs("DISCRETIZATION", "IPDG")) { + occaTimerTic(mesh->device,"parALMOND"); + parAlmond::Precon(precon->parAlmond, o_z, o_r); + occaTimerToc(mesh->device,"parALMOND"); + } else if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) { + ogsGather(precon->o_rhsG, o_r, ogsDfloat, ogsAdd, elliptic->ogs); + elliptic->dotMultiplyKernel(elliptic->ogs->Ngather, + elliptic->ogs->o_gatherInvDegree, precon->o_rhsG, precon->o_rhsG); + occaTimerTic(mesh->device,"parALMOND"); + parAlmond::Precon(precon->parAlmond, precon->o_xG, precon->o_rhsG); + occaTimerToc(mesh->device,"parALMOND"); + ogsScatter(o_z, precon->o_xG, ogsDfloat, ogsAdd, elliptic->ogs); + } + } else if(options.compareArgs("PRECONDITIONER", "MASSMATRIX")){ dfloat invLambda = 1./lambda; @@ -53,20 +68,20 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda, elliptic->dotMultiplyKernel(mesh->Nelements*mesh->Np, ogs->o_invDegree, o_r, elliptic->o_rtmp); - if(mesh->NglobalGatherElements) - precon->partialblockJacobiKernel(mesh->NglobalGatherElements, + if(mesh->NglobalGatherElements) + precon->partialblockJacobiKernel(mesh->NglobalGatherElements, mesh->o_globalGatherElementList, invLambda, mesh->o_vgeo, precon->o_invMM, elliptic->o_rtmp, o_z); ogsGatherScatterStart(o_z, ogsDfloat, ogsAdd, ogs); if(mesh->NlocalGatherElements) - precon->partialblockJacobiKernel(mesh->NlocalGatherElements, + precon->partialblockJacobiKernel(mesh->NlocalGatherElements, mesh->o_localGatherElementList, invLambda, mesh->o_vgeo, precon->o_invMM, elliptic->o_rtmp, o_z); - + ogsGatherScatterFinish(o_z, ogsDfloat, ogsAdd, ogs); - + elliptic->dotMultiplyKernel(mesh->Nelements*mesh->Np, ogs->o_invDegree, o_z, o_z); //post-mask @@ -81,7 +96,7 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda, precon->SEMFEMInterpKernel(mesh->Nelements,mesh->o_SEMFEMAnterp,o_z,precon->o_rFEM); ogsGather(precon->o_GrFEM, precon->o_rFEM, ogsDfloat, ogsAdd, precon->FEMogs); occaTimerTic(mesh->device,"parALMOND"); - parAlmondPrecon(precon->parAlmond, precon->o_GzFEM, precon->o_GrFEM); + parAlmond::Precon(precon->parAlmond, precon->o_GzFEM, precon->o_GrFEM); occaTimerToc(mesh->device,"parALMOND"); ogsScatter(precon->o_zFEM, precon->o_GzFEM, ogsDfloat, ogsAdd, precon->FEMogs); precon->SEMFEMAnterpKernel(mesh->Nelements,mesh->o_SEMFEMAnterp,precon->o_zFEM,o_z); @@ -90,9 +105,13 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda, ogsGatherScatter(o_z, ogsDfloat, ogsAdd, elliptic->ogs); if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_z); } else { + ogsGather(precon->o_rhsG, o_r, ogsDfloat, ogsAdd, precon->FEMogs); + elliptic->dotMultiplyKernel(precon->FEMogs->Ngather, + precon->FEMogs->o_gatherInvDegree, precon->o_rhsG, precon->o_rhsG); occaTimerTic(mesh->device,"parALMOND"); - parAlmondPrecon(precon->parAlmond, o_z, o_r); + parAlmond::Precon(precon->parAlmond, precon->o_xG, precon->o_rhsG); occaTimerToc(mesh->device,"parALMOND"); + ogsScatter(o_z, precon->o_xG, ogsDfloat, ogsAdd, precon->FEMogs); } } else if(options.compareArgs("PRECONDITIONER", "JACOBI")){ @@ -102,7 +121,7 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda, occaTimerTic(mesh->device,"dotDivideKernel"); elliptic->dotMultiplyKernel(Ntotal, o_r, precon->o_invDiagA, o_z); occaTimerToc(mesh->device,"dotDivideKernel"); - + } else{ // turn off preconditioner o_z.copyFrom(o_r); } diff --git a/solvers/elliptic/src/ellipticPreconditionerSetup.c b/solvers/elliptic/src/ellipticPreconditionerSetup.c index fe7d5ab24..934addc3e 100644 --- a/solvers/elliptic/src/ellipticPreconditionerSetup.c +++ b/solvers/elliptic/src/ellipticPreconditionerSetup.c @@ -52,15 +52,16 @@ void ellipticPreconditionerSetup(elliptic_t *elliptic, ogs_t *ogs, dfloat lambda hlong *Rows = (hlong *) calloc(nnz, sizeof(hlong)); hlong *Cols = (hlong *) calloc(nnz, sizeof(hlong)); dfloat *Vals = (dfloat*) calloc(nnz,sizeof(dfloat)); - + for (dlong n=0;nparAlmond = parAlmondInit(mesh, options); - parAlmondAgmgSetup(precon->parAlmond, + precon->parAlmond = parAlmond::Init(mesh->device, mesh->comm, options); + parAlmond::AMGSetup(precon->parAlmond, globalStarts, nnz, Rows, @@ -68,70 +69,20 @@ void ellipticPreconditionerSetup(elliptic_t *elliptic, ogs_t *ogs, dfloat lambda Vals, elliptic->allNeumann, elliptic->allNeumannPenalty); - free(A); free(Rows); free(Cols); free(Vals); - - if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) {//tell parAlmond to gather this level - agmgLevel *baseLevel = precon->parAlmond->levels[0]; - - baseLevel->gatherLevel = true; - baseLevel->Srhs = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat)); - baseLevel->Sx = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat)); - baseLevel->o_Srhs = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat)); - baseLevel->o_Sx = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat)); + free(Rows); free(Cols); free(Vals); - baseLevel->weightedInnerProds = false; + if (options.compareArgs("VERBOSE", "TRUE")) + parAlmond::Report(precon->parAlmond); - baseLevel->gatherArgs = (void **) calloc(3,sizeof(void*)); - baseLevel->gatherArgs[0] = (void *) elliptic; - baseLevel->gatherArgs[1] = (void *) precon->ogs; - baseLevel->gatherArgs[2] = (void *) &(baseLevel->o_Sx); - baseLevel->scatterArgs = baseLevel->gatherArgs; + if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) {//tell parAlmond to gather this level + parAlmond::multigridLevel *baseLevel = precon->parAlmond->levels[0]; - baseLevel->device_gather = ellipticGather; - baseLevel->device_scatter = ellipticScatter; + precon->rhsG = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat)); + precon->xG = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat)); + precon->o_rhsG = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat)); + precon->o_xG = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat)); } -/* - if (strstr(options,"MATRIXFREE")&&strstr(options,"IPDG")) { //swap the top AMG level ops for matrix free versions - agmgLevel *baseLevel = precon->parAlmond->levels[0]; - - dfloat *vlambda = (dfloat *) calloc(1,sizeof(dfloat)); - *vlambda = lambda; - baseLevel->AxArgs = (void **) calloc(3,sizeof(void*)); - baseLevel->AxArgs[0] = (void *) elliptic; - baseLevel->AxArgs[1] = (void *) vlambda; - baseLevel->AxArgs[2] = (void *) options; - baseLevel->device_Ax = AxTri2D; - - baseLevel->smoothArgs = (void **) calloc(2,sizeof(void*)); - baseLevel->smoothArgs[0] = (void *) elliptic; - baseLevel->smoothArgs[1] = (void *) baseLevel; - baseLevel->device_smooth = smoothTri2D; - - baseLevel->smootherArgs = (void **) calloc(1,sizeof(void*)); - baseLevel->smootherArgs[0] = (void *) elliptic; - - baseLevel->Nrows = mesh->Nelements*mesh->Np; - baseLevel->Ncols = (mesh->Nelements+mesh->totalHaloPairs)*mesh->Np; - - // extra storage for smoothing op - baseLevel->o_smootherResidual = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat),baseLevel->x); - - dfloat rateTolerance; // 0 - accept not approximate patches, 1 - accept all approximate patches - if(strstr(options, "EXACT")){ - rateTolerance = 0.0; - } else { - rateTolerance = 1.0; - } - - //set up the fine problem smoothing - if(strstr(options, "LOCALPATCH")){ - ellipticSetupSmootherLocalPatch(elliptic, precon, baseLevel, tau, lambda, BCType, rateTolerance, options); - } else { //default to damped jacobi - ellipticSetupSmootherDampedJacobi(elliptic, precon, baseLevel, tau, lambda, BCType, options); - } - } -*/ } else if (options.compareArgs("PRECONDITIONER", "MASSMATRIX")){ precon->o_invMM = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), mesh->invMM); diff --git a/solvers/elliptic/src/ellipticSEMFEMSetup.c b/solvers/elliptic/src/ellipticSEMFEMSetup.c index 59a8cd9a7..a09d996e0 100644 --- a/solvers/elliptic/src/ellipticSEMFEMSetup.c +++ b/solvers/elliptic/src/ellipticSEMFEMSetup.c @@ -108,7 +108,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) memcpy(femMesh,mesh,sizeof(mesh_t)); if (elliptic->elementType==TRIANGLES) { - + //set semfem nodes as the grid points pmesh->Np = mesh->NpFEM; pmesh->r = mesh->rFEM; @@ -139,7 +139,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) if( (pmesh->r[n]+1)*(pmesh->r[n]+1)+(pmesh->s[n]-1)*(pmesh->s[n]-1)vertexNodes[2] = n; } - + // connect elements using parallel sort meshParallelConnect(pmesh); @@ -208,7 +208,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) // global nodes meshParallelConnectNodes(pmesh); - //pmesh->globalIds is now populated + //pmesh->globalIds is now populated } @@ -220,32 +220,32 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) femMesh->EToV = (hlong*) calloc(femMesh->Nelements*femMesh->Nverts, sizeof(hlong)); femMesh->EX = (dfloat*) calloc(femMesh->Nverts*femMesh->Nelements, sizeof(dfloat)); femMesh->EY = (dfloat*) calloc(femMesh->Nverts*femMesh->Nelements, sizeof(dfloat)); - if (elliptic->dim==3) + if (elliptic->dim==3) femMesh->EZ = (dfloat*) calloc(femMesh->Nverts*femMesh->Nelements, sizeof(dfloat)); - + dlong *localIds = (dlong *) calloc(femMesh->Nverts*femMesh->Nelements,sizeof(dlong)); // dlong NFEMverts = mesh->Nelements*mesh->NpFEM; for(dlong e=0;eNelements;++e){ for (int n=0;nNelFEM;n++) { dlong id[femMesh->Nverts]; - + dlong femId = e*mesh->NelFEM*mesh->Nverts+n*mesh->Nverts; for (int i=0;iNverts;i++) { //local ids in the subelement fem grid - id[i] = e*mesh->NpFEM + mesh->FEMEToV[n*mesh->Nverts+i]; - + id[i] = e*mesh->NpFEM + mesh->FEMEToV[n*mesh->Nverts+i]; + /* read vertex triplet for triangle */ femMesh->EToV[femId+i] = pmesh->globalIds[id[i]]; - + femMesh->EX[femId+i] = pmesh->x[id[i]]; femMesh->EY[femId+i] = pmesh->y[id[i]]; - if (elliptic->dim==3) + if (elliptic->dim==3) femMesh->EZ[femId+i] = pmesh->z[id[i]]; } - + switch(elliptic->elementType){ case TRIANGLES: localIds[femId+0] = id[0]; @@ -310,9 +310,9 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) for (int n=0;nNelFEM;n++) { for (int f=0;fNfaces;f++) { - + for (int face=0; faceNfaces;face++) { - + //count the nodes on this face which are on a macro face int NvertsOnFace = 0; for (int i=0;iNfp;i++){ @@ -322,7 +322,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) } if (NvertsOnFace == femMesh->Nfp) femFaceMap[n*femMesh->Nfaces+f] = face; //on macro face - } + } } } @@ -401,19 +401,19 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) } } } - ogsGatherScatter(mapB, ogsInt, ogsMin, pmesh->ogs); + ogsGatherScatter(mapB, ogsInt, ogsMin, pmesh->ogs); //use the bc flags to find masked ids for (dlong n=0;nNelements*pmesh->Np;n++) { if (mapB[n] == 1) { //Dirichlet boundary pmesh->maskedGlobalIds[n] = 0; } - } - free(mapB); + } + free(mapB); } else { //mask using the original mask - for (dlong n=0;nNmasked;n++) + for (dlong n=0;nNmasked;n++) pmesh->maskedGlobalIds[elliptic->maskIds[n]] = 0; } @@ -475,7 +475,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) } } } - } + } } else if (elliptic->elementType==TETRAHEDRA) { //build stiffness matrices femMesh->Srr = (dfloat *) calloc(femMesh->Np*femMesh->Np,sizeof(dfloat)); @@ -505,7 +505,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) } } } - + if (mesh->rank==0) printf("Building full SEMFEM matrix..."); fflush(stdout); // Build non-zeros of stiffness matrix (unassembled) @@ -528,8 +528,8 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) BuildFEMMatrixTet3D(femMesh,pmesh,lambda, localIds, globalNumbering, globalOwners,&cnt,sendNonZeros); break; case HEXAHEDRA: BuildFEMMatrixHex3D(femMesh,pmesh,lambda, localIds, globalNumbering, globalOwners,&cnt,sendNonZeros); break; - } - + } + // Make the MPI_NONZERO_T data type MPI_Datatype MPI_NONZERO_T; MPI_Datatype dtype[4] = {MPI_HLONG, MPI_HLONG, MPI_INT, MPI_DFLOAT}; @@ -601,9 +601,10 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) Cols[n] = A[n].col; Vals[n] = A[n].val; } + free(A); - precon->parAlmond = parAlmondInit(mesh, options); - parAlmondAgmgSetup(precon->parAlmond, + precon->parAlmond = parAlmond::Init(mesh->device, mesh->comm, options); + parAlmond::AMGSetup(precon->parAlmond, globalStarts, nnz, Rows, @@ -611,13 +612,16 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) Vals, elliptic->allNeumann, elliptic->allNeumannPenalty); - free(A); free(Rows); free(Cols); free(Vals); + free(Rows); free(Cols); free(Vals); + + if (options.compareArgs("VERBOSE", "TRUE")) + parAlmond::Report(precon->parAlmond); if (elliptic->elementType==TRIANGLES||elliptic->elementType==TETRAHEDRA) { - //tell parAlmond not to gather this level (its done manually) - agmgLevel *baseLevel = precon->parAlmond->levels[0]; - baseLevel->gatherLevel = false; - baseLevel->weightedInnerProds = false; + // //tell parAlmond not to gather this level (its done manually) + // agmgLevel *baseLevel = precon->parAlmond->levels[0]; + // baseLevel->gatherLevel = false; + // baseLevel->weightedInnerProds = false; // build interp and anterp dfloat *SEMFEMAnterp = (dfloat*) calloc(mesh->NpFEM*mesh->Np, sizeof(dfloat)); @@ -639,30 +643,36 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) precon->o_GzFEM = mesh->device.malloc(precon->FEMogs->Ngather*sizeof(dfloat)); } else { - //tell parAlmond to gather this level - agmgLevel *baseLevel = precon->parAlmond->levels[0]; + // //tell parAlmond to gather this level + // agmgLevel *baseLevel = precon->parAlmond->levels[0]; + + // baseLevel->gatherLevel = true; + parAlmond::multigridLevel *baseLevel = precon->parAlmond->levels[0]; + precon->rhsG = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat)); + precon->xG = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat)); + precon->o_rhsG = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat)); + precon->o_xG = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat)); - baseLevel->gatherLevel = true; - baseLevel->Srhs = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat)); - baseLevel->Sx = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat)); - baseLevel->o_Srhs = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat)); - baseLevel->o_Sx = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat)); + // baseLevel->Srhs = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat)); + // baseLevel->Sx = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat)); + // baseLevel->o_Srhs = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat)); + // baseLevel->o_Sx = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat)); - baseLevel->weightedInnerProds = false; + // baseLevel->weightedInnerProds = false; - baseLevel->gatherArgs = (void **) calloc(3,sizeof(void*)); - baseLevel->gatherArgs[0] = (void *) elliptic; - baseLevel->gatherArgs[1] = (void *) precon->FEMogs; //use the gs made from the partial gathered femgrid - baseLevel->gatherArgs[2] = (void *) &(baseLevel->o_Sx); - baseLevel->scatterArgs = baseLevel->gatherArgs; + // baseLevel->gatherArgs = (void **) calloc(3,sizeof(void*)); + // baseLevel->gatherArgs[0] = (void *) elliptic; + // baseLevel->gatherArgs[1] = (void *) precon->FEMogs; //use the gs made from the partial gathered femgrid + // baseLevel->gatherArgs[2] = (void *) &(baseLevel->o_Sx); + // baseLevel->scatterArgs = baseLevel->gatherArgs; - baseLevel->device_gather = ellipticGather; - baseLevel->device_scatter = ellipticScatter; + // baseLevel->device_gather = ellipticGather; + // baseLevel->device_scatter = ellipticScatter; } } -void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, +void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering, int *globalOwners, dlong *cnt, nonZero_t *A) { @@ -705,7 +715,7 @@ void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, } } -void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, +void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering, int *globalOwners, dlong *cnt, nonZero_t *A) { @@ -777,7 +787,7 @@ void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, } } -void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, +void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering, int *globalOwners, dlong *cnt, nonZero_t *A) { @@ -828,7 +838,7 @@ void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, } } -void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, +void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong *localIds, hlong* globalNumbering, int *globalOwners, dlong *cnt, nonZero_t *A) { @@ -840,17 +850,17 @@ void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, dlong nn = nx+ny*femMesh->Nq+nz*femMesh->Nq*femMesh->Nq; dlong idn = localIds[e*femMesh->Np + nn]; if (globalNumbering[idn]<0) continue; //skip masked nodes - + for (int mz=0;mzNq;mz++) { for (int my=0;myNq;my++) { for (int mx=0;mxNq;mx++) { dlong mm = mx+my*femMesh->Nq+mz*femMesh->Nq*femMesh->Nq; dlong idm = localIds[e*femMesh->Np + mm]; if (globalNumbering[idm]<0) continue; //skip masked nodes - + int id; dfloat val = 0.; - + if ((ny==my)&&(nz==mz)) { for (int k=0;kNq;k++) { id = k+ny*femMesh->Nq+nz*femMesh->Nq*femMesh->Nq; @@ -888,7 +898,7 @@ void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, val += Gss*femMesh->D[ny+k*femMesh->Nq]*femMesh->D[my+k*femMesh->Nq]; } } - + if (nx==mx) { id = nx+my*femMesh->Nq+nz*femMesh->Nq*femMesh->Nq; dfloat Gst = femMesh->ggeo[e*femMesh->Np*femMesh->Nggeo + id + G12ID*femMesh->Np]; @@ -907,13 +917,13 @@ void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, val += Gtt*femMesh->D[nz+k*femMesh->Nq]*femMesh->D[mz+k*femMesh->Nq]; } } - + if ((nx==mx)&&(ny==my)&&(nz==mz)) { id = nx + ny*femMesh->Nq+nz*femMesh->Nq*femMesh->Nq; dfloat JW = femMesh->ggeo[e*femMesh->Np*femMesh->Nggeo + id + GWJID*femMesh->Np]; val += JW*lambda; } - + // pack non-zero dfloat nonZeroThreshold = 1e-7; if (fabs(val) >= nonZeroThreshold) { diff --git a/solvers/elliptic/src/ellipticSetup.c b/solvers/elliptic/src/ellipticSetup.c index 6b08a95b8..eaab788fa 100644 --- a/solvers/elliptic/src/ellipticSetup.c +++ b/solvers/elliptic/src/ellipticSetup.c @@ -31,7 +31,7 @@ SOFTWARE. void reportMemoryUsage(occa::device &device, const char *mess); elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelInfo, setupAide options){ - + elliptic_t *elliptic = (elliptic_t*) calloc(1, sizeof(elliptic_t)); options.getArgs("MESH DIMENSION", elliptic->dim); @@ -50,8 +50,9 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI else meshOccaSetup2D(mesh, options, kernelInfo); - reportMemoryUsage(mesh->device, "after occa setup"); - + if (mesh->rank==0) + reportMemoryUsage(mesh->device, "after occa setup"); + // Boundary Type translation. Just default from the mesh file. int BCType[3] = {0,1,2}; elliptic->BCType = (int*) calloc(3,sizeof(int)); @@ -62,12 +63,12 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI if(options.compareArgs("DISCRETIZATION","CONTINUOUS")){ if(options.compareArgs("ELEMENT MAP", "TRILINEAR")){ printf("mesh->dim = %d, mesh->Nverts = %d\n", mesh->dim, mesh->Nverts); - + // pack gllz, gllw, and elementwise EXYZ hlong Nxyz = mesh->Nelements*mesh->dim*mesh->Nverts; dfloat *EXYZ = (dfloat*) calloc(Nxyz, sizeof(dfloat)); dfloat *gllzw = (dfloat*) calloc(2*mesh->Nq, sizeof(dfloat)); - + int sk = 0; for(int n=0;nNq;++n) gllzw[sk++] = mesh->gllz[n]; @@ -83,7 +84,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI for(int v=0;vNverts;++v) EXYZ[sk++] = mesh->EZ[e*mesh->Nverts+v]; } - + // nodewise ggeo with element coordinates and gauss node info elliptic->o_EXYZ = mesh->device.malloc(Nxyz*sizeof(dfloat), EXYZ); elliptic->o_gllzw = mesh->device.malloc(2*mesh->Nq*sizeof(dfloat), gllzw); @@ -94,7 +95,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI } } - // + // ellipticSolveSetup(elliptic, lambda, kernelInfo); @@ -105,12 +106,12 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI // load forcing into r for(dlong e=0;eNelements;++e){ for(int n=0;nNp;++n){ - + dfloat J; if (elliptic->elementType==TRIANGLES || elliptic->elementType==TETRAHEDRA) { J = mesh->vgeo[e*mesh->Nvgeo+JID]; } else { - J = mesh->vgeo[mesh->Np*(e*mesh->Nvgeo + JID) + n]; + J = mesh->vgeo[mesh->Np*(e*mesh->Nvgeo + JID) + n]; } dlong id = n+e*mesh->Np; dfloat xn = mesh->x[id]; @@ -119,7 +120,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI if(elliptic->dim==2) elliptic->r[id] = J*(2*M_PI*M_PI+lambda)*sin(M_PI*xn)*sin(M_PI*yn); - else + else elliptic->r[id] = J*(3*M_PI*M_PI+lambda)*cos(M_PI*xn)*cos(M_PI*yn)*cos(M_PI*zn); elliptic->x[id] = 0; } @@ -135,13 +136,13 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI elliptic->o_x = mesh->device.malloc(Nall*sizeof(dfloat), elliptic->x); - string boundaryHeaderFileName; + string boundaryHeaderFileName; options.getArgs("DATA FILE", boundaryHeaderFileName); kernelInfo["includes"] += (char*)boundaryHeaderFileName.c_str(); // set kernel name suffix char *suffix; - + if(elliptic->elementType==TRIANGLES) suffix = strdup("Tri2D"); if(elliptic->elementType==QUADRILATERALS) @@ -159,7 +160,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI if(r==mesh->rank){ sprintf(fileName, DELLIPTIC "/okl/ellipticRhsBCIpdg%s.okl", suffix); sprintf(kernelName, "ellipticRhsBCIpdg%s", suffix); - + elliptic->rhsBCIpdgKernel = mesh->device.buildKernel(fileName,kernelName, kernelInfo); } MPI_Barrier(mesh->comm); @@ -186,17 +187,17 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI if(r==mesh->rank){ sprintf(fileName, DELLIPTIC "/okl/ellipticRhsBC%s.okl", suffix); sprintf(kernelName, "ellipticRhsBC%s", suffix); - + elliptic->rhsBCKernel = mesh->device.buildKernel(fileName,kernelName, kernelInfo); - + sprintf(fileName, DELLIPTIC "/okl/ellipticAddBC%s.okl", suffix); sprintf(kernelName, "ellipticAddBC%s", suffix); - + elliptic->addBCKernel = mesh->device.buildKernel(fileName,kernelName, kernelInfo); } MPI_Barrier(mesh->comm); } - + dfloat zero = 0.f; elliptic->rhsBCKernel(mesh->Nelements, mesh->o_ggeo, @@ -217,7 +218,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI // gather-scatter if(options.compareArgs("DISCRETIZATION","CONTINUOUS")){ - ogsGatherScatter(elliptic->o_r, ogsDfloat, ogsAdd, mesh->ogs); + ogsGatherScatter(elliptic->o_r, ogsDfloat, ogsAdd, mesh->ogs); if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, elliptic->o_r); } diff --git a/solvers/elliptic/src/ellipticSmoother.c b/solvers/elliptic/src/ellipticSmoother.c deleted file mode 100644 index 6441e9d30..000000000 --- a/solvers/elliptic/src/ellipticSmoother.c +++ /dev/null @@ -1,138 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "elliptic.h" - -void ellipticMultigridSmooth(void **args, occa::memory &o_r, occa::memory &o_x, bool xIsZero) { - - elliptic_t *elliptic = (elliptic_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - occa::memory o_res = level->o_smootherResidual; - - if (xIsZero) { - level->device_smoother(level->smootherArgs, o_r, o_x); - return; - } - - dfloat one = 1.; dfloat mone = -1.; - - //res = r-Ax - level->device_Ax(level->AxArgs,o_x,o_res); - elliptic->scaledAddKernel(level->Nrows,one, o_r, mone, o_res); - - //smooth the fine problem x = x + S(r-Ax) - level->device_smoother(level->smootherArgs, o_res, o_res); - elliptic->scaledAddKernel(level->Nrows,one, o_res, one, o_x); -} - -void ellipticMultigridSmoothChebyshev(void **args, occa::memory &o_r, occa::memory &o_x, bool xIsZero) { - - elliptic_t *elliptic = (elliptic_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - dfloat lambdaN = level->smoother_params[0]; - dfloat lambda1 = level->smoother_params[1]; - - dfloat theta = 0.5*(lambdaN+lambda1); - dfloat delta = 0.5*(lambdaN-lambda1); - dfloat invTheta = 1.0/theta; - dfloat sigma = theta/delta; - dfloat rho_n = 1./sigma; - dfloat rho_np1; - - dfloat one = 1., mone = -1., zero = 0.0; - - occa::memory o_res = level->o_smootherResidual; - occa::memory o_Ad = level->o_smootherResidual2; - occa::memory o_d = level->o_smootherUpdate; - - if(xIsZero){ //skip the Ax if x is zero - //res = Sr - level->device_smoother(level->smootherArgs, o_r, o_res); - - //d = invTheta*res - elliptic->scaledAddKernel(level->Nrows, invTheta, o_res, zero, o_d); - } else { - //res = S(r-Ax) - level->device_Ax(level->AxArgs,o_x,o_res); - elliptic->scaledAddKernel(level->Nrows, one, o_r, mone, o_res); - level->device_smoother(level->smootherArgs, o_res, o_res); - - //d = invTheta*res - elliptic->scaledAddKernel(level->Nrows, invTheta, o_res, zero, o_d); - } - - for (int k=0;kChebyshevIterations;k++) { - //x_k+1 = x_k + d_k - if (xIsZero&&(k==0)) - elliptic->scaledAddKernel(level->Nrows, one, o_d, zero, o_x); - else - elliptic->scaledAddKernel(level->Nrows, one, o_d, one, o_x); - - //r_k+1 = r_k - SAd_k - level->device_Ax(level->AxArgs,o_d,o_Ad); - level->device_smoother(level->smootherArgs, o_Ad, o_Ad); - elliptic->scaledAddKernel(level->Nrows, mone, o_Ad, one, o_res); - - rho_np1 = 1.0/(2.*sigma-rho_n); - dfloat rhoDivDelta = 2.0*rho_np1/delta; - - //d_k+1 = rho_k+1*rho_k*d_k + 2*rho_k+1*r_k+1/delta - elliptic->scaledAddKernel(level->Nrows, rhoDivDelta, o_res, rho_np1*rho_n, o_d); - - rho_n = rho_np1; - } - //x_k+1 = x_k + d_k - elliptic->scaledAddKernel(level->Nrows, one, o_d, one, o_x); - -} - -void LocalPatch(void **args, occa::memory &o_r, occa::memory &o_Sr) { - - elliptic_t *elliptic = (elliptic_t*) args[0]; - mesh_t *mesh = elliptic->mesh; - precon_t *precon = elliptic->precon; - - occaTimerTic(mesh->device,"approxBlockJacobiSolveKernel"); - precon->approxBlockJacobiSolverKernel(mesh->Nelements, - precon->o_patchesIndex, - precon->o_invAP, - precon->o_invDegreeAP, - o_r, - o_Sr); - occaTimerToc(mesh->device,"approxBlockJacobiSolveKernel"); -} - -void dampedJacobi(void **args, occa::memory &o_r, occa::memory &o_Sr) { - - elliptic_t *elliptic = (elliptic_t *) args[0]; - mesh_t *mesh = elliptic->mesh; - - occa::memory o_invDiagA = elliptic->precon->o_invDiagA; - - elliptic->dotMultiplyKernel(mesh->Np*mesh->Nelements,o_invDiagA,o_r,o_Sr); -} \ No newline at end of file diff --git a/solvers/elliptic/src/ellipticSmootherSetup.c b/solvers/elliptic/src/ellipticSmootherSetup.c deleted file mode 100644 index 3202a68aa..000000000 --- a/solvers/elliptic/src/ellipticSmootherSetup.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "elliptic.h" - -typedef struct{ - - dlong localId; - hlong baseId; - int haloFlag; - -} preconGatherInfo_t; - -int parallelCompareBaseId(const void *a, const void *b){ - - preconGatherInfo_t *fa = (preconGatherInfo_t*) a; - preconGatherInfo_t *fb = (preconGatherInfo_t*) b; - - if(fa->baseId < fb->baseId) return -1; - if(fa->baseId > fb->baseId) return +1; - - return 0; -} - -void ellipticSetupSmootherLocalPatch(elliptic_t *elliptic, precon_t *precon, - agmgLevel *level, dfloat lambda, - dfloat rateTolerance) { - - dfloat *invAP; - dlong Npatches; - dlong *patchesIndex; - - mesh_t *mesh = elliptic->mesh; - setupAide options = elliptic->options; - - int NpP = mesh->Np; - - //initialize the full inverse operators on each 4 element patch - ellipticBuildLocalPatches(elliptic, lambda, rateTolerance, &Npatches, &patchesIndex, &invAP); - - precon->o_invAP = mesh->device.malloc(Npatches*NpP*NpP*sizeof(dfloat),invAP); - precon->o_patchesIndex = mesh->device.malloc(mesh->Nelements*sizeof(dlong), patchesIndex); - - dfloat *invDegree = (dfloat*) calloc(mesh->Nelements,sizeof(dfloat)); - for (dlong e=0;eNelements;e++) { - invDegree[e] = 1.0; - } - precon->o_invDegreeAP = mesh->device.malloc(mesh->Nelements*sizeof(dfloat),invDegree); - - level->device_smoother = LocalPatch; - - //estimate the max eigenvalue of S*A - dfloat rho = maxEigSmoothAx(elliptic, level); - - if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) { - - level->smoother_params = (dfloat *) calloc(2,sizeof(dfloat)); - - level->smoother_params[0] = rho; - level->smoother_params[1] = rho/10.; - - } else { - - //set the stabilty weight (jacobi-type interation) - dfloat weight = (4./3.)/rho; - - for (dlong e=0;eNelements;e++) - invDegree[e] *= weight; - - //update with weight - precon->o_invDegreeAP.copyFrom(invDegree); - } - free(invDegree); -} - -void ellipticSetupSmootherDampedJacobi(elliptic_t *elliptic, precon_t *precon, - agmgLevel *level, dfloat lambda) { - - dfloat *invDiagA; - mesh_t *mesh = elliptic->mesh; - setupAide options = elliptic->options; - - ellipticBuildJacobi(elliptic,lambda, &invDiagA); - - precon->o_invDiagA = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat), invDiagA); - - level->device_smoother = dampedJacobi; - - //estimate the max eigenvalue of S*A - dfloat rho = maxEigSmoothAx(elliptic, level); - - if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) { - - level->smoother_params = (dfloat *) calloc(2,sizeof(dfloat)); - - level->smoother_params[0] = rho; - level->smoother_params[1] = rho/10.; - - } else { - - //set the stabilty weight (jacobi-type interation) - dfloat weight = (4./3.)/rho; - - for (dlong n=0;nNp*mesh->Nelements;n++) - invDiagA[n] *= weight; - - //update diagonal with weight - precon->o_invDiagA.copyFrom(invDiagA); - } - - free(invDiagA); -} - -static void eig(const int Nrows, double *A, double *WR, double *WI){ - - int NB = 256; - char JOBVL = 'V'; - char JOBVR = 'V'; - int N = Nrows; - int LDA = Nrows; - int LWORK = (NB+2)*N; - - double *WORK = new double[LWORK]; - double *VL = new double[Nrows*Nrows]; - double *VR = new double[Nrows*Nrows]; - - int INFO = -999; - - dgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI, - VL, &LDA, VR, &LDA, WORK, &LWORK, &INFO); - - - assert(INFO == 0); - - delete [] VL; - delete [] VR; - delete [] WORK; -} - -dfloat maxEigSmoothAx(elliptic_t* elliptic, agmgLevel *level){ - - mesh_t *mesh = elliptic->mesh; - setupAide options = elliptic->options; - - const dlong N = level->Nrows; - const dlong M = level->Ncols; - - int k = 10; - - hlong Nlocal = (hlong) level->Nrows; - hlong Ntotal = 0; - MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, mesh->comm); - if(k > Ntotal) k = (int) Ntotal; - - // do an arnoldi - - // allocate memory for Hessenberg matrix - double *H = (double *) calloc(k*k,sizeof(double)); - - // allocate memory for basis - dfloat *Vx = (dfloat*) calloc(M, sizeof(dfloat)); - occa::memory *o_V = (occa::memory *) calloc(k+1, sizeof(occa::memory)); - - occa::memory o_Vx = mesh->device.malloc(M*sizeof(dfloat),Vx); - occa::memory o_AVx = mesh->device.malloc(M*sizeof(dfloat),Vx); - - for(int i=0; i<=k; i++) - o_V[i] = mesh->device.malloc(M*sizeof(dfloat),Vx); - - // generate a random vector for initial basis vector - for (dlong i=0;iogs); - - for (dlong i=0;iNmasked;i++) Vx[elliptic->maskIds[i]] = 0.; - } - - o_Vx.copyFrom(Vx); //copy to device - dfloat norm_vo = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_Vx, o_Vx); - norm_vo = sqrt(norm_vo); - - ellipticScaledAdd(elliptic, 1./norm_vo, o_Vx, 0. , o_V[0]); - - for(int j=0; jdevice_Ax(level->AxArgs,o_V[j],o_AVx); - level->device_smoother(level->smootherArgs, o_AVx, o_V[j+1]); - - // modified Gram-Schmidth - for(int i=0; i<=j; i++){ - // H(i,j) = v[i]'*A*v[j] - dfloat hij = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_V[i], o_V[j+1]); - - // v[j+1] = v[j+1] - hij*v[i] - ellipticScaledAdd(elliptic, -hij, o_V[i], 1., o_V[j+1]); - - H[i + j*k] = (double) hij; - } - - if(j+1 < k){ - // v[j+1] = v[j+1]/||v[j+1]|| - dfloat norm_vj = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_V[j+1], o_V[j+1]); - norm_vj = sqrt(norm_vj); - ellipticScaledAdd(elliptic, 1/norm_vj, o_V[j+1], 0., o_V[j+1]); - - H[j+1+ j*k] = (double) norm_vj; - } - } - - double *WR = (double *) calloc(k,sizeof(double)); - double *WI = (double *) calloc(k,sizeof(double)); - - eig(k, H, WR, WI); - - double rho = 0.; - - for(int i=0; irank==0)&&(options.compareArgs("VERBOSE","TRUE"))) printf("weight = %g \n", rho); - - return rho; -} diff --git a/solvers/elliptic/src/ellipticSolveSetup.c b/solvers/elliptic/src/ellipticSolveSetup.c index 566e59ba7..7c53e5cce 100644 --- a/solvers/elliptic/src/ellipticSolveSetup.c +++ b/solvers/elliptic/src/ellipticSolveSetup.c @@ -38,7 +38,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k MPI_Finalize(); exit(-1); } - if (options.compareArgs("PRECONDITIONER","MASSMATRIX") && elliptic->elementType!=TRIANGLES + if (options.compareArgs("PRECONDITIONER","MASSMATRIX") && elliptic->elementType!=TRIANGLES && elliptic->elementType!=TETRAHEDRA ) { printf("ERROR: MASSMATRIX preconditioner is only available for triangle and tetrhedra elements. Use JACOBI instead.\n"); MPI_Finalize(); @@ -56,11 +56,11 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k dlong Nall = Ntotal + Nhalo; dlong Nblock2 = mymax(1,(Nblock+blockSize-1)/blockSize); - + //tau if (elliptic->elementType==TRIANGLES || elliptic->elementType==QUADRILATERALS) elliptic->tau = 2.0*(mesh->N+1)*(mesh->N+2)/2.0; - else + else elliptic->tau = 2.0*(mesh->N+1)*(mesh->N+3); elliptic->p = (dfloat*) calloc(Nall, sizeof(dfloat)); @@ -97,19 +97,19 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k elliptic->sendBuffer = (dfloat*) o_sendBuffer.getMappedPointer(); elliptic->recvBuffer = (dfloat*) o_recvBuffer.getMappedPointer(); - + occa::memory o_gradSendBuffer = mesh->device.mappedAlloc(2*Nbytes, NULL); occa::memory o_gradRecvBuffer = mesh->device.mappedAlloc(2*Nbytes, NULL); elliptic->gradSendBuffer = (dfloat*) o_gradSendBuffer.getMappedPointer(); elliptic->gradRecvBuffer = (dfloat*) o_gradRecvBuffer.getMappedPointer(); #endif - + elliptic->sendBuffer = (dfloat*) occaHostMallocPinned(mesh->device, Nbytes, NULL, elliptic->o_sendBuffer); elliptic->recvBuffer = (dfloat*) occaHostMallocPinned(mesh->device, Nbytes, NULL, elliptic->o_recvBuffer); elliptic->gradSendBuffer = (dfloat*) occaHostMallocPinned(mesh->device, 2*Nbytes, NULL, elliptic->o_gradSendBuffer); elliptic->gradRecvBuffer = (dfloat*) occaHostMallocPinned(mesh->device, 2*Nbytes, NULL, elliptic->o_gradRecvBuffer); - + }else{ elliptic->sendBuffer = NULL; elliptic->recvBuffer = NULL; @@ -176,13 +176,13 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k } } } - - // !!!!!! Removed MPI::BOOL since some mpi versions complains about it !!!!! - int lallNeumann, gallNeumann; - lallNeumann = allNeumann ? 0:1; + + // !!!!!! Removed MPI::BOOL since some mpi versions complains about it !!!!! + int lallNeumann, gallNeumann; + lallNeumann = allNeumann ? 0:1; MPI_Allreduce(&lallNeumann, &gallNeumann, 1, MPI_INT, MPI_SUM, mesh->comm); - elliptic->allNeumann = (gallNeumann>0) ? false: true; + elliptic->allNeumann = (gallNeumann>0) ? false: true; // MPI_Allreduce(&allNeumann, &(elliptic->allNeumann), 1, MPI::BOOL, MPI_LAND, mesh->comm); if (mesh->rank==0&& options.compareArgs("VERBOSE","TRUE")) printf("allNeumann = %d \n", elliptic->allNeumann); @@ -204,16 +204,16 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k elliptic->o_EToB = mesh->device.malloc(mesh->Nelements*mesh->Nfaces*sizeof(int), elliptic->EToB); #if 0 - if (mesh->rank==0 && options.compareArgs("VERBOSE","TRUE")) + if (mesh->rank==0 && options.compareArgs("VERBOSE","TRUE")) occa::setVerboseCompilation(true); - else + else occa::setVerboseCompilation(false); #endif //setup an unmasked gs handle int verbose = options.compareArgs("VERBOSE","TRUE") ? 1:0; meshParallelGatherScatterSetup(mesh, Ntotal, mesh->globalIds, mesh->comm, verbose); - + //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann) elliptic->mapB = (int *) calloc(mesh->Nelements*mesh->Np,sizeof(int)); for (dlong e=0;eNelements;e++) { @@ -229,7 +229,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k } } } - ogsGatherScatter(elliptic->mapB, ogsInt, ogsMin, mesh->ogs); + ogsGatherScatter(elliptic->mapB, ogsInt, ogsMin, mesh->ogs); //use the bc flags to find masked ids elliptic->Nmasked = 0; @@ -241,7 +241,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k } } elliptic->o_mapB = mesh->device.malloc(mesh->Nelements*mesh->Np*sizeof(int), elliptic->mapB); - + elliptic->maskIds = (dlong *) calloc(elliptic->Nmasked, sizeof(dlong)); elliptic->Nmasked =0; //reset for (dlong n=0;nNelements*mesh->Np;n++) { @@ -252,16 +252,16 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k //make a masked version of the global id numbering mesh->maskedGlobalIds = (hlong *) calloc(Ntotal,sizeof(hlong)); memcpy(mesh->maskedGlobalIds, mesh->globalIds, Ntotal*sizeof(hlong)); - for (dlong n=0;nNmasked;n++) + for (dlong n=0;nNmasked;n++) mesh->maskedGlobalIds[elliptic->maskIds[n]] = 0; //use the masked ids to make another gs handle elliptic->ogs = ogsSetup(Ntotal, mesh->maskedGlobalIds, mesh->comm, verbose, mesh->device); elliptic->o_invDegree = elliptic->ogs->o_invDegree; - + /*preconditioner setup */ elliptic->precon = (precon_t*) calloc(1, sizeof(precon_t)); - + kernelInfo["parser/" "automate-add-barriers"] = "disabled"; if(mesh->device.mode()=="CUDA"){ // add backend compiler optimization for CUDA @@ -273,7 +273,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k // set kernel name suffix char *suffix; - + if(elliptic->elementType==TRIANGLES) suffix = strdup("Tri2D"); if(elliptic->elementType==QUADRILATERALS) @@ -289,7 +289,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k for (int r=0;rsize;r++) { if (r==mesh->rank) { - //mesh kernels + //mesh kernels mesh->haloExtractKernel = mesh->device.buildKernel(DHOLMES "/okl/meshHaloExtract2D.okl", "meshHaloExtract2D", @@ -307,7 +307,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k kernelInfo["defines/" "p_blockSize"]= blockSize; - + mesh->sumKernel = mesh->device.buildKernel(DHOLMES "/okl/sum.okl", @@ -338,8 +338,8 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k mesh->device.buildKernel(DHOLMES "/okl/norm2.okl", "norm2", kernelInfo); - - + + elliptic->scaledAddKernel = mesh->device.buildKernel(DHOLMES "/okl/scaledAdd.okl", "scaledAdd", @@ -354,7 +354,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k mesh->device.buildKernel(DHOLMES "/okl/dotDivide.okl", "dotDivide", kernelInfo); - + // add custom defines kernelInfo["defines/" "p_NpP"]= (mesh->Np+mesh->Nfp*mesh->Nfaces); kernelInfo["defines/" "p_Nverts"]= mesh->Nverts; @@ -412,7 +412,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k dfloatKernelInfo["defines/" "pfloat"]= dfloatString; elliptic->AxKernel = mesh->device.buildKernel(fileName,kernelName,dfloatKernelInfo); - + if(elliptic->elementType!=HEXAHEDRA){ sprintf(kernelName, "ellipticPartialAx%s", suffix); } @@ -427,7 +427,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k elliptic->partialAxKernel = mesh->device.buildKernel(fileName,kernelName,dfloatKernelInfo); elliptic->partialFloatAxKernel = mesh->device.buildKernel(fileName,kernelName,floatKernelInfo); - + if (options.compareArgs("BASIS","BERN")) { sprintf(fileName, DELLIPTIC "/okl/ellipticGradientBB%s.okl", suffix); @@ -437,14 +437,14 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k sprintf(kernelName, "ellipticPartialGradientBB%s", suffix); elliptic->partialGradientKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo); - + sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdgBB%s.okl", suffix); sprintf(kernelName, "ellipticAxIpdgBB%s", suffix); elliptic->ipdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo); sprintf(kernelName, "ellipticPartialAxIpdgBB%s", suffix); elliptic->partialIpdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo); - + } else if (options.compareArgs("BASIS","NODAL")) { sprintf(fileName, DELLIPTIC "/okl/ellipticGradient%s.okl", suffix); @@ -482,7 +482,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k sprintf(kernelName, "ellipticApproxBlockJacobiSolver"); elliptic->precon->approxBlockJacobiSolverKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo); - if ( elliptic->elementType == TRIANGLES + if ( elliptic->elementType == TRIANGLES || elliptic->elementType == TETRAHEDRA) { elliptic->precon->SEMFEMInterpKernel = mesh->device.buildKernel(DELLIPTIC "/okl/ellipticSEMFEMInterp.okl", diff --git a/solvers/ins/makefile b/solvers/ins/makefile index ed9ac8158..ad439ae39 100644 --- a/solvers/ins/makefile +++ b/solvers/ins/makefile @@ -4,7 +4,7 @@ ERROR: @echo "Error, environment variable [OCCA_DIR] is not set" endif -CXXFLAGS = +CXXFLAGS = include ${OCCA_DIR}/scripts/Makefile @@ -12,7 +12,7 @@ include ${OCCA_DIR}/scripts/Makefile HDRDIR = ../../include GSDIR = ../../3rdParty/gslib OGSDIR = ../../libs/gatherScatter -ALMONDDIR = ../parALMOND +ALMONDDIR = ../../libs/parAlmond ELLIPTICDIR = ../elliptic # set options for this machine @@ -22,16 +22,16 @@ CC = mpic++ LD = mpic++ # compiler flags to be used (set to compile with debugging on) -CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -I$(ELLIPTICDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DINS='"${CURDIR}"' +CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -I$(ELLIPTICDIR) -I$(ALMONDDIR) -g -D DHOLMES='"${CURDIR}/../.."' -D DINS='"${CURDIR}"' -# link flags to be used +# link flags to be used LDFLAGS = -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g # libraries to be linked in -LIBS = -L$(ELLIPTICDIR) -lelliptic -L$(ALMONDDIR) -lparALMOND \ +LIBS = -L$(ELLIPTICDIR) -lelliptic -L$(ALMONDDIR) -lparAlmond \ -L$(OGSDIR) -logs -L$(GSDIR)/lib -lgs \ -L$(OCCA_DIR)/lib $(links) -L../../3rdParty/BlasLapack -lBlasLapack -lgfortran \ - + INCLUDES = ins.h DEPS = $(INCLUDES) \ @@ -39,12 +39,13 @@ $(HDRDIR)/mesh.h \ $(HDRDIR)/mesh2D.h \ $(HDRDIR)/mesh3D.h \ $(OGSDIR)/ogs.hpp \ -$(ALMONDDIR)/parALMOND.h \ +$(ALMONDDIR)/parAlmond.hpp \ $(ELLIPTICDIR)/elliptic.h \ -$(ELLIPTICDIR)/ellipticPrecon.h +$(ELLIPTICDIR)/ellipticPrecon.h \ +$(ELLIPTICDIR)/ellipticMultiGrid.h # types of files we are going to construct rules for -.SUFFIXES: .c +.SUFFIXES: .c # rule for .c files .c.o: $(DEPS) @@ -75,7 +76,7 @@ AOBJS = \ ./src/insPressureUpdate.o \ ./src/insRestart.o \ ./src/insWeldTriVerts.o \ -./src/insIsoPlotVTU.o +./src/insIsoPlotVTU.o # library objects LOBJS = \ @@ -136,7 +137,7 @@ LOBJS = \ ../../src/occaHostMallocPinned.o \ ../../src/timer.o -insMain:$(AOBJS) $(LOBJS) ./src/insMain.o libblas libogs libparALMOND libelliptic +insMain:$(AOBJS) $(LOBJS) ./src/insMain.o libblas libogs libparAlmond libelliptic $(LD) $(LDFLAGS) -o insMain ./src/insMain.o $(COBJS) $(AOBJS) $(LOBJS) $(paths) $(LIBS) lib:$(AOBJS) @@ -148,8 +149,8 @@ libogs: libblas: cd ../../3rdParty/BlasLapack; make -j lib; cd ../../solvers/ins -libparALMOND: - cd ../parALMOND; make -j lib; cd ../ins +libparAlmond: + cd ../../libs/parAlmond; make -j lib; cd ../../solvers/ins libelliptic: cd ../elliptic; make -j lib; cd ../ins diff --git a/solvers/parALMOND/include/agmg.h b/solvers/parALMOND/include/agmg.h deleted file mode 100644 index cb99d0c6d..000000000 --- a/solvers/parALMOND/include/agmg.h +++ /dev/null @@ -1,73 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#ifndef AGMG_H -#define AGMG_H 1 - -#ifdef OCCA_VERSION_1_0 -#include "occa/modes/opencl/utils.hpp" -#endif - -#include "mesh.h" - -#include "parAlmond.h" -#include "agmgLevel.h" -#include "agmgMatrices.h" -#include "vectorPrimitives.h" -#include - -#define AGMGBDIM 32 //block size -#define SIMDWIDTH 32 //width of simd blocks -#define MAX_LEVELS 100 -#define GPU_CPU_SWITCH_SIZE 0 //host-device switch threshold - -#define RDIMX 32 -#define RDIMY 8 -#define RLOAD 1 - - -void agmgSetup(parAlmond_t *parAlmond, csr *A, dfloat *nullA, hlong *globalRowStarts, setupAide options); -void parAlmondReport(parAlmond_t *parAlmond); -void buildAlmondKernels(parAlmond_t *parAlmond); - -void kcycle(parAlmond_t *parAlmond, int k); -void device_kcycle(parAlmond_t *parAlmond, int k); - -void vcycle(parAlmond_t *parAlmond, int k); -void device_vcycle(parAlmond_t *parAlmond, int k); - -void pgmres(parAlmond_t *parAlmond, int maxIt, dfloat tol); -void device_pgmres(parAlmond_t *parAlmond, int maxIt, dfloat tol); - -void pcg(parAlmond_t *parAlmond, int maxIt, dfloat tol); -void device_pcg(parAlmond_t *parAlmond, int maxIt, dfloat tol); - -namespace agmg { - extern int rank; - extern int size; - extern MPI_Comm comm; -}; -#endif diff --git a/solvers/parALMOND/include/agmgLevel.h b/solvers/parALMOND/include/agmgLevel.h deleted file mode 100644 index 39fd92ff3..000000000 --- a/solvers/parALMOND/include/agmgLevel.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - - -void agmgAx (void **args, dfloat *x, dfloat *Ax); -void agmgCoarsen (void **args, dfloat *r, dfloat *Rr); -void agmgProlongate(void **args, dfloat *x, dfloat *Px); -void agmgSmooth (void **args, dfloat *rhs, dfloat *x, bool x_is_zero); - -void device_agmgAx (void **args, occa::memory &o_x, occa::memory &o_Ax); -void device_agmgCoarsen (void **args, occa::memory &o_r, occa::memory &o_Rr); -void device_agmgProlongate(void **args, occa::memory &o_x, occa::memory &o_Px); -void device_agmgSmooth (void **args, occa::memory &o_r, occa::memory &o_x, bool x_is_zero); - -void setupSmoother(parAlmond_t *parAlmond, agmgLevel *level, SmoothType s); -void setupExactSolve(parAlmond_t *parAlmond, agmgLevel *level, bool nullSpace, dfloat nullSpacePenalty); -void exactCoarseSolve(parAlmond_t *parAlmond, int N, dfloat *rhs, dfloat *x); -void device_exactCoarseSolve(parAlmond_t *parAlmond, int N, occa::memory o_rhs, occa::memory o_x); diff --git a/solvers/parALMOND/include/agmgMatrices.h b/solvers/parALMOND/include/agmgMatrices.h deleted file mode 100644 index 5b5c663e6..000000000 --- a/solvers/parALMOND/include/agmgMatrices.h +++ /dev/null @@ -1,63 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - - -//creators -csr * newCSRfromCOO(dlong N, hlong* globalRowStarts, - dlong NNZ, hlong *Ai, hlong *Aj, dfloat *Avals); -void freeCSR(csr *A); -dcoo *newDCOO(parAlmond_t *parAlmond, csr *B); -hyb * newHYB(parAlmond_t *parAlmond, csr *csrA); - - -void axpy(csr *A, dfloat alpha, dfloat *x, dfloat beta, dfloat *y, bool nullSpace, dfloat nullSpacePenalty); - -void axpy(parAlmond_t *parAlmond, dcoo *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y); - -void axpy(parAlmond_t *parAlmond, hyb *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y, bool nullSpace, dfloat nullSpacePenalty); - -void axpy(parAlmond_t *parAlmond, ell *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y); - -void ax(parAlmond_t *parAlmond, coo *C, dfloat alpha, occa::memory o_x, occa::memory o_y); - - -//smoothing -void smoothJacobi (parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero); -void smoothDampedJacobi(parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero); -void smoothChebyshev (parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero); -void smoothJacobi (parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero); -void smoothDampedJacobi(parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero); -void smoothChebyshev (parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero); - -//halo exchange -void csrHaloSetup(csr *A, hlong *globalColStarts); -void csrHaloExchange(csr *A, size_t Nbytes, void *sourceBuffer, void *sendBuffer, void *recvBuffer); -void csrHaloExchangeStart(csr *A, size_t Nbytes, void *sourceBuffer, void *sendBuffer, void *recvBuffer); -void csrHaloExchangeFinish(csr *A); -void dcooHaloExchangeStart(dcoo *A, size_t Nbytes, void *sendBuffer, void *recvBuffer); -void dcooHaloExchangeFinish(dcoo *A); -void hybHaloExchangeStart(hyb *A, size_t Nbytes, void *sendBuffer, void *recvBuffer); -void hybHaloExchangeFinish(hyb *A); diff --git a/solvers/parALMOND/include/vectorPrimitives.h b/solvers/parALMOND/include/vectorPrimitives.h deleted file mode 100644 index 6be049f58..000000000 --- a/solvers/parALMOND/include/vectorPrimitives.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - - - -dfloat innerProd(dlong n, dfloat *a, dfloat *b); - -void doubleInnerProd(dlong n, dfloat *aDotbc, dfloat *a, dfloat *b, dfloat *c); - -void kcycleCombinedOp1(dlong n, dfloat *aDotbc, dfloat *a, dfloat *b, dfloat *c, dfloat *w, bool weighted); - -void kcycleCombinedOp2(dlong n, dfloat *aDotbcd, dfloat *a, dfloat *b, dfloat *c, dfloat* d, dfloat *w, bool weighted); - -void vectorAdd(dlong n, dfloat alpha, dfloat *x, dfloat beta, dfloat *y); - -dfloat vectorAddInnerProd(dlong n, dfloat alpha, dfloat *x, dfloat beta, dfloat *y, dfloat *w, bool weighted); - -void dotStar(dlong m, dfloat *a, dfloat *b); - -void scaleVector(dlong m, dfloat *a, dfloat alpha); - -void setVector(dlong m, dfloat *a, dfloat alpha); - -dfloat sumVector(dlong m, dfloat *a); - -void addScalar(dlong m, dfloat alpha, dfloat *a); - -void randomize(dlong m, dfloat *a); - -dfloat maxEntry(dlong n, dfloat *a); - -void scaleVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a, dfloat alpha); - -void setVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a, dfloat alpha); - -dfloat sumVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a); - -void addScalar(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_a); - -void dotStar(parAlmond_t *parAlmond, dlong N, occa::memory o_a, occa::memory o_b); - -void dotStar(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_a, - occa::memory o_b, dfloat beta, occa::memory o_c); - -dfloat innerProd(parAlmond_t *parAlmond, dlong N, occa::memory o_x, occa::memory o_y); - -// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b, -void kcycleCombinedOp1(parAlmond_t *parAlmond, dlong n, dfloat *aDotbc, occa::memory o_a, - occa::memory o_b, occa::memory o_c, occa::memory o_w, bool weighted); - -// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d, -void kcycleCombinedOp2(parAlmond_t *parAlmond, dlong n, dfloat *aDotbcd, occa::memory o_a, - occa::memory o_b, occa::memory o_c, occa::memory o_d, - occa::memory o_w, bool weighted); - -// y = beta*y + alpha*x, and return y\dot y -dfloat vectorAddInnerProd(parAlmond_t *parAlmond, dlong n, dfloat alpha, occa::memory o_x, - dfloat beta, occa::memory o_y, - occa::memory o_w, bool weighted); - -void vectorAdd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y); - -void vectorAdd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x, - dfloat beta, occa::memory o_y, occa::memory o_z); diff --git a/solvers/parALMOND/makefile b/solvers/parALMOND/makefile deleted file mode 100644 index 385eb8d5d..000000000 --- a/solvers/parALMOND/makefile +++ /dev/null @@ -1,42 +0,0 @@ -sDir = ./src -iDir = ./include -objDir = ./ - -include ${OCCA_DIR}/scripts/Makefile - -sources = $(wildcard $(sDir)/*c) -includes = $(wildcard $(iDir)/*h) -objects = $(subst $(sDir)/,$(objDir)/,$(sources:.c=.o)) -deps = $(includes) \ -../../include/mesh.h \ -../../libs/gatherScatter/ogs.hpp \ -../../include/parAlmond.h - -flags = -DOCCA_VERSION_1_0 -I${OCCA_DIR}/include -I$(iDir) -I../../include -I../../libs/gatherScatter -libs = -L${OCCA_DIR}/lib -locca -llapack -lblas - -flags += -D DPWD='"${CURDIR}"' -CC = mpic++ -#flags += -fopenmp - -# Debug Option -ifeq ($(DEBUG), 1) -flags += -g -else -# <> For debugging purposes -flags += -O3 -DNDEBUG -fopenmp -endif - -#flags += -DINS_MPI=$(INS_MPI) -DINS_RENDER=$(INS_RENDER) -DINS_CLUSTER=$(INS_CLUSTER) - -all: lib - -lib: $(objects) $(deps) - ar -cr libparALMOND.a $(objects) - -$(objDir)/%.o:$(sDir)/%.c $(deps) #$(wildcard $(subst $(sDir),$(iDir),$(<:.cpp=.hpp))) - $(CC) $(flags) -o $@ $(libs) -c $< $(paths) - -clean: - rm -f libparALMOND.a - rm -f $(objDir)/*.o diff --git a/solvers/parALMOND/okl/agg_interpolate.okl b/solvers/parALMOND/okl/agg_interpolate.okl deleted file mode 100644 index 98dd590ea..000000000 --- a/solvers/parALMOND/okl/agg_interpolate.okl +++ /dev/null @@ -1,44 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -// Specialized interpolation operator for aggregation amg -// Assumes that each row has exactly one non-zero, P_coefs at column index that is -// stored in P_cols -// y = y + P*x, (Px is used to represent y in the @kernel) - -@kernel void agg_interpolate(const dlong n, - @restrict const dlong * P_rows, - @restrict const dlong * P_cols, - @restrict const dfloat * P_coefs, - @restrict const dfloat * x, - @restrict dfloat * Px){ - - for(dlong i=0;i-1) { - const dfloat coeffn = coefs[address]; - const dfloat xn = x[col]; - - result += coeffn*xn; - } - } - y[i] = alpha*result + betay;//beta*y[row]; - } - } -} - -@kernel void ellZeqAXPY(const dlong numRows, - const int nnzPerRow, - const dlong strideLength, - const dfloat alpha, - const dfloat beta, - @restrict const dlong * cols, - @restrict const dfloat * coefs, - @restrict const dfloat * x, - @restrict const dfloat * y, - @restrict dfloat * z){ - - // z = alpha * A * x + beta * y - for(dlong i=0;i -1) result += coefs[address]*x[col]; - } - z[i] = alpha*result + beta*y[i]; - } - } -} - -@kernel void ellJacobi(const dlong numRows, - const int nnzPerRow, - const dlong strideLength, - @restrict const dlong * cols, - @restrict const dfloat * coefs, - @restrict const dfloat * x, - @restrict const dfloat * r, - @restrict dfloat * z){ - - // z = r - (A-D)*x - for(dlong i=0;i -1) result -= coefs[address]*x[col]; - } - z[i] = result; - } - } -} - diff --git a/solvers/parALMOND/okl/kcycleCombinedOp.okl b/solvers/parALMOND/okl/kcycleCombinedOp.okl deleted file mode 100644 index b073b7094..000000000 --- a/solvers/parALMOND/okl/kcycleCombinedOp.okl +++ /dev/null @@ -1,198 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - - -// a.b, a.c, b.b -@kernel void kcycleCombinedOp1Kernel(const dlong Nblocks, - const dlong N, - @restrict const dfloat * a, - @restrict const dfloat * b, - @restrict const dfloat * c, - @restrict dfloat * ips){ - - for(dlong g=0;g= 1*p_RDIMX/2) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/2]; \ - if(tx>= 3*p_RDIMX/4) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/4]; \ - if(tx>= 7*p_RDIMX/8) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/8]; \ - if(tx>= 15*p_RDIMX/16) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/16]; \ - if(tx>= 31*p_RDIMX/32) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/32]; \ - if(tx==(p_RDIMX-1)) s_res[ty] = s_ip[ty][tx]; \ - } \ - } \ - \ - @barrier("local"); \ - \ - for(int ty=0;ty= 1*p_RDIMY/2) s_res[tx] += s_res[tx-p_RDIMY/2]; \ - if(tx >= 3*p_RDIMY/4) s_res[tx] += s_res[tx-p_RDIMY/4]; \ - if(tx >= 7*p_RDIMY/8) s_res[tx] += s_res[tx-p_RDIMY/8]; \ - if(tx==(p_RDIMY-1)) { \ - g_ip = s_res[p_RDIMY-1]; \ - } \ - } \ - } \ - } diff --git a/solvers/parALMOND/okl/vectorAddInnerProduct.okl b/solvers/parALMOND/okl/vectorAddInnerProduct.okl deleted file mode 100644 index a6d7cb2b0..000000000 --- a/solvers/parALMOND/okl/vectorAddInnerProduct.okl +++ /dev/null @@ -1,96 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - - -// y = beta*y + alpha*x -// ip = y.y -@kernel void vectorAddInnerProductKernel(const dlong Nblocks, - const dlong N, - const dfloat alpha, - const dfloat beta, - @restrict const dfloat * x, - @restrict dfloat * y, - @restrict dfloat * ip){ - - for(dlong b=0;blevels; - - dlong m = levels[k]->Nrows; - // dlong n = levels[k]->Ncols; - - //check for base level - if(k==parAlmond->numLevels-1) { - if (parAlmond->invCoarseA != NULL) { - //use exact sovler - exactCoarseSolve(parAlmond, m, levels[k]->rhs, levels[k]->x); - } else { - levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, true); - } - return; - } - - char name[BUFSIZ]; - sprintf(name, "host kcycle level %d", k); - occaTimerTic(parAlmond->device,name); - - dlong mCoarse = levels[k+1]->Nrows; - // dlong nCoarse = levels[k+1]->Ncols; - - // zero out x - //setVector(m, levels[k]->x, 0.0); - - levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, true); - - // res = r - A*x - levels[k]->Ax(levels[k]->AxArgs,levels[k]->x,levels[k]->res); - vectorAdd(m, 1.0, levels[k]->rhs, -1.0, levels[k]->res); - - // coarsen the residual to next level, checking if the residual needs to be gathered after - if (levels[k+1]->gatherLevel==true) { - levels[k+1]->coarsen(levels[k+1]->coarsenArgs, levels[k]->res, levels[k+1]->Srhs); - levels[k+1]->gather (levels[k+1]->gatherArgs, levels[k+1]->Srhs, levels[k+1]->rhs); - } else { - levels[k+1]->coarsen(levels[k+1]->coarsenArgs, levels[k]->res, levels[k+1]->rhs); - } - - if(k>2) { - vcycle(parAlmond,k+1); - //kcycle(parAlmond, k+1); - } else{ - dfloat *ckp1 = levels[k+1]->ckp1; - dfloat *vkp1 = levels[k+1]->vkp1; - dfloat *wkp1 = levels[k+1]->wkp1; - dfloat *dkp1 = levels[k+1]->x; - dfloat *rkp1 = levels[k+1]->rhs; - dfloat *w = levels[k+1]->weight; - bool weighted = levels[k+1]->weightedInnerProds; - - // first inner krylov iteration - kcycle(parAlmond, k+1); - - //ckp1 = x - memcpy(ckp1,levels[k+1]->x,mCoarse*sizeof(dfloat)); - - // v = A*c - levels[k+1]->Ax(levels[k+1]->AxArgs,ckp1,vkp1); - - dfloat rhoLocal[3], rhoGlobal[3]; - dfloat rho1, alpha1, norm_rkp1; - dfloat norm_rktilde_p, norm_rktilde_pGlobal; - - if(parAlmond->ktype == PCG) - kcycleCombinedOp1(mCoarse, rhoLocal, ckp1, rkp1, vkp1, w, weighted); - - if(parAlmond->ktype == GMRES) - kcycleCombinedOp1(mCoarse, rhoLocal, vkp1, rkp1, vkp1, w, weighted); - - MPI_Allreduce(rhoLocal,rhoGlobal,3,MPI_DFLOAT,MPI_SUM,agmg::comm); - - alpha1 = rhoGlobal[0]; - rho1 = rhoGlobal[1]; - norm_rkp1 = sqrt(rhoGlobal[2]); - - // rkp1 = rkp1 - (alpha1/rho1)*vkp1 - norm_rktilde_p = vectorAddInnerProd(mCoarse, -alpha1/rho1, vkp1, 1.0, rkp1, w, weighted); - MPI_Allreduce(&norm_rktilde_p,&norm_rktilde_pGlobal,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - norm_rktilde_pGlobal = sqrt(norm_rktilde_pGlobal); - - dfloat t = 0.2; - - if(norm_rktilde_pGlobal < t*norm_rkp1){ - // x = (alpha1/rho1)*x - scaleVector(mCoarse, levels[k+1]->x, alpha1/rho1); - } else{ - - kcycle(parAlmond, k+1); - - // w = A*d - levels[k+1]->Ax(levels[k+1]->AxArgs,dkp1,wkp1); - - dfloat gamma, beta, alpha2; - - if(parAlmond->ktype == PCG) - kcycleCombinedOp2(mCoarse,rhoLocal,dkp1,vkp1,wkp1,rkp1, w, weighted); - - if(parAlmond->ktype == GMRES) - kcycleCombinedOp2(mCoarse,rhoLocal,wkp1,vkp1,wkp1,rkp1, w, weighted); - - MPI_Allreduce(rhoLocal,rhoGlobal,3,MPI_DFLOAT,MPI_SUM,agmg::comm); - - gamma = rhoGlobal[0]; - beta = rhoGlobal[1]; - alpha2 = rhoGlobal[2]; - - if(fabs(rho1) > (dfloat) 1e-20){ - - dfloat rho2 = beta - gamma*gamma/rho1; - - if(fabs(rho2) > (dfloat) 1e-20){ - // levels[k+1]->x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ckp1 + (alpha2/rho2)*dkp1 - dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2); - dfloat b = alpha2/rho2; - - vectorAdd(mCoarse, a, ckp1, b, levels[k+1]->x); - } - } - } - } - - if (levels[k+1]->gatherLevel==true) { - levels[k+1]->scatter(levels[k+1]->scatterArgs, levels[k+1]->x, levels[k+1]->Sx); - levels[k+1]->prolongate(levels[k+1]->prolongateArgs, levels[k+1]->Sx, levels[k]->x); - } else { - levels[k+1]->prolongate(levels[k+1]->prolongateArgs, levels[k+1]->x, levels[k]->x); - } - - levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, false); - - occaTimerToc(parAlmond->device,name); -} - - -void device_kcycle(parAlmond_t *parAlmond, int k){ - - agmgLevel **levels = parAlmond->levels; - - dlong m = levels[k]->Nrows; - // dlong n = levels[k]->Ncols; - - if(m < GPU_CPU_SWITCH_SIZE){ - levels[k]->o_rhs.copyTo(levels[k]->rhs, m*sizeof(dfloat)); - kcycle(parAlmond, k); - levels[k]->o_x.copyFrom(levels[k]->x, m*sizeof(dfloat)); - return; - } - - //check for base level - if(k==parAlmond->numLevels-1) { - if (parAlmond->invCoarseA != NULL) { - //use exact sovler - device_exactCoarseSolve(parAlmond, m, levels[k]->o_rhs, levels[k]->o_x); - } else { - levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, true); - } - return; - } - - dlong mCoarse = levels[k+1]->Nrows; - // dlong nCoarse = levels[k+1]->Ncols; - - char name[BUFSIZ]; - sprintf(name, "device kcycle level %d", k); - occaTimerTic(parAlmond->device,name); - - // zero out x - //setVector(parAlmond, m, levels[k]->o_x, 0.0); - - levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, true); - - // res = rhs - A*x - levels[k]->device_Ax(levels[k]->AxArgs,levels[k]->o_x,levels[k]->o_res); - vectorAdd(parAlmond, m, 1.0, levels[k]->o_rhs, -1.0, levels[k]->o_res); - - // coarsen the residual to next level, checking if the residual needs to be gathered after - if (levels[k+1]->gatherLevel==true) { - levels[k+1]->device_coarsen(levels[k+1]->coarsenArgs, levels[k]->o_res, levels[k+1]->o_Srhs); - levels[k+1]->device_gather (levels[k+1]->gatherArgs, levels[k+1]->o_Srhs, levels[k+1]->o_rhs); - } else { - levels[k+1]->device_coarsen(levels[k+1]->coarsenArgs, levels[k]->o_res, levels[k+1]->o_rhs); - } - - if(k>2) { - device_vcycle(parAlmond,k+1); - //device_kcycle(parAlmond, k+1); - } else{ - // first inner krylov iteration - device_kcycle(parAlmond,k+1); - - //ckp1 = levels[k+1]->x; - if (mCoarse) - levels[k+1]->o_ckp1.copyFrom(levels[k+1]->o_x); - - // v = A*c - levels[k+1]->device_Ax(levels[k+1]->AxArgs,levels[k+1]->o_ckp1,levels[k+1]->o_vkp1); - - dfloat rhoLocal[3], rhoGlobal[3]; - dfloat rho1, alpha1, norm_rkp1; - dfloat norm_rktilde_pLocal, norm_rktilde_pGlobal; - - // kcycleCombinedOp1(parAlmond,N,aDotbc,a,b,c,w,bool) - // returns aDotbc[0] = a.b, aDotbc[1] = a.c, aDotbc[2] = b.b - // or aDotbc[0] = w.a.b, aDotbc[1] = w.a.c, aDotbc[2] = w.b.b - if(parAlmond->ktype == PCG) - kcycleCombinedOp1(parAlmond, mCoarse, rhoLocal, - levels[k+1]->o_ckp1, - levels[k+1]->o_rhs, - levels[k+1]->o_vkp1, - levels[k+1]->o_weight, - levels[k+1]->weightedInnerProds); - - if(parAlmond->ktype == GMRES) - kcycleCombinedOp1(parAlmond, mCoarse, rhoLocal, - levels[k+1]->o_vkp1, - levels[k+1]->o_rhs, - levels[k+1]->o_vkp1, - levels[k+1]->o_weight, - levels[k+1]->weightedInnerProds); - - MPI_Allreduce(rhoLocal,rhoGlobal,3,MPI_DFLOAT,MPI_SUM,agmg::comm); - - alpha1 = rhoGlobal[0]; - rho1 = rhoGlobal[1]; - norm_rkp1 = sqrt(rhoGlobal[2]); - - // rkp1 = rkp1 - (alpha1/rho1)*vkp1 - norm_rktilde_pLocal = vectorAddInnerProd(parAlmond, mCoarse, -alpha1/rho1, - levels[k+1]->o_vkp1, 1.0, - levels[k+1]->o_rhs, - levels[k+1]->o_weight, - levels[k+1]->weightedInnerProds); - MPI_Allreduce(&norm_rktilde_pLocal,&norm_rktilde_pGlobal,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - norm_rktilde_pGlobal = sqrt(norm_rktilde_pGlobal); - - dfloat t = 0.2; - if(norm_rktilde_pGlobal < t*norm_rkp1){ - // levels[k+1]->x = (alpha1/rho1)*x - scaleVector(parAlmond,mCoarse, levels[k+1]->o_x, alpha1/rho1); - } else{ - - device_kcycle(parAlmond,k+1); - - // w = A*x - levels[k+1]->device_Ax(levels[k+1]->AxArgs,levels[k+1]->o_x,levels[k+1]->o_wkp1); - - dfloat gamma, beta, alpha2; - - // kcycleCombinedOp2(parAlmond,N,aDotbc,a,b,c,d,w,bool) - // returns aDotbcd[0] = a.b, aDotbcd[1] = a.c, aDotbcd[2] = a.d, - // or aDotbcd[0] = w.a.b, aDotbcd[1] = w.a.c, aDotbcd[2] = w.a.d, - if(parAlmond->ktype == PCG) - kcycleCombinedOp2(parAlmond,mCoarse,rhoLocal, - levels[k+1]->o_x, - levels[k+1]->o_vkp1, - levels[k+1]->o_wkp1, - levels[k+1]->o_rhs, - levels[k+1]->o_weight, - levels[k+1]->weightedInnerProds); - - if(parAlmond->ktype == GMRES) - kcycleCombinedOp2(parAlmond,mCoarse,rhoLocal, - levels[k+1]->o_wkp1, - levels[k+1]->o_vkp1, - levels[k+1]->o_wkp1, - levels[k+1]->o_rhs, - levels[k+1]->o_weight, - levels[k+1]->weightedInnerProds); - - MPI_Allreduce(rhoLocal,rhoGlobal,3,MPI_DFLOAT,MPI_SUM,agmg::comm); - - gamma = rhoGlobal[0]; - beta = rhoGlobal[1]; - alpha2 = rhoGlobal[2]; - - if(fabs(rho1) > (dfloat) 1e-20){ - - dfloat rho2 = beta - gamma*gamma/rho1; - - if(fabs(rho2) > (dfloat) 1e-20){ - // levels[k+1]->x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ckp1 + (alpha2/rho2)*dkp1 - dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2); - dfloat b = alpha2/rho2; - - vectorAdd(parAlmond, mCoarse, a, levels[k+1]->o_ckp1, - b, levels[k+1]->o_x); - } - } - } - } - - if (levels[k+1]->gatherLevel==true) { - levels[k+1]->device_scatter (levels[k+1]->scatterArgs, levels[k+1]->o_x, levels[k+1]->o_Sx); - levels[k+1]->device_prolongate(levels[k+1]->prolongateArgs, levels[k+1]->o_Sx, levels[k]->o_x); - } else { - levels[k+1]->device_prolongate(levels[k+1]->prolongateArgs, levels[k+1]->o_x, levels[k]->o_x); - } - - levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, false); - - occaTimerToc(parAlmond->device,name); -} - - - -void vcycle(parAlmond_t *parAlmond, int k) { - - agmgLevel **levels = parAlmond->levels; - - const dlong m = levels[k]->Nrows; - - //check for base level - if(k==parAlmond->numLevels-1) { - if (parAlmond->invCoarseA != NULL) { - //use exact sovler - exactCoarseSolve(parAlmond, m, levels[k]->rhs, levels[k]->x); - } else { - levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, true); - } - return; - } - - char name[BUFSIZ]; - sprintf(name, "host vcycle level %d", k); - occaTimerTic(parAlmond->device,name); - - // const int mCoarse = levels[k+1]->Nrows; - - // zero out x - //setVector(m, levels[k]->x, 0.0); - - levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, true); - - // res = rhs - A*x - levels[k]->Ax(levels[k]->AxArgs,levels[k]->x,levels[k]->res); - vectorAdd(m, 1.0, levels[k]->rhs, -1.0, levels[k]->res); - - // coarsen the residual to next level, checking if the residual needs to be gathered after - if (levels[k+1]->gatherLevel==true) { - levels[k+1]->coarsen(levels[k+1]->coarsenArgs, levels[k]->res, levels[k+1]->Srhs); - levels[k+1]->gather (levels[k+1]->gatherArgs, levels[k+1]->Srhs, levels[k+1]->rhs); - } else { - levels[k+1]->coarsen(levels[k+1]->coarsenArgs, levels[k]->res, levels[k+1]->rhs); - } - - vcycle(parAlmond,k+1); - - if (levels[k+1]->gatherLevel==true) { - levels[k+1]->scatter(levels[k+1]->scatterArgs, levels[k+1]->x, levels[k+1]->Sx); - levels[k+1]->prolongate(levels[k+1]->prolongateArgs, levels[k+1]->Sx, levels[k]->x); - } else { - levels[k+1]->prolongate(levels[k+1]->prolongateArgs, levels[k+1]->x, levels[k]->x); - } - - levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x,false); - - occaTimerToc(parAlmond->device,name); -} - - -void device_vcycle(parAlmond_t *parAlmond, int k){ - - agmgLevel **levels = parAlmond->levels; - - const dlong m = levels[k]->Nrows; - // const dlong mCoarse = levels[k+1]->Nrows; - - // switch to cpu if the problem size is too small for gpu - if(m < GPU_CPU_SWITCH_SIZE){ - levels[k]->o_rhs.copyTo(levels[k]->rhs, m*sizeof(dfloat)); - vcycle(parAlmond, k); - levels[k]->o_x.copyFrom(levels[k]->x, m*sizeof(dfloat)); - return; - } - - //check for base level - if (k==parAlmond->numLevels-1) { - if (parAlmond->invCoarseA != NULL) { - //use exact sovler - device_exactCoarseSolve(parAlmond, m, levels[k]->o_rhs, levels[k]->o_x); - } else { - levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, true); - } - return; - } - - char name[BUFSIZ]; - sprintf(name, "device vcycle level %d", k); - occaTimerTic(parAlmond->device,name); - - // zero out x - //setVector(parAlmond, m, levels[k]->o_x, 0.0); - - levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, true); - - // res = rhs - A*x - levels[k]->device_Ax(levels[k]->AxArgs,levels[k]->o_x,levels[k]->o_res); - vectorAdd(parAlmond, m, 1.0, levels[k]->o_rhs, -1.0, levels[k]->o_res); - - // coarsen the residual to next level, checking if the residual needs to be gathered after - if (levels[k+1]->gatherLevel==true) { - levels[k+1]->device_coarsen(levels[k+1]->coarsenArgs, levels[k]->o_res, levels[k+1]->o_Srhs); - levels[k+1]->device_gather (levels[k+1]->gatherArgs, levels[k+1]->o_Srhs, levels[k+1]->o_rhs); - } else { - levels[k+1]->device_coarsen(levels[k+1]->coarsenArgs, levels[k]->o_res, levels[k+1]->o_rhs); - } - - device_vcycle(parAlmond, k+1); - - if (levels[k+1]->gatherLevel==true) { - levels[k+1]->device_scatter (levels[k+1]->scatterArgs, levels[k+1]->o_x, levels[k+1]->o_Sx); - levels[k+1]->device_prolongate(levels[k+1]->prolongateArgs, levels[k+1]->o_Sx, levels[k]->o_x); - } else { - levels[k+1]->device_prolongate(levels[k+1]->prolongateArgs, levels[k+1]->o_x, levels[k]->o_x); - } - - levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x,false); - - occaTimerToc(parAlmond->device,name); -} diff --git a/solvers/parALMOND/src/agmgLevel.c b/solvers/parALMOND/src/agmgLevel.c deleted file mode 100644 index 67bd7a66b..000000000 --- a/solvers/parALMOND/src/agmgLevel.c +++ /dev/null @@ -1,640 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "agmg.h" - -// parAlmond's function call-backs -void agmgAx(void **args, dfloat *x, dfloat *Ax){ - parAlmond_t *parAlmond = (parAlmond_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - axpy(level->A, 1.0, x, 0.0, Ax,parAlmond->nullSpace,parAlmond->nullSpacePenalty); -} - -void agmgCoarsen(void **args, dfloat *r, dfloat *Rr){ - // parAlmond_t *parAlmond = (parAlmond_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - axpy(level->R, 1.0, r, 0.0, Rr,false,0.); -} - -void agmgProlongate(void **args, dfloat *x, dfloat *Px){ - // parAlmond_t *parAlmond = (parAlmond_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - axpy(level->P, 1.0, x, 1.0, Px,false,0.); -} - -void agmgSmooth(void **args, dfloat *rhs, dfloat *x, bool x_is_zero){ - parAlmond_t *parAlmond = (parAlmond_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - if(level->stype == JACOBI){ - smoothJacobi(parAlmond, level, level->A, rhs, x, x_is_zero); - } else if(level->stype == DAMPED_JACOBI){ - smoothDampedJacobi(parAlmond, level, level->A, rhs, x, x_is_zero); - } else if(level->stype == CHEBYSHEV){ - smoothChebyshev(parAlmond, level, level->A, rhs, x, x_is_zero); - } -} - -void device_agmgAx(void **args, occa::memory &o_x, occa::memory &o_Ax){ - parAlmond_t *parAlmond = (parAlmond_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - axpy(parAlmond,level->deviceA, 1.0, o_x, 0.0, o_Ax,parAlmond->nullSpace,parAlmond->nullSpacePenalty); -} - -void device_agmgCoarsen(void **args, occa::memory &o_r, occa::memory &o_Rr){ - parAlmond_t *parAlmond = (parAlmond_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - axpy(parAlmond, level->deviceR, 1.0, o_r, 0.0, o_Rr,false,0.); -} - -void device_agmgProlongate(void **args, occa::memory &o_x, occa::memory &o_Px){ - parAlmond_t *parAlmond = (parAlmond_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - axpy(parAlmond, level->dcsrP, 1.0, o_x, 1.0, o_Px); -} - -void device_agmgSmooth(void **args, occa::memory &o_rhs, occa::memory &o_x, bool x_is_zero){ - parAlmond_t *parAlmond = (parAlmond_t *) args[0]; - agmgLevel *level = (agmgLevel *) args[1]; - - if(level->stype == JACOBI){ - smoothJacobi(parAlmond, level, level->deviceA, o_rhs, o_x, x_is_zero); - } else if(level->stype == DAMPED_JACOBI){ - smoothDampedJacobi(parAlmond, level, level->deviceA, o_rhs, o_x, x_is_zero); - } else if(level->stype == CHEBYSHEV){ - smoothChebyshev(parAlmond, level, level->deviceA, o_rhs, o_x, x_is_zero); - } -} - -dfloat rhoDinvA(parAlmond_t *parAlmond, csr *A, dfloat *invD); - -void setupSmoother(parAlmond_t *parAlmond, agmgLevel *level, SmoothType s){ - - level->stype = s; - - if((s == DAMPED_JACOBI)||(s == CHEBYSHEV)){ - // estimate rho(invD * A) - dfloat rho=0; - - if(level->A->Nrows) - level->A->diagInv = (dfloat *) calloc(level->A->Nrows, sizeof(dfloat)); - - for (dlong i=0;iA->Nrows;i++) { - dfloat diag = level->A->diagCoefs[level->A->diagRowStarts[i]]; - if (parAlmond->nullSpace) { - diag += parAlmond->nullSpacePenalty*level->A->null[i]*level->A->null[i]; - } - level->A->diagInv[i] = 1.0/diag; - } - - rho = rhoDinvA(parAlmond, level->A, level->A->diagInv); - - if (s == DAMPED_JACOBI) { - - level->smoother_params = (dfloat *) calloc(1,sizeof(dfloat)); - - level->smoother_params[0] = (4./3.)/rho; - - //temp storage for smoothing - if (level->Ncols) level->smootherResidual = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); - if (level->Ncols) level->o_smootherResidual = parAlmond->device.malloc(level->Ncols*sizeof(dfloat),level->smootherResidual); - - } else if (s == CHEBYSHEV) { - - level->smoother_params = (dfloat *) calloc(2,sizeof(dfloat)); - - level->smoother_params[0] = rho; - level->smoother_params[1] = rho/10.; - - //temp storage for smoothing - if (level->Ncols) level->smootherResidual = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); - if (level->Ncols) level->smootherResidual2 = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); - if (level->Ncols) level->smootherUpdate = (dfloat *) calloc(level->Ncols,sizeof(dfloat)); - if (level->Ncols) level->o_smootherResidual = parAlmond->device.malloc(level->Ncols*sizeof(dfloat),level->smootherResidual); - if (level->Ncols) level->o_smootherResidual2 = parAlmond->device.malloc(level->Ncols*sizeof(dfloat),level->smootherResidual); - if (level->Ncols) level->o_smootherUpdate = parAlmond->device.malloc(level->Ncols*sizeof(dfloat),level->smootherUpdate); - } - } -} - -extern "C"{ - void dgeev_(char *JOBVL, char *JOBVR, int *N, double *A, int *LDA, double *WR, double *WI, - double *VL, int *LDVL, double *VR, int *LDVR, double *WORK, int *LWORK, int *INFO ); -} - - -static void eig(const int Nrows, double *A, double *WR, - double *WI){ - - if(Nrows){ - int NB = 256; - char JOBVL = 'V'; - char JOBVR = 'V'; - int N = Nrows; - int LDA = Nrows; - int LWORK = (NB+2)*N; - - double *WORK = new double[LWORK]; - double *VL = new double[Nrows*Nrows]; - double *VR = new double[Nrows*Nrows]; - - int INFO = -999; - - dgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI, - VL, &LDA, VR, &LDA, WORK, &LWORK, &INFO); - - - assert(INFO == 0); - - delete [] VL; - delete [] VR; - delete [] WORK; - } -} - -dfloat rhoDinvA(parAlmond_t* parAlmond,csr *A, dfloat *invD){ - - const dlong N = A->Nrows; - const dlong M = A->Ncols; - - int k = 10; - - int rank, size; - rank = agmg::rank; - size = agmg::size; - - hlong Nlocal = (hlong) N; - hlong Ntotal = 0; - MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, agmg::comm); - if(k > Ntotal) - k = (int) Ntotal; - - // do an arnoldi - - // allocate memory for Hessenberg matrix - double *H = (double *) calloc(k*k,sizeof(double)); - - // allocate memory for basis - dfloat **V = (dfloat **) calloc(k+1, sizeof(dfloat *)); - dfloat *Vx = (dfloat *) calloc(M, sizeof(dfloat)); - - for(int i=0; i<=k; i++) - V[i] = (dfloat *) calloc(N, sizeof(dfloat)); - - // generate a random vector for initial basis vector - for (dlong i=0;inullSpace,parAlmond->nullSpacePenalty); - - dotStar(N, invD, V[j+1]); - - // modified Gram-Schmidth - for(int i=0; i<=j; i++){ - // H(i,j) = v[i]'*A*v[j] - dfloat hij = innerProd(N, V[i], V[j+1]); - dfloat ghij = 0; - MPI_Allreduce(&hij, &ghij, 1, MPI_DFLOAT, MPI_SUM, agmg::comm); - - // v[j+1] = v[j+1] - hij*v[i] - vectorAdd(N,-ghij, V[i], 1.0, V[j+1]); - - H[i + j*k] = (double) ghij; - } - - if(j+1 < k){ - - dfloat norm_vj = 0.; - for (dlong i=0;ioptions.compareArgs("VERBOSE","TRUE"))) printf("weight = %g \n", rho); - - return rho; -} - -void matrixInverse(int N, dfloat *A); - -//set up exact solver using xxt -void setupExactSolve(parAlmond_t *parAlmond, agmgLevel *level, bool nullSpace, dfloat nullSpacePenalty) { - - int rank, size; - rank = agmg::rank; - size = agmg::size; - - //copy the global coarse partition as ints - int *coarseOffsets = (int* ) calloc(size+1,sizeof(int)); - for (int r=0;rglobalRowStarts[r]; - - int coarseTotal = coarseOffsets[size]; - int coarseOffset = coarseOffsets[rank]; - - csr *A = level->A; - int N = (int) level->Nrows; - - int localNNZ; - int *rows; - int *cols; - dfloat *vals; - - if((rank==0)&&(parAlmond->options.compareArgs("VERBOSE","TRUE"))) printf("Setting up coarse solver...");fflush(stdout); - - if(!nullSpace) { - //if no nullspace, use sparse A - localNNZ = (int) (A->diagNNZ+A->offdNNZ); - - if (localNNZ) { - rows = (int *) calloc(localNNZ,sizeof(int)); - cols = (int *) calloc(localNNZ,sizeof(int)); - vals = (dfloat *) calloc(localNNZ,sizeof(dfloat)); - } - - //populate matrix - int cnt = 0; - for (int n=0;ndiagRowStarts[n]; - int end = (int) A->diagRowStarts[n+1]; - for (int m=start;mdiagCols[m] + coarseOffset); - vals[cnt] = A->diagCoefs[m]; - cnt++; - } - start = (int) A->offdRowStarts[n]; - end = (int) A->offdRowStarts[n+1]; - for (dlong m=A->offdRowStarts[n];moffdRowStarts[n+1];m++) { - rows[cnt] = n + coarseOffset; - cols[cnt] = (int) A->colMap[A->offdCols[m]]; - vals[cnt] = A->offdCoefs[m]; - cnt++; - } - } - } else { - localNNZ = (int) (A->Nrows*coarseTotal); //A is dense due to nullspace augmentation - - if (localNNZ) { - rows = (int *) calloc(localNNZ,sizeof(int)); - cols = (int *) calloc(localNNZ,sizeof(int)); - vals = (dfloat *) calloc(localNNZ,sizeof(dfloat)); - } - - //gather null vector - dfloat *nullTotal = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); - int *nullCounts = (int*) calloc(size,sizeof(int)); - for (int r=0;rnull, N, MPI_DFLOAT, nullTotal, nullCounts, coarseOffsets, MPI_DFLOAT, agmg::comm); - - //populate matrix - for (int n=0;ndiagRowStarts[n]; - int end = (int) A->diagRowStarts[n+1]; - for (int m=start;mdiagCols[m] + coarseOffset); - vals[n*coarseTotal+col] += A->diagCoefs[m]; - } - start = (int) A->offdRowStarts[n]; - end = (int) A->offdRowStarts[n+1]; - for (int m=start;mcolMap[A->offdCols[m]]; - vals[n*coarseTotal+col] += A->offdCoefs[m]; - } - } - } - - //ge the nonzero counts from all ranks - int *NNZ = (int*) calloc(size,sizeof(int)); - int *NNZoffsets = (int*) calloc(size+1,sizeof(int)); - MPI_Allgather(&localNNZ, 1, MPI_INT, NNZ, 1, MPI_INT, agmg::comm); - - int totalNNZ = 0; - for (int r=0;rinvCoarseA = (dfloat *) calloc(A->Nrows*coarseTotal,sizeof(dfloat)); - for (int n=0;ninvCoarseA[n*coarseTotal+m] = coarseA[(n+coarseOffset)*coarseTotal+m]; - } - } - - parAlmond->coarseTotal = coarseTotal; - parAlmond->coarseOffset = coarseOffset; - parAlmond->coarseOffsets = coarseOffsets; - parAlmond->coarseCounts = (int*) calloc(size,sizeof(int)); - for (int r=0;rcoarseCounts[r] = coarseOffsets[r+1]-coarseOffsets[r]; - - parAlmond->xCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); - parAlmond->rhsCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); - - if (localNNZ) { - free(rows); - free(cols); - free(vals); - } - - if (totalNNZ) { - free(Arows); - free(Acols); - free(Avals); - } - - if(coarseTotal) { - free(coarseA); - } - - if((rank==0)&&(parAlmond->options.compareArgs("VERBOSE","TRUE"))) printf("done.\n"); -} - - -void exactCoarseSolve(parAlmond_t *parAlmond, int N, dfloat *rhs, dfloat *x) { - - //gather the full vector - MPI_Allgatherv(rhs, N, MPI_DFLOAT, parAlmond->rhsCoarse, parAlmond->coarseCounts, parAlmond->coarseOffsets, MPI_DFLOAT, agmg::comm); - - //multiply by local part of the exact matrix inverse - #pragma omp parallel for - for (int n=0;ncoarseTotal;m++) { - x[n] += parAlmond->invCoarseA[n*parAlmond->coarseTotal+m]*parAlmond->rhsCoarse[m]; - } - } -} - -void device_exactCoarseSolve(parAlmond_t *parAlmond, int N, occa::memory o_rhs, occa::memory o_x) { - - dfloat *rhs = parAlmond->levels[parAlmond->numLevels-1]->rhs; - dfloat *x = parAlmond->levels[parAlmond->numLevels-1]->x; - - //use coarse solver - o_rhs.copyTo(rhs); - //gather the full vector - MPI_Allgatherv(rhs, N, MPI_DFLOAT, parAlmond->rhsCoarse, parAlmond->coarseCounts, parAlmond->coarseOffsets, MPI_DFLOAT, agmg::comm); - - //multiply by local part of the exact matrix inverse - #pragma omp parallel for - for (int n=0;ncoarseTotal;m++) { - x[n] += parAlmond->invCoarseA[n*parAlmond->coarseTotal+m]*parAlmond->rhsCoarse[m]; - } - } - - o_x.copyFrom(x); -} - -#if 0 -//set up exact solver using xxt -void setupExactSolve(parAlmond_t *parAlmond, agmgLevel *level, bool nullSpace, dfloat nullSpacePenalty) { - - int rank, size; - rank = agmg::rank; - size = agmg::size; - - int* coarseOffsets = level->globalRowStarts; - int coarseTotal = coarseOffsets[size]; - int coarseOffset = coarseOffsets[rank]; - - int *globalNumbering = (int *) calloc(coarseTotal,sizeof(int)); - for (int n=0;nA; - int N = level->Nrows; - - int totalNNZ; - int *rows; - int *cols; - dfloat *vals; - - if(!nullSpace) { - //if no nullspace, use sparse A - totalNNZ = A->diagNNZ+A->offdNNZ; - if (totalNNZ) { - rows = (int *) calloc(totalNNZ,sizeof(int)); - cols = (int *) calloc(totalNNZ,sizeof(int)); - vals = (dfloat *) calloc(totalNNZ,sizeof(dfloat)); - } - - //populate matrix - int cnt = 0; - for (int n=0;ndiagRowStarts[n];mdiagRowStarts[n+1];m++) { - rows[cnt] = n + coarseOffset; - cols[cnt] = A->diagCols[m] + coarseOffset; - vals[cnt] = A->diagCoefs[m]; - cnt++; - } - for (int m=A->offdRowStarts[n];moffdRowStarts[n+1];m++) { - rows[cnt] = n + coarseOffset; - cols[cnt] = A->colMap[A->offdCols[m]]; - vals[cnt] = A->offdCoefs[m]; - cnt++; - } - } - } else { - totalNNZ = A->Nrows*coarseTotal; //A is dense due to nullspace augmentation - if (totalNNZ) { - rows = (int *) calloc(totalNNZ,sizeof(int)); - cols = (int *) calloc(totalNNZ,sizeof(int)); - vals = (dfloat *) calloc(totalNNZ,sizeof(dfloat)); - } - - //gather null vector - dfloat *nullTotal = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); - int *nullCounts = (int*) calloc(size,sizeof(int)); - for (int r=0;rnull, A->Nrows, MPI_DFLOAT, nullTotal, nullCounts, coarseOffsets, MPI_DFLOAT, agmg::comm); - - //populate matrix - for (int n=0;ndiagRowStarts[n];mdiagRowStarts[n+1];m++) { - int col = A->diagCols[m] + coarseOffset; - vals[n*coarseTotal+col] += A->diagCoefs[m]; - } - for (int m=A->offdRowStarts[n];moffdRowStarts[n+1];m++) { - int col = A->colMap[A->offdCols[m]]; - vals[n*coarseTotal+col] += A->offdCoefs[m]; - } - } - } - - parAlmond->ExactSolve = xxtSetup(A->Nrows, - globalNumbering, - totalNNZ, - rows, - cols, - vals, - 0, - "int", - dfloatString); - - parAlmond->coarseTotal = coarseTotal; - parAlmond->coarseOffset = coarseOffset; - - parAlmond->xCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); - parAlmond->rhsCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat)); - - free(globalNumbering); - if (totalNNZ) { - free(rows); - free(cols); - free(vals); - } - - printf("Done UberCoarse setup\n"); -} - - -void exactCoarseSolve(parAlmond_t *parAlmond, int N, dfloat *rhs, dfloat *x) { - - //use coarse solver - for (int n=0;ncoarseTotal;n++) - parAlmond->rhsCoarse[n] =0.; - - for (int n=0;nrhsCoarse[n+parAlmond->coarseOffset] = rhs[n]; - - xxtSolve(parAlmond->xCoarse, parAlmond->ExactSolve, parAlmond->rhsCoarse); - - for (int n=0;nxCoarse[n+parAlmond->coarseOffset]; - -} - -void device_exactCoarseSolve(parAlmond_t *parAlmond, int N, occa::memory o_rhs, occa::memory o_x) { - - //use coarse solver - for (int n=0;ncoarseTotal;n++) - parAlmond->rhsCoarse[n] =0.; - - o_rhs.copyTo(parAlmond->rhsCoarse+parAlmond->coarseOffset); - xxtSolve(parAlmond->xCoarse, parAlmond->ExactSolve, parAlmond->rhsCoarse); - o_x.copyFrom(parAlmond->xCoarse+parAlmond->coarseOffset,N*sizeof(dfloat)); -} -#endif diff --git a/solvers/parALMOND/src/agmgMatrices.c b/solvers/parALMOND/src/agmgMatrices.c deleted file mode 100644 index 373fc52d3..000000000 --- a/solvers/parALMOND/src/agmgMatrices.c +++ /dev/null @@ -1,1155 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "agmg.h" - -csr * newCSRfromCOO(dlong N, hlong* globalRowStarts, - dlong nnz, hlong *Ai, hlong *Aj, dfloat *Avals){ - - int size, rank; - rank = agmg::rank; - size = agmg::size; - - csr *A = (csr *) calloc(1,sizeof(csr)); - - A->Nrows = N; - A->Ncols = N; - - A->NlocalCols = N; - - hlong globalOffset = globalRowStarts[rank]; - - //first, count number of local, and non-local non-zeros - dlong diagNNZ=0; - dlong offdNNZ=0; - for (dlong n=0;nglobalOffset+N-1)) offdNNZ++; - else diagNNZ++; - } - - dlong *diagAi, *diagAj; - dlong *offdAi; - hlong *offdAj; - dfloat *diagAvals, *offdAvals; - - if (diagNNZ) { - diagAi = (dlong *) calloc(diagNNZ, sizeof(dlong)); - diagAj = (dlong *) calloc(diagNNZ, sizeof(dlong)); - diagAvals = (dfloat *) calloc(diagNNZ, sizeof(dfloat)); - } - if (offdNNZ) { - offdAi = (dlong *) calloc(offdNNZ, sizeof(dlong)); - offdAj = (hlong *) calloc(offdNNZ, sizeof(hlong)); - offdAvals = (dfloat *) calloc(offdNNZ, sizeof(dfloat)); - } - - //split into local and non-local COO matrices - diagNNZ =0; - offdNNZ =0; - for (dlong n=0;nglobalOffset+N-1)) { - offdAi[offdNNZ] = (dlong) Ai[n] - globalOffset; //local index - offdAj[offdNNZ] = Aj[n]; //global index - offdAvals[offdNNZ] = Avals[n]; - offdNNZ++; - } else { - diagAi[diagNNZ] = (dlong) Ai[n] - globalOffset; //local index - diagAj[diagNNZ] = (dlong) Aj[n] - globalOffset; //local index - diagAvals[diagNNZ] = Avals[n]; - diagNNZ++; - } - } - - A->diagNNZ = diagNNZ; - A->offdNNZ = offdNNZ; - - if (N) { - A->diagRowStarts = (dlong *) calloc(N+1,sizeof(dlong)); - A->offdRowStarts = (dlong *) calloc(N+1,sizeof(dlong)); - } - if (diagNNZ) { - A->diagCols = (dlong *) calloc(diagNNZ, sizeof(dlong)); - A->diagCoefs = (dfloat *) calloc(diagNNZ, sizeof(dfloat)); - } - hlong* offdCols; - if (offdNNZ) { - offdCols = (hlong *) calloc(offdNNZ,sizeof(hlong)); - A->offdCols = (dlong *) calloc(offdNNZ,sizeof(dlong)); - A->offdCoefs = (dfloat *) calloc(offdNNZ, sizeof(dfloat)); - } - - // Convert to csr storage, assumes orginal matrix was presorted by rows - for(dlong n=0;ndiagRowStarts[row+1]++; - } - for(dlong n=0;noffdRowStarts[row+1]++; - } - //cumulative sum - for (dlong i=0;iNrows;i++) { - A->diagRowStarts[i+1] += A->diagRowStarts[i]; - A->offdRowStarts[i+1] += A->offdRowStarts[i]; - } - - //copy input data into struct - if (diagNNZ) { - for (dlong i=0; idiagRowStarts[i]; - int cnt = 1; - for (dlong j=A->diagRowStarts[i]; jdiagRowStarts[i+1]; j++) { - if (diagAj[j] == i) { //move diagonal to first entry - A->diagCols[start] = diagAj[j]; - A->diagCoefs[start] = diagAvals[j]; - } else { - A->diagCols[start+cnt] = diagAj[j]; - A->diagCoefs[start+cnt] = diagAvals[j]; - cnt++; - } - } - } - } - - //record global indexing of columns - A->colMap = (hlong *) calloc(A->Ncols, sizeof(hlong)); - for (dlong i=0;iNcols;i++) - A->colMap[i] = i + globalOffset; - - if (offdNNZ) { - for (dlong i=0; ioffdRowStarts[i]; - int cnt = 0; - for (dlong j=A->offdRowStarts[i]; joffdRowStarts[i+1]; j++) { - offdCols[start+cnt] = offdAj[j]; - A->offdCoefs[start+cnt] = offdAvals[j]; - cnt++; - } - } - - //we now need to reorder the x vector for the halo, and shift the column indices - hlong *col = (hlong *) calloc(A->offdNNZ,sizeof(hlong)); - for (dlong n=0;nNHalo = 0; - for (dlong n=1;nNHalo] = col[n]; - A->NHalo++; //number of unique columns - - A->Ncols += A->NHalo; - - //save global column ids in colMap - A->colMap = (hlong *) realloc(A->colMap, A->Ncols*sizeof(hlong)); - for (dlong n=0; nNHalo; n++) - A->colMap[n+A->NlocalCols] = col[n]; - free(col); - - //shift the column indices to local indexing - for (dlong n=0;nNlocalCols;mNcols;m++) { - if (gcol == A->colMap[m]) - A->offdCols[n] = m; - } - } - } - - if (diagNNZ) { - free(diagAi); - free(diagAj); - free(diagAvals); - } - if (offdNNZ) { - free(offdAi); - free(offdAj); - free(offdAvals); - free(offdCols); - } - - csrHaloSetup(A,globalRowStarts); - - return A; -} - -void freeCSR(csr *A) { - if (A->diagNNZ) { - free(A->diagRowStarts); - free(A->diagCols); - free(A->diagCoefs); - } - if (A->offdNNZ) { - free(A->offdRowStarts); - free(A->offdCols); - free(A->offdCoefs); - } - if (A->Ncols) { - free(A->colMap); - } - free(A->haloSendRequests); - free(A->haloRecvRequests); - free(A->NsendPairs); - free(A->NrecvPairs); - if (A->NsendTotal) { - free(A->sendBuffer); - free(A->haloElementList); - } - - free(A); -} - -//create a device version of a coo matrix -dcoo *newDCOO(parAlmond_t *parAlmond, csr *B){ - - dcoo *A = (dcoo *) calloc(1,sizeof(dcoo)); - - A->Nrows = B->Nrows; - A->Ncols = B->Ncols; - - A->NHalo = B->NHalo; - A->NlocalCols = B->NlocalCols; - - A->diagNNZ = B->diagNNZ; - A->offdNNZ = B->offdNNZ; - - dlong *diagRows; - dlong *offdRows; - if (B->diagNNZ) - diagRows = (dlong *) calloc(B->diagNNZ,sizeof(dlong)); - if (B->offdNNZ) - offdRows = (dlong *) calloc(B->offdNNZ,sizeof(dlong)); - - dlong diagCnt =0; - dlong offdCnt =0; - for (dlong i=0;iNrows;i++) { - for (dlong j=B->diagRowStarts[i];jdiagRowStarts[i+1];j++) - diagRows[diagCnt++] = i; - - for (dlong j=B->offdRowStarts[i];joffdRowStarts[i+1];j++) - offdRows[offdCnt++] = i; - } - - //copy to device - if(B->diagNNZ){ - A->o_diagRows = parAlmond->device.malloc(A->diagNNZ*sizeof(dlong), diagRows); - A->o_diagCols = parAlmond->device.malloc(A->diagNNZ*sizeof(dlong), B->diagCols); - A->o_diagCoefs = parAlmond->device.malloc(A->diagNNZ*sizeof(dfloat), B->diagCoefs); - } - if(B->offdNNZ){ - A->o_offdRows = parAlmond->device.malloc(A->offdNNZ*sizeof(dlong), offdRows); - A->o_offdCols = parAlmond->device.malloc(A->offdNNZ*sizeof(dlong), B->offdCols); - A->o_offdCoefs = parAlmond->device.malloc(A->offdNNZ*sizeof(dfloat), B->offdCoefs); - } - - A->NrecvTotal = B->NrecvTotal; - A->NsendTotal = B->NsendTotal; - A->haloElementList = B->haloElementList; - if (A->NsendTotal) - A->o_haloElementList = parAlmond->device.malloc(A->NsendTotal*sizeof(dlong),A->haloElementList); - A->NsendPairs = B->NsendPairs; - A->NrecvPairs = B->NrecvPairs; - A->NsendMessages = B->NsendMessages; - A->NrecvMessages = B->NrecvMessages; - - if (A->NrecvTotal) A->recvBuffer = (dfloat *) malloc(A->NrecvTotal*sizeof(dfloat)); - if (A->NsendTotal) { -#if 0 - occa::memory o_haloBuffer = parAlmond->device.mappedAlloc(A->NsendTotal*sizeof(dfloat), NULL); - A->sendBuffer = (dfloat*) o_haloBuffer.getMappedPointer(); -#endif - A->sendBuffer = (dfloat*) occaHostMallocPinned(parAlmond->device, A->NsendTotal*sizeof(dfloat), NULL, A->o_haloBuffer); - } - - A->haloSendRequests = B->haloSendRequests; - A->haloRecvRequests = B->haloRecvRequests; - - return A; -} - -hyb * newHYB(parAlmond_t *parAlmond, csr *csrA) { - - hyb *A = (hyb *) calloc(1,sizeof(hyb)); - - A->Nrows = csrA->Nrows; - A->Ncols = csrA->Ncols; - - A->NlocalCols = csrA->NlocalCols; - A->NHalo = csrA->NHalo; - - int *rowCounters; - if (csrA->Nrows) - rowCounters = (int*) calloc(csrA->Nrows, sizeof(int)); - - int maxNnzPerRow = 0; - int minNnzPerRow = 0; - if (csrA->Nrows) - minNnzPerRow = (int) csrA->diagRowStarts[1] - csrA->diagRowStarts[0]; - - for(dlong i=0; iNrows; i++) { - int rowNnz = (int) csrA->diagRowStarts[i+1] - csrA->diagRowStarts[i]; - rowCounters[i] = rowNnz; - - maxNnzPerRow = (rowNnz > maxNnzPerRow) ? rowNnz : maxNnzPerRow; - minNnzPerRow = (rowNnz < minNnzPerRow) ? rowNnz : minNnzPerRow; - } - - // create bins - int numBins = maxNnzPerRow - minNnzPerRow + 1; - - //zero row check - if (numBins<0) numBins =0; - - int *bins; - if (numBins) - bins = (int *) calloc(numBins, sizeof(int)); - - for(dlong i=0; iNrows; i++){ - bins[rowCounters[i]-minNnzPerRow]++; - } - - dfloat threshold = 2.0/3.0; - dlong totalNNZ = csrA->diagNNZ+csrA->offdNNZ; - int nnzPerRow = 0; - dlong nnz = 0; - - //increase the nnz per row in E until it holds threshold*totalnnz nonzeros - for(int i=0; i threshold*totalNNZ)||(i==numBins-1)){ - nnzPerRow = i+minNnzPerRow; - break; - } - } - - A->E = (ell *) calloc(1, sizeof(ell)); - - A->E->Nrows = csrA->Nrows; - A->E->Ncols = csrA->Ncols; - A->E->nnzPerRow = nnzPerRow; - A->E->strideLength = csrA->Nrows; - - dlong *Ecols; - dfloat *Ecoefs; - if(nnzPerRow&&csrA->Nrows){ - Ecols = (dlong *) calloc(csrA->Nrows*nnzPerRow, sizeof(dlong)); - Ecoefs = (dfloat *) calloc(csrA->Nrows*nnzPerRow, sizeof(dfloat)); - } - - dlong nnzC = 0; - - // count the number of nonzeros to be stored in coo format - for(dlong i=0; iNrows; i++) { - //excess from row in diag - if(rowCounters[i] > nnzPerRow) nnzC += (rowCounters[i] - nnzPerRow); - - //all of offd - int offdRowNnz = (int) csrA->offdRowStarts[i+1]-csrA->offdRowStarts[i]; - - nnzC += offdRowNnz; - } - - A->E->actualNNZ = totalNNZ - nnzC; - - A->C = (coo *) calloc(1, sizeof(coo)); - - A->C->Nrows = csrA->Nrows; - A->C->Ncols = csrA->Ncols; - A->C->nnz = nnzC; - - dlong *Coffsets; - dlong *Ccols; - dfloat *Ccoefs; - - Coffsets = (dlong *) calloc(csrA->Nrows+1, sizeof(dlong)); - if (nnzC) { - Ccols = (dlong *) calloc(nnzC, sizeof(dlong)); - Ccoefs = (dfloat *) calloc(nnzC, sizeof(dfloat)); - } - - nnzC = 0; - for(dlong i=0; iNrows; i++){ - dlong Jstart = csrA->diagRowStarts[i]; - dlong Jend = csrA->diagRowStarts[i+1]; - int rowNnz = (int) Jend - Jstart; - - // store only min of nnzPerRow and rowNnz - int maxNnz = (nnzPerRow >= rowNnz) ? rowNnz : nnzPerRow; - - for(int c=0; cE->strideLength] = csrA->diagCols[Jstart+c]; - Ecoefs[i+c*A->E->strideLength] = csrA->diagCoefs[Jstart+c]; - } - - // store the remaining in coo format - if(rowNnz > nnzPerRow){ - for(int c=nnzPerRow; cdiagCols[Jstart+c]; - Ccoefs[nnzC] = csrA->diagCoefs[Jstart+c]; - nnzC++; - } - } - - //add the offd non-zeros - for (dlong j=csrA->offdRowStarts[i];joffdRowStarts[i+1];j++) { - Coffsets[i+1]++; - Ccols[nnzC] = csrA->offdCols[j]; - Ccoefs[nnzC] = csrA->offdCoefs[j]; - nnzC++; - } - } - - //use counts to create offsets - for (dlong i=0;iNrows;i++) - Coffsets[i+1] += Coffsets[i]; - - // copy the data to device memory - if(csrA->Nrows) { - free(rowCounters); free(bins); - } - - //copy null vector if present - if(csrA->null&&csrA->Nrows) - A->o_null = parAlmond->device.malloc(csrA->Nrows*sizeof(dfloat), csrA->null); - - if (csrA->diagInv&&csrA->Nrows) - A->o_diagInv = parAlmond->device.malloc(csrA->Nrows*sizeof(dfloat), csrA->diagInv); - - if(A->E->nnzPerRow&&csrA->Nrows){ - A->E->o_cols = parAlmond->device.malloc(csrA->Nrows*nnzPerRow*sizeof(dlong), Ecols); - A->E->o_coefs = parAlmond->device.malloc(csrA->Nrows*nnzPerRow*sizeof(dfloat), Ecoefs); - free(Ecols); free(Ecoefs); - } - - if(A->C->nnz){ - A->C->o_offsets = parAlmond->device.malloc((csrA->Nrows+1)*sizeof(dlong), Coffsets); - A->C->o_cols = parAlmond->device.malloc(A->C->nnz*sizeof(dlong), Ccols); - A->C->o_coefs = parAlmond->device.malloc(A->C->nnz*sizeof(dfloat), Ccoefs); - - free(Ccols); free(Ccoefs); - } - - free(Coffsets); - - A->NrecvTotal = csrA->NrecvTotal; - A->NsendTotal = csrA->NsendTotal; - A->haloElementList = csrA->haloElementList; - if (A->NsendTotal) A->o_haloElementList = parAlmond->device.malloc(A->NsendTotal*sizeof(dlong),A->haloElementList); - A->NsendPairs = csrA->NsendPairs; - A->NrecvPairs = csrA->NrecvPairs; - A->NsendMessages = csrA->NsendMessages; - A->NrecvMessages = csrA->NrecvMessages; - A->haloSendRequests = csrA->haloSendRequests; - A->haloRecvRequests = csrA->haloRecvRequests; - - if (A->NrecvTotal) A->recvBuffer = (dfloat *) malloc(A->NrecvTotal*sizeof(dfloat)); - if (A->NsendTotal) { -#if 0 - occa::memory o_haloBuffer = parAlmond->device.mappedAlloc(A->NsendTotal*sizeof(dfloat), NULL); - A->sendBuffer = (dfloat*) o_haloBuffer.getMappedPointer(); -#endif - A->sendBuffer = (dfloat*) occaHostMallocPinned(parAlmond->device, A->NsendTotal*sizeof(dfloat), NULL, A->o_haloBuffer); - } - - return A; -} - - -void axpy(csr *A, dfloat alpha, dfloat *x, dfloat beta, dfloat *y, bool nullSpace, dfloat nullSpacePenalty) { - - dfloat alphaG = 0.; - - if (A->NsendTotal + A->NrecvTotal) - csrHaloExchangeStart(A, sizeof(dfloat), x, A->sendBuffer, x+A->NlocalCols); - - // y[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j]) - #pragma omp parallel for - for(dlong i=0; iNrows; i++){ //local - dfloat result = 0.0; - for(dlong jj=A->diagRowStarts[i]; jjdiagRowStarts[i+1]; jj++) - result += (A->diagCoefs[jj]*x[A->diagCols[jj]]); - - y[i] = alpha*result + beta*y[i]; - } - - //rank 1 correction if there is a nullspace - if (nullSpace) { - dfloat alphaL = innerProd(A->Nrows, A->null, x); - MPI_Allreduce(&alphaL, &alphaG, 1, MPI_DFLOAT, MPI_SUM, agmg::comm); - alphaG *= nullSpacePenalty; - } - - if (A->NsendTotal + A->NrecvTotal) - csrHaloExchangeFinish(A); - - #pragma omp parallel for - for(dlong i=0; iNrows; i++){ //nonlocal - dfloat result = 0.0; - for(dlong jj=A->offdRowStarts[i]; jjoffdRowStarts[i+1]; jj++) - result += (A->offdCoefs[jj]*x[A->offdCols[jj]]); - - y[i] += alpha*result; - } - - //add the correction - if (nullSpace) - vectorAdd(A->Nrows, alpha*alphaG, A->null, 1., y); -} - -void axpy(parAlmond_t *parAlmond, dcoo *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y) { - - occaTimerTic(parAlmond->device,"dcoo axpy"); - if (A->NsendTotal) { - parAlmond->device.finish(); - parAlmond->device.setStream(parAlmond->dataStream); - parAlmond->haloExtract(A->NsendTotal, 1, A->o_haloElementList, o_x, A->o_haloBuffer); - - //copy from device - A->o_haloBuffer.copyTo(A->sendBuffer,"async: true"); - parAlmond->device.setStream(parAlmond->defaultStream); - } - - if (A->NsendTotal + A->NrecvTotal){ - parAlmond->device.setStream(parAlmond->dataStream); - parAlmond->device.finish(); - dcooHaloExchangeStart(A, sizeof(dfloat), A->sendBuffer, A->recvBuffer); - parAlmond->device.setStream(parAlmond->defaultStream); - } - - if (A->diagNNZ) - parAlmond->agg_interpolateKernel(A->diagNNZ, A->o_diagRows, A->o_diagCols, A->o_diagCoefs, o_x, o_y); - - if (A->NsendTotal + A->NrecvTotal) - dcooHaloExchangeFinish(A); - - //copy back to device - if(A->NrecvTotal){ - parAlmond->device.setStream(parAlmond->dataStream); - o_x.copyFrom(A->recvBuffer,A->NrecvTotal*sizeof(dfloat),A->NlocalCols*sizeof(dfloat),"async: true"); - parAlmond->device.finish(); - parAlmond->device.setStream(parAlmond->defaultStream); - parAlmond->device.finish(); - } - - if (A->offdNNZ) - parAlmond->agg_interpolateKernel(A->offdNNZ, A->o_offdRows, A->o_offdCols, A->o_offdCoefs, o_x, o_y); - - occaTimerToc(parAlmond->device,"dcoo axpy"); -} - -void axpy(parAlmond_t *parAlmond, hyb *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y, bool nullSpace, dfloat nullSpacePenalty) { - - dfloat alphaG = 0.; - - occaTimerTic(parAlmond->device,"hyb axpy"); - if (A->NsendTotal) { - parAlmond->device.finish(); - parAlmond->device.setStream(parAlmond->dataStream); - - parAlmond->haloExtract(A->NsendTotal, 1, A->o_haloElementList, o_x, A->o_haloBuffer); - - //copy from device - A->o_haloBuffer.copyTo(A->sendBuffer,"async: true"); - - parAlmond->device.setStream(parAlmond->defaultStream); - } - - // y <-- alpha*E*x+beta*y - axpy(parAlmond, A->E, alpha, o_x, beta, o_y); - - if (A->NsendTotal+A->NrecvTotal){ - parAlmond->device.setStream(parAlmond->dataStream); - parAlmond->device.finish(); - hybHaloExchangeStart(A, sizeof(dfloat),A->sendBuffer, A->recvBuffer); - parAlmond->device.setStream(parAlmond->defaultStream); - } - - //rank 1 correction if there is a nullspace - if (nullSpace) { - dfloat alphaL = innerProd(parAlmond, A->Nrows, A->o_null, o_x); - MPI_Allreduce(&alphaL, &alphaG, 1, MPI_DFLOAT, MPI_SUM, agmg::comm); - alphaG *= nullSpacePenalty; - } - - if (A->NsendTotal+A->NrecvTotal) - hybHaloExchangeFinish(A); - - //copy back to device - if (A->NrecvTotal){ - parAlmond->device.setStream(parAlmond->dataStream); - o_x.copyFrom(A->recvBuffer,A->NrecvTotal*sizeof(dfloat),A->NlocalCols*sizeof(dfloat),"async: true"); - parAlmond->device.finish(); - parAlmond->device.setStream(parAlmond->defaultStream); - parAlmond->device.finish(); - } - - // y <-- alpha*C*x + y - if (A->C->nnz) - ax(parAlmond, A->C, alpha, o_x, o_y); - - //add the correction - if (nullSpace) - vectorAdd(parAlmond, A->Nrows, alpha*alphaG, A->o_null, 1., o_y); - - occaTimerToc(parAlmond->device,"hyb axpy"); -} - -void axpy(parAlmond_t *parAlmond, ell *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y) { - - if(A->actualNNZ){ - occaTimerTic(parAlmond->device,"ell axpy"); - parAlmond->ellAXPYKernel(A->Nrows, A->nnzPerRow, A->strideLength, - alpha, beta, A->o_cols, A->o_coefs, o_x, o_y); - occaTimerToc(parAlmond->device,"ell axpy"); - } -} - -void ax(parAlmond_t *parAlmond, coo *C, dfloat alpha, occa::memory o_x, occa::memory o_y) { - - // do block-wise product - if(C->nnz){ - occaTimerTic(parAlmond->device,"coo ax"); - parAlmond->cooAXKernel(C->Nrows, alpha, C->o_offsets, C->o_cols, C->o_coefs,o_x, o_y); - occaTimerToc(parAlmond->device,"coo ax"); - } -} - -void smoothJacobi(parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero) { - - // x = x + inv(D)*(b-A*x) - if(x_is_zero){ - #pragma omp parallel for - for(dlong i=0; iNrows; i++){ - x[i] = A->diagInv[i]*r[i]; - } - return; - } - - dfloat *res = level->smootherResidual; - #pragma omp parallel for - for(dlong i=0; iNrows; i++){ - res[i] = r[i]; - } - - axpy(A, -1.0, x, 1.0, res,parAlmond->nullSpace,parAlmond->nullSpacePenalty); - - // update x - #pragma omp parallel for - for (dlong i=0;iNrows;i++) - x[i] = x[i] + A->diagInv[i]*res[i]; - -} - - -void smoothDampedJacobi(parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero) { - - // dfloat alphaG = 0.; - dfloat alpha = level->smoother_params[0]; - - if(x_is_zero){ - #pragma omp parallel for - for(dlong i=0; iNrows; i++){ - x[i] = alpha*A->diagInv[i]*r[i]; - } - return; - } - - dfloat *res = level->smootherResidual; - #pragma omp parallel for - for(dlong i=0; iNrows; i++){ - res[i] = r[i]; - } - - axpy(A, -1.0, x, 1.0, res,parAlmond->nullSpace,parAlmond->nullSpacePenalty); - - // copy the buffer vector to x - #pragma omp parallel for - for (dlong i=0;iNrows;i++) - x[i] = x[i] + alpha*A->diagInv[i]*res[i]; -} - -void smoothChebyshev(parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero) { - - dfloat lambdaN = level->smoother_params[0]; - dfloat lambda1 = level->smoother_params[1]; - - dfloat theta = 0.5*(lambdaN+lambda1); - dfloat delta = 0.5*(lambdaN-lambda1); - dfloat invTheta = 1.0/theta; - dfloat sigma = theta/delta; - dfloat rho_n = 1./sigma; - dfloat rho_np1; - - dfloat *res = level->smootherResidual; - dfloat *Ad = level->smootherResidual2; - dfloat *d = level->smootherUpdate; - - // dfloat alphaG = 0.; - - if(x_is_zero){ //skip the Ax if x is zero - #pragma omp parallel for - for(dlong i=0; iNrows; i++){ - res[i] = A->diagInv[i]*r[i]; - x[i] = 0.; - d[i] = invTheta*res[i]; - } - } else { - - level->Ax(level->AxArgs,x,res); - - #pragma omp parallel for - for(dlong i=0; iNrows; i++){ - res[i] = A->diagInv[i]*(r[i]-res[i]); - d[i] = invTheta*res[i]; - } - } - - for (int k=0;kChebyshevIterations;k++) { - //x_k+1 = x_k + d_k - vectorAdd(A->Nrows, 1.0, d, 1.0, x); - - //r_k+1 = r_k - D^{-1}Ad_k - level->Ax(level->AxArgs,d,Ad); - #pragma omp parallel for - for(dlong i=0; iNrows; i++) { - res[i] = res[i] - A->diagInv[i]*Ad[i]; - } - - rho_np1 = 1.0/(2.*sigma-rho_n); - - //d_k+1 = rho_k+1*rho_k*d_k + 2*rho_k+1*r_k+1/delta - vectorAdd(A->Nrows, 2.0*rho_np1/delta, res, rho_np1*rho_n, d); - rho_n = rho_np1; - } - //x_k+1 = x_k + d_k - vectorAdd(A->Nrows, 1.0, d, 1.0, x); -} - -void smoothJacobi(parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero) { - - // dfloat alphaG = 0.; - - occaTimerTic(parAlmond->device,"hyb smoothJacobi"); - if(x_is_zero){ - if (A->Nrows) - dotStar(parAlmond, A->Nrows, 1.0, A->o_diagInv, o_r, 0.0, o_x); - occaTimerToc(parAlmond->device,"hyb smoothJacobi"); - return; - } - - occa::memory o_res = level->o_smootherResidual; - - o_res.copyFrom(o_r,A->Nrows*sizeof(dfloat)); - axpy(parAlmond, A, -1.0, o_x, 1.0, o_res,parAlmond->nullSpace,parAlmond->nullSpacePenalty); - - // x = x + inv(D)*(r-A*x) - dotStar(parAlmond, A->Nrows, 1.0, A->o_diagInv, o_res, 1.0, o_x); - occaTimerToc(parAlmond->device,"hyb smoothJacobi"); -} - -void smoothDampedJacobi(parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero){ - - // dfloat alphaG = 0.; - dfloat alpha = level->smoother_params[0]; - - occaTimerTic(parAlmond->device,"hyb smoothDampedJacobi"); - if(x_is_zero){ - if (A->Nrows) - dotStar(parAlmond, A->Nrows, alpha, A->o_diagInv, o_r, 0.0, o_x); - occaTimerToc(parAlmond->device,"hyb smoothDampedJacobi"); - return; - } - - occa::memory o_res = level->o_smootherResidual; - - o_res.copyFrom(o_r,A->Nrows*sizeof(dfloat)); - axpy(parAlmond, A, -1.0, o_x, 1.0, o_res,parAlmond->nullSpace,parAlmond->nullSpacePenalty); - - // x = x + alpha*inv(D)*(r-A*x) - dotStar(parAlmond, A->Nrows, alpha, A->o_diagInv, o_res, 1.0, o_x); - occaTimerToc(parAlmond->device,"hyb smoothDampedJacobi"); -} - -void smoothChebyshev(parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero) { - - dfloat lambdaN = level->smoother_params[0]; - dfloat lambda1 = level->smoother_params[1]; - - dfloat theta = 0.5*(lambdaN+lambda1); - dfloat delta = 0.5*(lambdaN-lambda1); - dfloat invTheta = 1.0/theta; - dfloat sigma = theta/delta; - dfloat rho_n = 1./sigma; - dfloat rho_np1; - - occa::memory o_res = level->o_smootherResidual; - occa::memory o_Ad = level->o_smootherResidual2; - occa::memory o_d = level->o_smootherUpdate; - - // dfloat alphaG = 0.; - - occaTimerTic(parAlmond->device,"hyb smoothChebyshev"); - - if(x_is_zero){ //skip the Ax if x is zero - //res = D^{-1}r - dotStar(parAlmond, A->Nrows, 1.0, A->o_diagInv, o_r, 0.0, o_res); - setVector(parAlmond, A->Nrows, o_x, 0.0); - //d = invTheta*res - vectorAdd(parAlmond, A->Nrows, invTheta, o_res, 0.0, o_d); - - } else { - - //res = D^{-1}(r-Ax) - level->device_Ax(level->AxArgs,o_x,o_res); - vectorAdd(parAlmond, A->Nrows, 1.0, o_r, -1.0, o_res); - dotStar(parAlmond, A->Nrows, A->o_diagInv, o_res); - - //d = invTheta*res - vectorAdd(parAlmond, A->Nrows, invTheta, o_res, 0.0, o_d); - } - - for (int k=0;kChebyshevIterations;k++) { - //x_k+1 = x_k + d_k - vectorAdd(parAlmond, A->Nrows, 1.0, o_d, 1.0, o_x); - - //r_k+1 = r_k - D^{-1}Ad_k - level->device_Ax(level->AxArgs,o_d,o_Ad); - dotStar(parAlmond, A->Nrows, -1.0, A->o_diagInv, o_Ad, 1.0, o_res); - - rho_np1 = 1.0/(2.*sigma-rho_n); - - //d_k+1 = rho_k+1*rho_k*d_k + 2*rho_k+1*r_k+1/delta - vectorAdd(parAlmond, A->Nrows, 2.0*rho_np1/delta, o_res, rho_np1*rho_n, o_d); - rho_n = rho_np1; - } - //x_k+1 = x_k + d_k - vectorAdd(parAlmond, A->Nrows, 1.0, o_d, 1.0, o_x); - - occaTimerToc(parAlmond->device,"hyb smoothChebyshev"); -} - - -// set up halo infomation for inter-processor MPI -// exchange of trace nodes -void csrHaloSetup(csr *A, hlong *globalColStarts){ - - // MPI info - int rank, size; - rank = agmg::rank; - size = agmg::size; - - // non-blocking MPI isend/irecv requests (used in meshHaloExchange) - A->haloSendRequests = calloc(size, sizeof(MPI_Request)); - A->haloRecvRequests = calloc(size, sizeof(MPI_Request)); - - // count number of halo element nodes to swap - A->NrecvTotal = 0; - A->NsendPairs = (int*) calloc(size, sizeof(int)); - A->NrecvPairs = (int*) calloc(size, sizeof(int)); - for(dlong n=A->NlocalCols;nNcols;++n){ //for just the halo - hlong id = A->colMap[n]; // global index - for (int r=0;rNrecvTotal++; - A->NrecvPairs[r]++; - } - } - } - - MPI_Alltoall(A->NrecvPairs, 1, MPI_INT, A->NsendPairs, 1, MPI_INT, agmg::comm); - - A->NsendTotal = 0; - for (int r=0;rNsendTotal += A->NsendPairs[r]; - - hlong *ghaloElementList; - if (A->NsendTotal) { - ghaloElementList = (hlong *) calloc(A->NsendTotal,sizeof(hlong)); - A->haloElementList = (dlong *) calloc(A->NsendTotal,sizeof(dlong)); - } - - // count number of MPI messages in halo exchange - A->NsendMessages = 0; - A->NrecvMessages = 0; - for(int r=0;rNsendPairs[r]) - A->NsendMessages++; - if(A->NrecvPairs[r]) - A->NrecvMessages++; - } - - //exchange the needed ids - int tag = 999; - dlong recvOffset = A->NlocalCols; - int sendOffset = 0; - int sendMessage = 0, recvMessage = 0; - for(int r=0;rNsendPairs[r]) { - MPI_Irecv(ghaloElementList+sendOffset, A->NsendPairs[r], MPI_HLONG, r, tag, - agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage); - sendOffset += A->NsendPairs[r]; - ++sendMessage; - } - if(A->NrecvPairs[r]){ - MPI_Isend(A->colMap+recvOffset, A->NrecvPairs[r], MPI_HLONG, r, tag, - agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage); - recvOffset += A->NrecvPairs[r]; - ++recvMessage; - } - } - - // Wait for all sent messages to have left and received messages to have arrived - MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status)); - MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status)); - - MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus); - MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus); - - free(recvStatus); - free(sendStatus); - - //shift to local ids - for (int n=0;nNsendTotal;n++) - A->haloElementList[n] = (dlong) ghaloElementList[n] - globalColStarts[rank]; - - if (A->NsendTotal) - A->sendBuffer = (dfloat *) calloc(A->NsendTotal,sizeof(dfloat)); - - A->totalHaloPairs = A->NsendTotal+A->NrecvTotal; -} - -void csrHaloExchange(csr *A, - size_t Nbytes, // message size per element - void *sourceBuffer, - void *sendBuffer, // temporary buffer - void *recvBuffer) { - // MPI info - int rank, size; - rank = agmg::rank; - size = agmg::size; - - int tag = 999; - - // copy data from outgoing elements into temporary send buffer - for(int i=0;iNsendTotal;++i){ - // outgoing element - dlong id = A->haloElementList[i]; - - memcpy(((char*)sendBuffer)+i*Nbytes, ((char*)sourceBuffer)+id*Nbytes, Nbytes); - } - - // initiate immediate send and receives to each other process as needed - int recvOffset = 0; - int sendOffset = 0; - int sendMessage = 0, recvMessage = 0; - for(int r=0;rNrecvTotal) { - if(A->NrecvPairs[r]) { - MPI_Irecv(((char*)recvBuffer)+recvOffset, A->NrecvPairs[r]*Nbytes, MPI_CHAR, r, tag, - agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage); - recvOffset += A->NrecvPairs[r]*Nbytes; - ++recvMessage; - } - } - if (A->NsendTotal) { - if(A->NsendPairs[r]){ - MPI_Isend(((char*)sendBuffer)+sendOffset, A->NsendPairs[r]*Nbytes, MPI_CHAR, r, tag, - agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage); - sendOffset += A->NsendPairs[r]*Nbytes; - ++sendMessage; - } - } - } - - // Wait for all sent messages to have left and received messages to have arrived - if (A->NsendTotal) { - MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus); - free(sendStatus); - } - if (A->NrecvTotal) { - MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus); - free(recvStatus); - } -} - -void csrHaloExchangeStart(csr *A, - size_t Nbytes, // message size per element - void *sourceBuffer, - void *sendBuffer, // temporary buffer - void *recvBuffer) { - // MPI info - int rank, size; - rank = agmg::rank; - size = agmg::size; - - int tag = 999; - - // copy data from outgoing elements into temporary send buffer - for(int i=0;iNsendTotal;++i){ - // outgoing element - dlong id = A->haloElementList[i]; - - memcpy(((char*)sendBuffer)+i*Nbytes, ((char*)sourceBuffer)+id*Nbytes, Nbytes); - } - - // initiate immediate send and receives to each other process as needed - int recvOffset = 0; - int sendOffset = 0; - int sendMessage = 0, recvMessage = 0; - for(int r=0;rNrecvTotal) { - if(A->NrecvPairs[r]) { - MPI_Irecv(((char*)recvBuffer)+recvOffset, A->NrecvPairs[r]*Nbytes, MPI_CHAR, r, tag, - agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage); - recvOffset += A->NrecvPairs[r]*Nbytes; - ++recvMessage; - } - } - if (A->NsendTotal) { - if(A->NsendPairs[r]){ - MPI_Isend(((char*)sendBuffer)+sendOffset, A->NsendPairs[r]*Nbytes, MPI_CHAR, r, tag, - agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage); - sendOffset += A->NsendPairs[r]*Nbytes; - ++sendMessage; - } - } - } -} - -void csrHaloExchangeFinish(csr *A) { - // Wait for all sent messages to have left and received messages to have arrived - if (A->NsendTotal) { - MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus); - free(sendStatus); - } - if (A->NrecvTotal) { - MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus); - free(recvStatus); - } -} - -void dcooHaloExchangeStart(dcoo *A, size_t Nbytes, void *sendBuffer, void *recvBuffer) { - // MPI info - int rank, size; - rank = agmg::rank; - size = agmg::size; - - // count outgoing and incoming meshes - int tag = 999; - - // initiate immediate send and receives to each other process as needed - int recvOffset = 0; - int sendOffset = 0; - int sendMessage = 0, recvMessage = 0; - for(int r=0;rNrecvTotal) { - if(A->NrecvPairs[r]) { - MPI_Irecv(((char*)A->recvBuffer)+recvOffset, A->NrecvPairs[r]*Nbytes, MPI_CHAR, r, tag, - agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage); - recvOffset += A->NrecvPairs[r]*Nbytes; - ++recvMessage; - } - } - if (A->NsendTotal) { - if(A->NsendPairs[r]){ - MPI_Isend(((char*)A->sendBuffer)+sendOffset, A->NsendPairs[r]*Nbytes, MPI_CHAR, r, tag, - agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage); - sendOffset += A->NsendPairs[r]*Nbytes; - ++sendMessage; - } - } - } -} - -void dcooHaloExchangeFinish(dcoo *A) { - // Wait for all sent messages to have left and received messages to have arrived - if (A->NsendTotal) { - MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus); - free(sendStatus); - } - if (A->NrecvTotal) { - MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus); - free(recvStatus); - } -} - -void hybHaloExchangeStart(hyb *A, size_t Nbytes, void *sendBuffer, void *recvBuffer) { - // MPI info - int rank, size; - rank = agmg::rank; - size = agmg::size; - - // count outgoing and incoming meshes - int tag = 999; - - // initiate immediate send and receives to each other process as needed - int recvOffset = 0; - int sendOffset = 0; - int sendMessage = 0, recvMessage = 0; - for(int r=0;rNrecvTotal) { - if(A->NrecvPairs[r]) { - MPI_Irecv(((char*)recvBuffer)+recvOffset, A->NrecvPairs[r]*Nbytes, MPI_CHAR, r, tag, - agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage); - recvOffset += A->NrecvPairs[r]*Nbytes; - ++recvMessage; - } - } - if (A->NsendTotal) { - if(A->NsendPairs[r]){ - MPI_Isend(((char*)sendBuffer)+sendOffset, A->NsendPairs[r]*Nbytes, MPI_CHAR, r, tag, - agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage); - sendOffset += A->NsendPairs[r]*Nbytes; - ++sendMessage; - } - } - } -} - -void hybHaloExchangeFinish(hyb *A) { - // Wait for all sent messages to have left and received messages to have arrived - if (A->NsendTotal) { - MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus); - free(sendStatus); - } - if (A->NrecvTotal) { - MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus); - free(recvStatus); - } -} - diff --git a/solvers/parALMOND/src/agmgSetup.c b/solvers/parALMOND/src/agmgSetup.c deleted file mode 100644 index 8464ab11a..000000000 --- a/solvers/parALMOND/src/agmgSetup.c +++ /dev/null @@ -1,1757 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "agmg.h" - -csr *strong_graph(csr *A, dfloat threshold); -bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong i); -hlong *form_aggregates(agmgLevel *level, csr *C); -void find_aggregate_owners(agmgLevel *level, hlong* FineToCoarse, setupAide options); -csr *construct_interpolator(agmgLevel *level, hlong *FineToCoarse, dfloat **nullCoarseA); -csr *transpose(agmgLevel* level, csr *A, hlong *globalRowStarts, hlong *globalColStarts); -csr *galerkinProd(agmgLevel *level, csr *R, csr *A, csr *P); -void coarsenAgmgLevel(agmgLevel *level, csr **coarseA, csr **P, csr **R, dfloat **nullCoarseA, setupAide options); - - -void agmgSetup(parAlmond_t *parAlmond, csr *A, dfloat *nullA, hlong *globalRowStarts, setupAide options){ - - int rank, size; - rank = agmg::rank; - size = agmg::size; - - // approximate Nrows at coarsest level - int gCoarseSize = 1000; - - double seed = (double) rank; - srand48(seed); - - agmgLevel **levels = parAlmond->levels; - - int lev = parAlmond->numLevels; //add this level to the end of the chain - - levels[lev] = (agmgLevel *) calloc(1,sizeof(agmgLevel)); - levels[lev]->gatherLevel = false; - levels[lev]->weightedInnerProds = false; - parAlmond->numLevels++; - - //copy A matrix and null vector - levels[lev]->A = A; - levels[lev]->A->null = nullA; - - levels[lev]->Nrows = A->Nrows; - levels[lev]->Ncols = A->Ncols; - - - SmoothType smoothType; - int ChebyshevIterations=2; //default to degree 2 - if (options.compareArgs("PARALMOND SMOOTHER", "CHEBYSHEV")) { - smoothType = CHEBYSHEV; - options.getArgs("PARALMOND CHEBYSHEV DEGREE", ChebyshevIterations); - } else { //default to DAMPED_JACOBI - smoothType = DAMPED_JACOBI; - } - levels[lev]->ChebyshevIterations = ChebyshevIterations; - - setupSmoother(parAlmond, levels[lev], smoothType); - - levels[lev]->deviceA = newHYB(parAlmond, levels[lev]->A); - - //set operator callback - void **args = (void **) calloc(2,sizeof(void*)); - args[0] = (void *) parAlmond; - args[1] = (void *) levels[lev]; - - levels[lev]->AxArgs = args; - levels[lev]->smoothArgs = args; - levels[lev]->Ax = agmgAx; - levels[lev]->smooth = agmgSmooth; - levels[lev]->device_Ax = device_agmgAx; - levels[lev]->device_smooth = device_agmgSmooth; - - //copy global partiton - levels[lev]->globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong)); - for (int r=0;rglobalRowStarts[r] = globalRowStarts[r]; - - hlong localSize = (hlong) levels[lev]->A->Nrows; - hlong globalSize = 0; - MPI_Allreduce(&localSize, &globalSize, 1, MPI_HLONG, MPI_SUM, agmg::comm); - - //if the system if already small, dont create MG levels - bool done = false; - if(globalSize <= gCoarseSize){ - setupExactSolve(parAlmond, levels[lev],parAlmond->nullSpace,parAlmond->nullSpacePenalty); - //setupSmoother(parAlmond, levels[lev], smoothType); - done = true; - } - while(!done){ - // create coarse MG level - levels[lev+1] = (agmgLevel *) calloc(1,sizeof(agmgLevel)); - dfloat *nullCoarseA; - - //printf("Setting up coarse level %d\n", lev+1); - - coarsenAgmgLevel(levels[lev], &(levels[lev+1]->A), &(levels[lev+1]->P), - &(levels[lev+1]->R), &nullCoarseA, parAlmond->options); - - //set dimensions of the fine level (max among the A,R ops) - levels[lev]->Ncols = mymax(levels[lev]->Ncols, levels[lev+1]->R->Ncols); - - parAlmond->numLevels++; - - levels[lev+1]->A->null = nullCoarseA; - levels[lev+1]->Nrows = levels[lev+1]->A->Nrows; - levels[lev+1]->Ncols = mymax(levels[lev+1]->A->Ncols, levels[lev+1]->P->Ncols); - levels[lev+1]->globalRowStarts = levels[lev]->globalAggStarts; - - levels[lev+1]->ChebyshevIterations = ChebyshevIterations; - - setupSmoother(parAlmond, levels[lev+1], smoothType); - - levels[lev+1]->deviceA = newHYB (parAlmond, levels[lev+1]->A); - levels[lev+1]->deviceR = newHYB (parAlmond, levels[lev+1]->R); - levels[lev+1]->dcsrP = newDCOO(parAlmond, levels[lev+1]->P); - - //set operator callback - void **args = (void **) calloc(2,sizeof(void*)); - args[0] = (void *) parAlmond; - args[1] = (void *) levels[lev+1]; - - levels[lev+1]->AxArgs = args; - levels[lev+1]->coarsenArgs = args; - levels[lev+1]->prolongateArgs = args; - levels[lev+1]->smoothArgs = args; - - levels[lev+1]->Ax = agmgAx; - levels[lev+1]->coarsen = agmgCoarsen; - levels[lev+1]->prolongate = agmgProlongate; - levels[lev+1]->smooth = agmgSmooth; - - levels[lev+1]->device_Ax = device_agmgAx; - levels[lev+1]->device_coarsen = device_agmgCoarsen; - levels[lev+1]->device_prolongate = device_agmgProlongate; - levels[lev+1]->device_smooth = device_agmgSmooth; - - const hlong localCoarseDim = (hlong) levels[lev+1]->A->Nrows; - hlong globalCoarseSize; - MPI_Allreduce(&localCoarseDim, &globalCoarseSize, 1, MPI_HLONG, MPI_SUM, agmg::comm); - - if(globalCoarseSize <= gCoarseSize || globalSize < 2*globalCoarseSize){ - setupExactSolve(parAlmond, levels[lev+1],parAlmond->nullSpace,parAlmond->nullSpacePenalty); - //setupSmoother(parAlmond, levels[lev+1], smoothType); - break; - } - - globalSize = globalCoarseSize; - lev++; - } - - //allocate vectors required - occa::device device = parAlmond->device; - for (int n=0;nnumLevels;n++) { - dlong N = levels[n]->Nrows; - dlong M = levels[n]->Ncols; - - if ((n>0)&&(nnumLevels)) { //kcycle vectors - if (M) levels[n]->ckp1 = (dfloat *) calloc(M,sizeof(dfloat)); - if (N) levels[n]->vkp1 = (dfloat *) calloc(N,sizeof(dfloat)); - if (N) levels[n]->wkp1 = (dfloat *) calloc(N,sizeof(dfloat)); - - if (M) levels[n]->o_ckp1 = device.malloc(M*sizeof(dfloat),levels[n]->ckp1); - if (N) levels[n]->o_vkp1 = device.malloc(N*sizeof(dfloat),levels[n]->vkp1); - if (N) levels[n]->o_wkp1 = device.malloc(N*sizeof(dfloat),levels[n]->wkp1); - } - if (M) levels[n]->x = (dfloat *) calloc(M,sizeof(dfloat)); - if (M) levels[n]->res = (dfloat *) calloc(M,sizeof(dfloat)); - if (N) levels[n]->rhs = (dfloat *) calloc(N,sizeof(dfloat)); - - if (M) levels[n]->o_x = device.malloc(M*sizeof(dfloat),levels[n]->x); - if (M) levels[n]->o_res = device.malloc(M*sizeof(dfloat),levels[n]->res); - if (N) levels[n]->o_rhs = device.malloc(N*sizeof(dfloat),levels[n]->rhs); - } - //buffer for innerproducts in kcycle - dlong numBlocks = ((levels[0]->Nrows+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD; - parAlmond->rho = (dfloat*) calloc(3*numBlocks,sizeof(dfloat)); - parAlmond->o_rho = device.malloc(3*numBlocks*sizeof(dfloat), parAlmond->rho); -} - -void parAlmondReport(parAlmond_t *parAlmond) { - - int rank, size; - rank = agmg::rank; - size = agmg::size; - - if(rank==0) { - printf("------------------ParAlmond Report-----------------------------------\n"); - printf("---------------------------------------------------------------------\n"); - printf("level| active ranks | dimension | nnzs | nnz/row |\n"); - printf(" | | (min,max,avg) | (min,max,avg) | (min,max,avg) |\n"); - printf("---------------------------------------------------------------------\n"); - } - - for(int lev=0; levnumLevels; lev++){ - - dlong Nrows = parAlmond->levels[lev]->Nrows; - hlong hNrows = (hlong) parAlmond->levels[lev]->Nrows; - - int active = (Nrows>0) ? 1:0; - int totalActive=0; - MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, agmg::comm); - - dlong minNrows=0, maxNrows=0; - hlong totalNrows=0; - dfloat avgNrows; - MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, agmg::comm); - MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, agmg::comm); - avgNrows = (dfloat) totalNrows/totalActive; - - if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min - MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, agmg::comm); - - - long long int nnz; - if (parAlmond->levels[lev]->A) - nnz = parAlmond->levels[lev]->A->diagNNZ+parAlmond->levels[lev]->A->offdNNZ; - else - nnz =0; - long long int minNnz=0, maxNnz=0, totalNnz=0; - dfloat avgNnz; - MPI_Allreduce(&nnz, &maxNnz, 1, MPI_LONG_LONG_INT, MPI_MAX, agmg::comm); - MPI_Allreduce(&nnz, &totalNnz, 1, MPI_LONG_LONG_INT, MPI_SUM, agmg::comm); - avgNnz = (dfloat) totalNnz/totalActive; - - if (nnz==0) nnz = maxNnz; //set this so it's ignored for the global min - MPI_Allreduce(&nnz, &minNnz, 1, MPI_LONG_LONG_INT, MPI_MIN, agmg::comm); - - Nrows = parAlmond->levels[lev]->Nrows; - dfloat nnzPerRow = (Nrows==0) ? 0 : (dfloat) nnz/Nrows; - dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0; - MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, agmg::comm); - MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, agmg::comm); - avgNnzPerRow /= totalActive; - - if (Nrows==0) nnzPerRow = maxNnzPerRow; - MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, agmg::comm); - - if (rank==0){ - printf(" %3d | %4d | %10.2f | %10.2f | %10.2f |\n", - lev, totalActive, (dfloat)minNrows, (dfloat)minNnz, minNnzPerRow); - printf(" | | %10.2f | %10.2f | %10.2f |\n", - (dfloat)maxNrows, (dfloat)maxNnz, maxNnzPerRow); - printf(" | | %10.2f | %10.2f | %10.2f |\n", - avgNrows, avgNnz, avgNnzPerRow); - } - } - if(rank==0) - printf("---------------------------------------------------------------------\n"); -} - - -//create coarsened problem -void coarsenAgmgLevel(agmgLevel *level, csr **coarseA, csr **P, csr **R, dfloat **nullCoarseA, setupAide options){ - - // establish the graph of strong connections - level->threshold = 0.5; - - csr *C = strong_graph(level->A, level->threshold); - - hlong *FineToCoarse = form_aggregates(level, C); - - find_aggregate_owners(level,FineToCoarse,options); - - *P = construct_interpolator(level, FineToCoarse, nullCoarseA); - *R = transpose(level, *P, level->globalRowStarts, level->globalAggStarts); - *coarseA = galerkinProd(level, *R, level->A, *P); -} - -csr * strong_graph(csr *A, dfloat threshold){ - - const dlong N = A->Nrows; - const dlong M = A->Ncols; - - csr *C = (csr *) calloc(1, sizeof(csr)); - - C->Nrows = N; - C->Ncols = M; - - C->diagRowStarts = (dlong *) calloc(N+1,sizeof(dlong)); - C->offdRowStarts = (dlong *) calloc(N+1,sizeof(dlong)); - - dfloat *maxOD; - if (N) maxOD = (dfloat *) calloc(N,sizeof(dfloat)); - - //store the diagonal of A for all needed columns - dfloat *diagA = (dfloat *) calloc(M,sizeof(dfloat)); - for (dlong i=0;idiagCoefs[A->diagRowStarts[i]]; - csrHaloExchange(A, sizeof(dfloat), diagA, A->sendBuffer, diagA+A->NlocalCols); - - #pragma omp parallel for - for(dlong i=0; i= 0) ? 1:-1; - dfloat Aii = fabs(diagA[i]); - - //find maxOD - //local entries - dlong Jstart = A->diagRowStarts[i], Jend = A->diagRowStarts[i+1]; - for(dlong jj= Jstart+1; jjdiagCols[jj]; - dfloat Ajj = fabs(diagA[col]); - dfloat OD = -sign*A->diagCoefs[jj]/(sqrt(Aii)*sqrt(Ajj)); - if(OD > maxOD[i]) maxOD[i] = OD; - } - //non-local entries - Jstart = A->offdRowStarts[i], Jend = A->offdRowStarts[i+1]; - for(dlong jj= Jstart; jjoffdCols[jj]; - dfloat Ajj = fabs(diagA[col]); - dfloat OD = -sign*A->offdCoefs[jj]/(sqrt(Aii)*sqrt(Ajj)); - if(OD > maxOD[i]) maxOD[i] = OD; - } - - int diag_strong_per_row = 1; // diagonal entry - //local entries - Jstart = A->diagRowStarts[i], Jend = A->diagRowStarts[i+1]; - for(dlong jj = Jstart+1; jjdiagCols[jj]; - dfloat Ajj = fabs(diagA[col]); - dfloat OD = -sign*A->diagCoefs[jj]/(sqrt(Aii)*sqrt(Ajj)); - if(OD > threshold*maxOD[i]) diag_strong_per_row++; - } - int offd_strong_per_row = 0; - //non-local entries - Jstart = A->offdRowStarts[i], Jend = A->offdRowStarts[i+1]; - for(dlong jj= Jstart; jjoffdCols[jj]; - dfloat Ajj = fabs(diagA[col]); - dfloat OD = -sign*A->offdCoefs[jj]/(sqrt(Aii)*sqrt(Ajj)); - if(OD > threshold*maxOD[i]) offd_strong_per_row++; - } - - C->diagRowStarts[i+1] = diag_strong_per_row; - C->offdRowStarts[i+1] = offd_strong_per_row; - } - - // cumulative sum - for(dlong i=1; idiagRowStarts[i] += C->diagRowStarts[i-1]; - C->offdRowStarts[i] += C->offdRowStarts[i-1]; - } - - C->diagNNZ = C->diagRowStarts[N]; - C->offdNNZ = C->offdRowStarts[N]; - - if (C->diagNNZ) C->diagCols = (dlong *) calloc(C->diagNNZ, sizeof(dlong)); - if (C->offdNNZ) C->offdCols = (dlong *) calloc(C->offdNNZ, sizeof(dlong)); - - // fill in the columns for strong connections - #pragma omp parallel for - for(dlong i=0; i= 0) ? 1:-1; - dfloat Aii = fabs(diagA[i]); - - dlong diagCounter = C->diagRowStarts[i]; - dlong offdCounter = C->offdRowStarts[i]; - - //local entries - C->diagCols[diagCounter++] = i;// diag entry - dlong Jstart = A->diagRowStarts[i], Jend = A->diagRowStarts[i+1]; - for(dlong jj = Jstart+1; jjdiagCols[jj]; - dfloat Ajj = fabs(diagA[col]); - dfloat OD = -sign*A->diagCoefs[jj]/(sqrt(Aii)*sqrt(Ajj)); - if(OD > threshold*maxOD[i]) - C->diagCols[diagCounter++] = A->diagCols[jj]; - } - Jstart = A->offdRowStarts[i], Jend = A->offdRowStarts[i+1]; - for(dlong jj = Jstart; jjoffdCols[jj]; - dfloat Ajj = fabs(diagA[col]); - dfloat OD = -sign*A->offdCoefs[jj]/(sqrt(Aii)*sqrt(Ajj)); - if(OD > threshold*maxOD[i]) - C->offdCols[offdCounter++] = A->offdCols[jj]; - } - } - if(N) free(maxOD); - - return C; -} - -bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong i){ - - if(s > smax) return true; - if(smax > s) return false; - - if(r > rmax) return true; - if(rmax > r) return false; - - if(i > imax) return true; - if(i < imax) return false; - - return false; -} - -hlong * form_aggregates(agmgLevel *level, csr *C){ - - int rank, size; - rank = agmg::rank; - size = agmg::size; - - const dlong N = C->Nrows; - const dlong M = C->Ncols; - const dlong diagNNZ = C->diagNNZ; - const dlong offdNNZ = C->offdNNZ; - - hlong *FineToCoarse = (hlong *) calloc(M, sizeof(hlong)); - for (dlong i =0;iA; - hlong *globalRowStarts = level->globalRowStarts; - - int *intSendBuffer; - hlong *hlongSendBuffer; - dfloat *dfloatSendBuffer; - if (level->A->NsendTotal) { - intSendBuffer = (int *) calloc(A->NsendTotal,sizeof(int)); - hlongSendBuffer = (hlong *) calloc(A->NsendTotal,sizeof(hlong)); - dfloatSendBuffer = (dfloat *) calloc(A->NsendTotal,sizeof(dfloat)); - } - - for(dlong i=0; idiagCols[i]] += 1.; - - int *nnzCnt, *recvNnzCnt; - if (A->NHalo) nnzCnt = (int *) calloc(A->NHalo,sizeof(int)); - if (A->NsendTotal) recvNnzCnt = (int *) calloc(A->NsendTotal,sizeof(int)); - - //count the non-local non-zeros - for (dlong i=0;ioffdCols[i]-A->NlocalCols]++; - - //do a reverse halo exchange - int tag = 999; - - // initiate immediate send and receives to each other process as needed - dlong recvOffset = 0; - dlong sendOffset = 0; - int sendMessage = 0, recvMessage = 0; - for(int r=0;rNsendTotal) { - if(A->NsendPairs[r]) { - MPI_Irecv(recvNnzCnt+sendOffset, A->NsendPairs[r], MPI_INT, r, tag, - agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage); - sendOffset += A->NsendPairs[r]; - ++sendMessage; - } - } - if (A->NrecvTotal) { - if(A->NrecvPairs[r]){ - MPI_Isend(nnzCnt+recvOffset, A->NrecvPairs[r], MPI_INT, r, tag, - agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage); - recvOffset += A->NrecvPairs[r]; - ++recvMessage; - } - } - } - - // Wait for all sent messages to have left and received messages to have arrived - if (A->NrecvTotal) { - MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus); - free(sendStatus); - } - if (A->NsendTotal) { - MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status)); - MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus); - free(recvStatus); - } - - for(int i=0;iNsendTotal;++i){ - // local index of outgoing element in halo exchange - dlong id = A->haloElementList[i]; - - rands[id] += recvNnzCnt[i]; - } - - if (A->NHalo) free(nnzCnt); - if (A->NsendTotal) free(recvNnzCnt); - - //share randomizer values - csrHaloExchange(A, sizeof(dfloat), rands, dfloatSendBuffer, rands+A->NlocalCols); - - - - hlong done = 0; - while(!done){ - // first neighbours - #pragma omp parallel for - for(dlong i=0; idiagRowStarts[i]+1;jjdiagRowStarts[i+1];jj++){ - const dlong col = C->diagCols[jj]; - if(customLess(smax, rmax, imax, states[col], rands[col], col + globalRowStarts[rank])){ - smax = states[col]; - rmax = rands[col]; - imax = col + globalRowStarts[rank]; - } - } - //nonlocal entries - for(dlong jj=C->offdRowStarts[i];jjoffdRowStarts[i+1];jj++){ - const dlong col = C->offdCols[jj]; - if(customLess(smax, rmax, imax, states[col], rands[col], A->colMap[col])) { - smax = states[col]; - rmax = rands[col]; - imax = A->colMap[col]; - } - } - } - Ts[i] = smax; - Tr[i] = rmax; - Ti[i] = imax; - } - - //share results - csrHaloExchange(A, sizeof(dfloat), Tr, dfloatSendBuffer, Tr+A->NlocalCols); - csrHaloExchange(A, sizeof(int), Ts, intSendBuffer, Ts+A->NlocalCols); - csrHaloExchange(A, sizeof(hlong), Ti, hlongSendBuffer, Ti+A->NlocalCols); - - // second neighbours - #pragma omp parallel for - for(dlong i=0; idiagRowStarts[i]+1;jjdiagRowStarts[i+1];jj++){ - const dlong col = C->diagCols[jj]; - if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){ - smax = Ts[col]; - rmax = Tr[col]; - imax = Ti[col]; - } - } - //nonlocal entries - for(dlong jj=C->offdRowStarts[i];jjoffdRowStarts[i+1];jj++){ - const dlong col = C->offdCols[jj]; - if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){ - smax = Ts[col]; - rmax = Tr[col]; - imax = Ti[col]; - } - } - - // if I am the strongest among all the 1 and 2 ring neighbours - // I am an MIS node - if((states[i] == 0) && (imax == (i + globalRowStarts[rank]))) - states[i] = 1; - - // if there is an MIS node within distance 2, I am removed - if((states[i] == 0) && (smax == 1)) - states[i] = -1; - } - - csrHaloExchange(A, sizeof(int), states, intSendBuffer, states+A->NlocalCols); - - // if number of undecided nodes = 0, algorithm terminates - hlong cnt = std::count(states, states+N, 0); - MPI_Allreduce(&cnt,&done,1,MPI_HLONG, MPI_SUM,agmg::comm); - done = (done == 0) ? 1 : 0; - } - - dlong numAggs = 0; - dlong *gNumAggs = (dlong *) calloc(size,sizeof(dlong)); - level->globalAggStarts = (hlong *) calloc(size+1,sizeof(hlong)); - // count the coarse nodes/aggregates - for(dlong i=0; iglobalAggStarts[0] = 0; - for (int r=0;rglobalAggStarts[r+1] = level->globalAggStarts[r] + gNumAggs[r]; - - numAggs = 0; - // enumerate the coarse nodes/aggregates - for(dlong i=0; iglobalAggStarts[rank] + numAggs++; - - //share the initial aggregate flags - csrHaloExchange(A, sizeof(hlong), FineToCoarse, hlongSendBuffer, FineToCoarse+A->NlocalCols); - - // form the aggregates - #pragma omp parallel for - for(dlong i=0; idiagRowStarts[i]+1;jjdiagRowStarts[i+1];jj++){ - const dlong col = C->diagCols[jj]; - if(customLess(smax, rmax, imax, states[col], rands[col], col + globalRowStarts[rank])){ - smax = states[col]; - rmax = rands[col]; - imax = col + globalRowStarts[rank]; - cmax = FineToCoarse[col]; - } - } - //nonlocal entries - for(dlong jj=C->offdRowStarts[i];jjoffdRowStarts[i+1];jj++){ - const dlong col = C->offdCols[jj]; - if(customLess(smax, rmax, imax, states[col], rands[col], A->colMap[col])){ - smax = states[col]; - rmax = rands[col]; - imax = A->colMap[col]; - cmax = FineToCoarse[col]; - } - } - } - Ts[i] = smax; - Tr[i] = rmax; - Ti[i] = imax; - Tc[i] = cmax; - - if((states[i] == -1) && (smax == 1) && (cmax > -1)) - FineToCoarse[i] = cmax; - } - - csrHaloExchange(A, sizeof(hlong), FineToCoarse, hlongSendBuffer, FineToCoarse+A->NlocalCols); - csrHaloExchange(A, sizeof(dfloat), Tr, dfloatSendBuffer, Tr+A->NlocalCols); - csrHaloExchange(A, sizeof(int), Ts, intSendBuffer, Ts+A->NlocalCols); - csrHaloExchange(A, sizeof(hlong), Ti, hlongSendBuffer, Ti+A->NlocalCols); - csrHaloExchange(A, sizeof(hlong), Tc, hlongSendBuffer, Tc+A->NlocalCols); - - // second neighbours - #pragma omp parallel for - for(dlong i=0; idiagRowStarts[i]+1;jjdiagRowStarts[i+1];jj++){ - const dlong col = C->diagCols[jj]; - if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){ - smax = Ts[col]; - rmax = Tr[col]; - imax = Ti[col]; - cmax = Tc[col]; - } - } - //nonlocal entries - for(dlong jj=C->offdRowStarts[i];jjoffdRowStarts[i+1];jj++){ - const dlong col = C->offdCols[jj]; - if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){ - smax = Ts[col]; - rmax = Tr[col]; - imax = Ti[col]; - cmax = Tc[col]; - } - } - - if((states[i] == -1) && (smax == 1) && (cmax > -1)) - FineToCoarse[i] = cmax; - } - - csrHaloExchange(A, sizeof(hlong), FineToCoarse, hlongSendBuffer, FineToCoarse+A->NlocalCols); - - free(rands); - free(states); - free(Tr); - free(Ts); - free(Ti); - free(Tc); - if (level->A->NsendTotal) { - free(intSendBuffer); - free(hlongSendBuffer); - free(dfloatSendBuffer); - } - - //TODO maybe free C here? - - return FineToCoarse; -} - -typedef struct { - - dlong fineId; - hlong coarseId; - hlong newCoarseId; - - int originRank; - int ownerRank; - -} parallelAggregate_t; - -int compareOwner(const void *a, const void *b){ - parallelAggregate_t *pa = (parallelAggregate_t *) a; - parallelAggregate_t *pb = (parallelAggregate_t *) b; - - if (pa->ownerRank < pb->ownerRank) return -1; - if (pa->ownerRank > pb->ownerRank) return +1; - - return 0; -}; - -int compareAgg(const void *a, const void *b){ - parallelAggregate_t *pa = (parallelAggregate_t *) a; - parallelAggregate_t *pb = (parallelAggregate_t *) b; - - if (pa->coarseId < pb->coarseId) return -1; - if (pa->coarseId > pb->coarseId) return +1; - - if (pa->originRank < pb->originRank) return -1; - if (pa->originRank > pb->originRank) return +1; - - return 0; -}; - -int compareOrigin(const void *a, const void *b){ - parallelAggregate_t *pa = (parallelAggregate_t *) a; - parallelAggregate_t *pb = (parallelAggregate_t *) b; - - if (pa->originRank < pb->originRank) return -1; - if (pa->originRank > pb->originRank) return +1; - - return 0; -}; - -void find_aggregate_owners(agmgLevel *level, hlong* FineToCoarse, setupAide options) { - // MPI info - int rank, size; - rank = agmg::rank; - size = agmg::size; - - dlong N = level->A->Nrows; - - //Need to establish 'ownership' of aggregates - - //Keep the current partitioning for STRONGNODES. - // The rank that had the strong node for each aggregate owns the aggregate - if (options.compareArgs("PARALMOND PARTITION", "STRONGNODES")) return; - - //populate aggregate array - hlong gNumAggs = level->globalAggStarts[size]; //total number of aggregates - - parallelAggregate_t *sendAggs; - if (N) - sendAggs = (parallelAggregate_t *) calloc(N,sizeof(parallelAggregate_t)); - else - sendAggs = (parallelAggregate_t *) calloc(1,sizeof(parallelAggregate_t)); - - for (dlong i=0;imaxEntries) { - ownerRank = r; - maxEntries = rankCounts[r]; - } - } - - //set this aggregate's owner - for (dlong i=aggStarts[n];iglobalAggStarts[0] = 0; - for (int r=0;rglobalAggStarts[r+1] = level->globalAggStarts[r] + lNumAggs[r]; - - //set the new global coarse index - cnt = level->globalAggStarts[rank]; - if (newRecvNtotal) newRecvAggs[0].newCoarseId = cnt; - for (dlong i=1;iA->Nrows; - // const dlong M = level->A->Ncols; - - hlong *globalAggStarts = level->globalAggStarts; - - const hlong globalAggOffset = level->globalAggStarts[rank]; - const dlong NCoarse = (dlong) (globalAggStarts[rank+1]-globalAggStarts[rank]); //local num agg - - csr* P = (csr *) calloc(1, sizeof(csr)); - - P->Nrows = N; - P->Ncols = NCoarse; - - P->NlocalCols = NCoarse; - P->NHalo = 0; - - P->diagRowStarts = (dlong *) calloc(N+1, sizeof(dlong)); - P->offdRowStarts = (dlong *) calloc(N+1, sizeof(dlong)); - - // each row has exactly one nonzero per row - P->diagNNZ =0; - P->offdNNZ =0; - for(dlong i=0; iglobalAggOffset-1)&&(coldiagNNZ++; - P->diagRowStarts[i+1]++; - } else { - P->offdNNZ++; - P->offdRowStarts[i+1]++; - } - } - for(dlong i=0; idiagRowStarts[i+1] += P->diagRowStarts[i]; - P->offdRowStarts[i+1] += P->offdRowStarts[i]; - } - - if (P->diagNNZ) { - P->diagCols = (dlong *) calloc(P->diagNNZ, sizeof(dlong)); - P->diagCoefs = (dfloat *) calloc(P->diagNNZ, sizeof(dfloat)); - } - hlong *offdCols; - if (P->offdNNZ) { - offdCols = (hlong *) calloc(P->offdNNZ, sizeof(hlong)); - P->offdCols = (dlong *) calloc(P->offdNNZ, sizeof(dlong)); - P->offdCoefs = (dfloat *) calloc(P->offdNNZ, sizeof(dfloat)); - } - - dlong diagCnt = 0; - dlong offdCnt = 0; - for(dlong i=0; iglobalAggStarts[rank]-1)&&(coldiagCols[diagCnt] = (dlong) (col - globalAggOffset); //local index - P->diagCoefs[diagCnt++] = level->A->null[i]; - } else { - offdCols[offdCnt] = col; - P->offdCoefs[offdCnt++] = level->A->null[i]; - } - } - - //record global indexing of columns - P->colMap = (hlong *) calloc(P->Ncols, sizeof(hlong)); - for (dlong i=0;iNcols;i++) - P->colMap[i] = i + globalAggOffset; - - if (P->offdNNZ) { - //we now need to reorder the x vector for the halo, and shift the column indices - hlong *col = (hlong *) calloc(P->offdNNZ,sizeof(hlong)); - for (dlong i=0;ioffdNNZ;i++) - col[i] = offdCols[i]; //copy non-local column global ids - - //sort by global index - std::sort(col,col+P->offdNNZ); - - //count unique non-local column ids - P->NHalo = 0; - for (dlong i=1;ioffdNNZ;i++) - if (col[i]!=col[i-1]) col[++P->NHalo] = col[i]; - P->NHalo++; //number of unique columns - - P->Ncols += P->NHalo; - - //save global column ids in colMap - P->colMap = (hlong *) realloc(P->colMap, P->Ncols*sizeof(hlong)); - for (dlong i=0; iNHalo; i++) - P->colMap[i+P->NlocalCols] = col[i]; - free(col); - - //shift the column indices to local indexing - for (dlong i=0;ioffdNNZ;i++) { - hlong gcol = offdCols[i]; - for (dlong m=P->NlocalCols;mNcols;m++) { - if (gcol == P->colMap[m]) - P->offdCols[i] = m; - } - } - free(offdCols); - } - - csrHaloSetup(P,globalAggStarts); - - // normalize the columns of P - *nullCoarseA = (dfloat *) calloc(P->Ncols,sizeof(dfloat)); - - //add local nonzeros - for(dlong i=0; idiagNNZ; i++) - (*nullCoarseA)[P->diagCols[i]] += P->diagCoefs[i] * P->diagCoefs[i]; - - dfloat *nnzSum, *recvNnzSum; - if (P->NHalo) nnzSum = (dfloat *) calloc(P->NHalo,sizeof(dfloat)); - if (P->NsendTotal) recvNnzSum = (dfloat *) calloc(P->NsendTotal,sizeof(dfloat)); - - //add the non-local non-zeros - for (dlong i=0;ioffdNNZ;i++) - nnzSum[P->offdCols[i]-P->NlocalCols] += P->offdCoefs[i] * P->offdCoefs[i]; - - //do a reverse halo exchange - int tag = 999; - - // initiate immediate send and receives to each other process as needed - dlong recvOffset = 0; - dlong sendOffset = 0; - int sendMessage = 0, recvMessage = 0; - for(int r=0;rNsendTotal) { - if(P->NsendPairs[r]) { - MPI_Irecv(recvNnzSum+sendOffset, P->NsendPairs[r], MPI_DFLOAT, r, tag, - agmg::comm, (MPI_Request*)P->haloSendRequests+sendMessage); - sendOffset += P->NsendPairs[r]; - ++sendMessage; - } - } - if (P->NrecvTotal) { - if(P->NrecvPairs[r]){ - MPI_Isend(nnzSum+recvOffset, P->NrecvPairs[r], MPI_DFLOAT, r, tag, - agmg::comm, (MPI_Request*)P->haloRecvRequests+recvMessage); - recvOffset += P->NrecvPairs[r]; - ++recvMessage; - } - } - } - - // Wait for all sent messages to have left and received messages to have arrived - if (P->NrecvTotal) { - MPI_Status *sendStatus = (MPI_Status*) calloc(P->NsendMessages, sizeof(MPI_Status)); - MPI_Waitall(P->NsendMessages, (MPI_Request*)P->haloSendRequests, sendStatus); - free(sendStatus); - } - if (P->NsendTotal) { - MPI_Status *recvStatus = (MPI_Status*) calloc(P->NrecvMessages, sizeof(MPI_Status)); - MPI_Waitall(P->NrecvMessages, (MPI_Request*)P->haloRecvRequests, recvStatus); - free(recvStatus); - } - - for(dlong i=0;iNsendTotal;++i){ - // local index of outgoing element in halo exchange - dlong id = P->haloElementList[i]; - - (*nullCoarseA)[id] += recvNnzSum[i]; - } - - if (P->NHalo) free(nnzSum); - - for(dlong i=0; isendBuffer, *nullCoarseA+P->NlocalCols); - - for(dlong i=0; idiagNNZ; i++) - P->diagCoefs[i] /= (*nullCoarseA)[P->diagCols[i]]; - for(dlong i=0; ioffdNNZ; i++) - P->offdCoefs[i] /= (*nullCoarseA)[P->offdCols[i]]; - - MPI_Barrier(agmg::comm); - if (P->NsendTotal) free(recvNnzSum); - - return P; -} - -typedef struct { - - hlong row; - hlong col; - dfloat val; - int owner; - -} nonzero_t; - -int compareNonZero(const void *a, const void *b){ - nonzero_t *pa = (nonzero_t *) a; - nonzero_t *pb = (nonzero_t *) b; - - if (pa->owner < pb->owner) return -1; - if (pa->owner > pb->owner) return +1; - - if (pa->row < pb->row) return -1; - if (pa->row > pb->row) return +1; - - if (pa->col < pb->col) return -1; - if (pa->col > pb->col) return +1; - - return 0; -}; - -csr * transpose(agmgLevel* level, csr *A, - hlong *globalRowStarts, hlong *globalColStarts){ - - // MPI info - int rank, size; - rank = agmg::rank; - size = agmg::size; - - csr *At = (csr *) calloc(1,sizeof(csr)); - - At->Nrows = A->Ncols-A->NHalo; - At->Ncols = A->Nrows; - At->diagNNZ = A->diagNNZ; //local entries remain local - - At->NlocalCols = At->Ncols; - - At->diagRowStarts = (dlong *) calloc(At->Nrows+1, sizeof(dlong)); - At->offdRowStarts = (dlong *) calloc(At->Nrows+1, sizeof(dlong)); - - //start with local entries - if (A->diagNNZ) { - At->diagCols = (dlong *) calloc(At->diagNNZ, sizeof(dlong)); - At->diagCoefs = (dfloat *) calloc(At->diagNNZ, sizeof(dfloat)); - } - - // count the num of nonzeros per row for transpose - for(dlong i=0; idiagNNZ; i++){ - dlong row = A->diagCols[i]; - At->diagRowStarts[row+1]++; - } - - // cumulative sum for rows - for(dlong i=1; i<=At->Nrows; i++) - At->diagRowStarts[i] += At->diagRowStarts[i-1]; - - int *counter = (int *) calloc(At->Nrows+1,sizeof(int)); - for (dlong i=0; iNrows+1; i++) - counter[i] = At->diagRowStarts[i]; - - for(dlong i=0; iNrows; i++){ - const dlong Jstart = A->diagRowStarts[i], Jend = A->diagRowStarts[i+1]; - - for(dlong jj=Jstart; jjdiagCols[jj]; - At->diagCols[counter[row]] = i; - At->diagCoefs[counter[row]] = A->diagCoefs[jj]; - - counter[row]++; - } - } - free(counter); - - //record global indexing of columns - At->colMap = (hlong *) calloc(At->Ncols, sizeof(hlong)); - for (dlong i=0;iNcols;i++) - At->colMap[i] = i + globalRowStarts[rank]; - - //now the nonlocal entries. Need to reverse the halo exchange to send the nonzeros - int tag = 999; - - nonzero_t *sendNonZeros; - if (A->offdNNZ) - sendNonZeros = (nonzero_t *) calloc(A->offdNNZ,sizeof(nonzero_t)); - - int *Nsend = (int*) calloc(size, sizeof(int)); - int *Nrecv = (int*) calloc(size, sizeof(int)); - - for(int r=0;rNrows;++i){ - for (dlong j=A->offdRowStarts[i];joffdRowStarts[i+1];j++) { - hlong col = A->colMap[A->offdCols[j]]; //global ids - for (int r=0;roffdCoefs[j]; - } - } - - //sort outgoing nonzeros by owner, then row and col - if (A->offdNNZ) - qsort(sendNonZeros, A->offdNNZ, sizeof(nonzero_t), compareNonZero); - - MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, agmg::comm); - - //count incoming nonzeros - At->offdNNZ = 0; - for (int r=0;roffdNNZ += Nrecv[r]; - - nonzero_t *recvNonZeros; - if (At->offdNNZ) - recvNonZeros = (nonzero_t *) calloc(At->offdNNZ,sizeof(nonzero_t)); - - // initiate immediate send and receives to each other process as needed - int recvOffset = 0; - int sendOffset = 0; - int sendMessage = 0, recvMessage = 0; - for(int r=0;roffdNNZ) { - if(Nrecv[r]) { - MPI_Irecv(((char*)recvNonZeros)+recvOffset, Nrecv[r]*sizeof(nonzero_t), - MPI_CHAR, r, tag, agmg::comm, - (MPI_Request*)A->haloSendRequests+recvMessage); - recvOffset += Nrecv[r]*sizeof(nonzero_t); - ++recvMessage; - } - } - if (A->offdNNZ) { - if(Nsend[r]){ - MPI_Isend(((char*)sendNonZeros)+sendOffset, Nsend[r]*sizeof(nonzero_t), - MPI_CHAR, r, tag, agmg::comm, - (MPI_Request*)A->haloRecvRequests+sendMessage); - sendOffset += Nsend[r]*sizeof(nonzero_t); - ++sendMessage; - } - } - } - - // Wait for all sent messages to have left and received messages to have arrived - if (A->offdNNZ) { - MPI_Status *sendStatus = (MPI_Status*) calloc(sendMessage, sizeof(MPI_Status)); - MPI_Waitall(sendMessage, (MPI_Request*)A->haloRecvRequests, sendStatus); - free(sendStatus); - } - if (At->offdNNZ) { - MPI_Status *recvStatus = (MPI_Status*) calloc(recvMessage, sizeof(MPI_Status)); - MPI_Waitall(recvMessage, (MPI_Request*)A->haloSendRequests, recvStatus); - free(recvStatus); - } - if (A->offdNNZ) free(sendNonZeros); - - //free(Nsend); free(Nrecv); - - if (At->offdNNZ) { - //sort recieved nonzeros by row and col - qsort(recvNonZeros, At->offdNNZ, sizeof(nonzero_t), compareNonZero); - - hlong *offdCols = (hlong *) calloc(At->offdNNZ,sizeof(hlong)); - At->offdCols = (dlong *) calloc(At->offdNNZ,sizeof(dlong)); - At->offdCoefs = (dfloat *) calloc(At->offdNNZ, sizeof(dfloat)); - - //find row starts - for(dlong n=0;noffdNNZ;++n) { - dlong row = (dlong) (recvNonZeros[n].row - globalColStarts[rank]); - At->offdRowStarts[row+1]++; - } - //cumulative sum - for (dlong i=0;iNrows;i++) - At->offdRowStarts[i+1] += At->offdRowStarts[i]; - - //fill cols and coefs - for (dlong i=0; iNrows; i++) { - for (dlong j=At->offdRowStarts[i]; joffdRowStarts[i+1]; j++) { - offdCols[j] = recvNonZeros[j].col; - At->offdCoefs[j] = recvNonZeros[j].val; - } - } - free(recvNonZeros); - - //we now need to reorder the x vector for the halo, and shift the column indices - hlong *col = (hlong *) calloc(At->offdNNZ,sizeof(hlong)); - for (dlong n=0;noffdNNZ;n++) - col[n] = offdCols[n]; //copy non-local column global ids - - //sort by global index - std::sort(col,col+At->offdNNZ); - - //count unique non-local column ids - At->NHalo = 0; - for (dlong n=1;noffdNNZ;n++) - if (col[n]!=col[n-1]) col[++At->NHalo] = col[n]; - At->NHalo++; //number of unique columns - - At->Ncols += At->NHalo; - - //save global column ids in colMap - At->colMap = (hlong *) realloc(At->colMap,At->Ncols*sizeof(hlong)); - for (dlong n=0; nNHalo; n++) - At->colMap[n+At->NlocalCols] = col[n]; - free(col); - - //shift the column indices to local indexing - for (dlong n=0;noffdNNZ;n++) { - hlong gcol = offdCols[n]; - for (dlong m=At->NlocalCols;mNcols;m++) { - if (gcol == At->colMap[m]) - At->offdCols[n] = m; - } - } - free(offdCols); - } - - csrHaloSetup(At,globalRowStarts); - - return At; -} - -typedef struct { - - hlong coarseId; - dfloat coef; - -} pEntry_t; - -typedef struct { - - hlong I; - hlong J; - dfloat coef; - -} rapEntry_t; - -int compareRAPEntries(const void *a, const void *b){ - rapEntry_t *pa = (rapEntry_t *) a; - rapEntry_t *pb = (rapEntry_t *) b; - - if (pa->I < pb->I) return -1; - if (pa->I > pb->I) return +1; - - if (pa->J < pb->J) return -1; - if (pa->J > pb->J) return +1; - - return 0; -}; - -csr *galerkinProd(agmgLevel *level, csr *R, csr *A, csr *P){ - - // MPI info - int rank, size; - rank = agmg::rank; - size = agmg::size; - - hlong *globalAggStarts = level->globalAggStarts; - // hlong *globalRowStarts = level->globalRowStarts; - - hlong globalAggOffset = globalAggStarts[rank]; - - //The galerkin product can be computed as - // (RAP)_IJ = sum_{i in Agg_I} sum_{j in Agg_j} P_iI A_ij P_jJ - // Since each row of P has only one entry, we can share the ncessary - // P entries, form the products, and send them to their destination rank - - dlong N = A->Nrows; - dlong M = A->Ncols; - - //printf("Level has %d rows, and is making %d aggregates\n", N, globalAggStarts[rank+1]-globalAggStarts[rank]); - - pEntry_t *PEntries; - if (M) - PEntries = (pEntry_t *) calloc(M,sizeof(pEntry_t)); - else - PEntries = (pEntry_t *) calloc(1,sizeof(pEntry_t)); - - //record the entries of P that this rank has - dlong cnt =0; - for (dlong i=0;idiagRowStarts[i];jdiagRowStarts[i+1];j++) { - PEntries[cnt].coarseId = P->diagCols[j] + globalAggOffset; //global ID - PEntries[cnt].coef = P->diagCoefs[j]; - cnt++; - } - for (dlong j=P->offdRowStarts[i];joffdRowStarts[i+1];j++) { - PEntries[cnt].coarseId = P->colMap[P->offdCols[j]]; //global ID - PEntries[cnt].coef = P->offdCoefs[j]; - cnt++; - } - } - - pEntry_t *entrySendBuffer; - if (A->NsendTotal) - entrySendBuffer = (pEntry_t *) calloc(A->NsendTotal,sizeof(pEntry_t)); - - //fill in the entires of P needed in the halo - csrHaloExchange(A, sizeof(pEntry_t), PEntries, entrySendBuffer, PEntries+A->NlocalCols); - if (A->NsendTotal) free(entrySendBuffer); - - rapEntry_t *RAPEntries; - dlong totalNNZ = A->diagNNZ+A->offdNNZ; - if (totalNNZ) - RAPEntries = (rapEntry_t *) calloc(totalNNZ,sizeof(rapEntry_t)); - else - RAPEntries = (rapEntry_t *) calloc(1,sizeof(rapEntry_t)); //MPI_AlltoAll doesnt like null pointers - - // Make the MPI_RAPENTRY_T data type - MPI_Datatype MPI_RAPENTRY_T; - MPI_Datatype dtype[3] = {MPI_HLONG, MPI_HLONG, MPI_DFLOAT}; - int blength[3] = {1, 1, 1}; - MPI_Aint addr[3], displ[3]; - MPI_Get_address ( &(RAPEntries[0] ), addr+0); - MPI_Get_address ( &(RAPEntries[0].J ), addr+1); - MPI_Get_address ( &(RAPEntries[0].coef), addr+2); - displ[0] = 0; - displ[1] = addr[1] - addr[0]; - displ[2] = addr[2] - addr[0]; - MPI_Type_create_struct (3, blength, displ, dtype, &MPI_RAPENTRY_T); - MPI_Type_commit (&MPI_RAPENTRY_T); - - //for the RAP products - cnt =0; - for (dlong i=0;idiagRowStarts[i];jdiagRowStarts[i+1];j++) { - dlong col = A->diagCols[j]; - dfloat coef = A->diagCoefs[j]; - - RAPEntries[cnt].I = PEntries[i].coarseId; - RAPEntries[cnt].J = PEntries[col].coarseId; - RAPEntries[cnt].coef = coef*PEntries[i].coef*PEntries[col].coef; - cnt++; - } - } - for (dlong i=0;ioffdRowStarts[i];joffdRowStarts[i+1];j++) { - dlong col = A->offdCols[j]; - dfloat coef = A->offdCoefs[j]; - - RAPEntries[cnt].I = PEntries[i].coarseId; - RAPEntries[cnt].J = PEntries[col].coarseId; - RAPEntries[cnt].coef = PEntries[i].coef*coef*PEntries[col].coef; - cnt++; - } - } - - //sort entries by the coarse row and col - if (totalNNZ) qsort(RAPEntries, totalNNZ, sizeof(rapEntry_t), compareRAPEntries); - - int *sendCounts = (int *) calloc(size,sizeof(int)); - int *recvCounts = (int *) calloc(size,sizeof(int)); - int *sendOffsets = (int *) calloc(size+1,sizeof(int)); - int *recvOffsets = (int *) calloc(size+1,sizeof(int)); - - for(dlong i=0;iNrows = numAggs; - RAP->Ncols = numAggs; - - RAP->NlocalCols = numAggs; - - RAP->diagRowStarts = (dlong *) calloc(numAggs+1, sizeof(dlong)); - RAP->offdRowStarts = (dlong *) calloc(numAggs+1, sizeof(dlong)); - - for (dlong n=0;n globalAggStarts[rank]-1)&& - (newRAPEntries[n].J < globalAggStarts[rank+1])) { - RAP->diagRowStarts[row+1]++; - } else { - RAP->offdRowStarts[row+1]++; - } - } - - // cumulative sum - for(dlong i=0; idiagRowStarts[i+1] += RAP->diagRowStarts[i]; - RAP->offdRowStarts[i+1] += RAP->offdRowStarts[i]; - } - RAP->diagNNZ = RAP->diagRowStarts[numAggs]; - RAP->offdNNZ = RAP->offdRowStarts[numAggs]; - - dlong *diagCols; - dfloat *diagCoefs; - if (RAP->diagNNZ) { - RAP->diagCols = (dlong *) calloc(RAP->diagNNZ, sizeof(dlong)); - RAP->diagCoefs = (dfloat *) calloc(RAP->diagNNZ, sizeof(dfloat)); - diagCols = (dlong *) calloc(RAP->diagNNZ, sizeof(dlong)); - diagCoefs = (dfloat *) calloc(RAP->diagNNZ, sizeof(dfloat)); - } - hlong *offdCols; - if (RAP->offdNNZ) { - offdCols = (hlong *) calloc(RAP->offdNNZ,sizeof(hlong)); - RAP->offdCols = (dlong *) calloc(RAP->offdNNZ,sizeof(dlong)); - RAP->offdCoefs = (dfloat *) calloc(RAP->offdNNZ, sizeof(dfloat)); - } - - dlong diagCnt =0; - dlong offdCnt =0; - for (dlong n=0;n globalAggStarts[rank]-1)&& - (newRAPEntries[n].J < globalAggStarts[rank+1])) { - diagCols[diagCnt] = (dlong) (newRAPEntries[n].J - globalAggOffset); - diagCoefs[diagCnt] = newRAPEntries[n].coef; - diagCnt++; - } else { - offdCols[offdCnt] = newRAPEntries[n].J; - RAP->offdCoefs[offdCnt] = newRAPEntries[n].coef; - offdCnt++; - } - } - - //move diagonal entries first - for (dlong i=0;iNrows;i++) { - dlong start = RAP->diagRowStarts[i]; - int cnt = 1; - for (dlong j=RAP->diagRowStarts[i]; jdiagRowStarts[i+1]; j++) { - if (diagCols[j] == i) { //move diagonal to first entry - RAP->diagCols[start] = diagCols[j]; - RAP->diagCoefs[start] = diagCoefs[j]; - } else { - RAP->diagCols[start+cnt] = diagCols[j]; - RAP->diagCoefs[start+cnt] = diagCoefs[j]; - cnt++; - } - } - } - - //record global indexing of columns - RAP->colMap = (hlong *) calloc(RAP->Ncols, sizeof(hlong)); - for (dlong i=0;iNcols;i++) - RAP->colMap[i] = i + globalAggOffset; - - if (RAP->offdNNZ) { - //we now need to reorder the x vector for the halo, and shift the column indices - hlong *col = (hlong *) calloc(RAP->offdNNZ,sizeof(hlong)); - for (dlong n=0;noffdNNZ;n++) - col[n] = offdCols[n]; //copy non-local column global ids - - //sort by global index - std::sort(col,col+RAP->offdNNZ); - - //count unique non-local column ids - RAP->NHalo = 0; - for (dlong n=1;noffdNNZ;n++) - if (col[n]!=col[n-1]) col[++RAP->NHalo] = col[n]; - RAP->NHalo++; //number of unique columns - - RAP->Ncols += RAP->NHalo; - - //save global column ids in colMap - RAP->colMap = (hlong *) realloc(RAP->colMap,RAP->Ncols*sizeof(hlong)); - for (dlong n=0; nNHalo; n++) - RAP->colMap[n+RAP->NlocalCols] = col[n]; - - //shift the column indices to local indexing - for (dlong n=0;noffdNNZ;n++) { - hlong gcol = offdCols[n]; - for (dlong m=RAP->NlocalCols;mNcols;m++) { - if (gcol == RAP->colMap[m]) - RAP->offdCols[n] = m; - } - } - free(col); - free(offdCols); - } - csrHaloSetup(RAP,globalAggStarts); - - //clean up - MPI_Barrier(agmg::comm); - MPI_Type_free(&MPI_RAPENTRY_T); - - free(PEntries); - free(sendCounts); free(recvCounts); - free(sendOffsets); free(recvOffsets); - if (RAP->diagNNZ) { - free(diagCols); - free(diagCoefs); - } - free(RAPEntries); - free(newRAPEntries); - free(recvRAPEntries); - - return RAP; -} - diff --git a/solvers/parALMOND/src/almondKernels.c b/solvers/parALMOND/src/almondKernels.c deleted file mode 100644 index adcb7a944..000000000 --- a/solvers/parALMOND/src/almondKernels.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "agmg.h" - -void buildAlmondKernels(parAlmond_t *parAlmond){ - - int rank, size; - rank = agmg::rank; - size = agmg::size; - - occa::properties kernelInfo; - kernelInfo["defines"].asObject(); - kernelInfo["includes"].asArray(); - kernelInfo["header"].asArray(); - kernelInfo["flags"].asObject(); - - - kernelInfo["defines/" "bdim"]= AGMGBDIM; - kernelInfo["defines/" "simd"]= SIMDWIDTH; - - if(sizeof(dlong)==4){ - kernelInfo["defines/" "dlong"]="int"; - } - if(sizeof(dlong)==8){ - kernelInfo["defines/" "dlong"]="long long int"; - } - - if(sizeof(dfloat) == sizeof(double)){ - kernelInfo["defines/" "dfloat"]= "double"; - kernelInfo["defines/" "dfloat4"]= "double4"; - } - else if(sizeof(dfloat) == sizeof(float)){ - kernelInfo["defines/" "dfloat"]= "float"; - kernelInfo["defines/" "dfloat4"]= "float4"; - } - - kernelInfo["defines/" "p_RDIMX"]= RDIMX; - kernelInfo["defines/" "p_RDIMY"]= RDIMY; - - kernelInfo["includes"] += DPWD "/okl/twoPhaseReduction.h"; - - if(parAlmond->device.mode()=="OpenCL"){ - // parAlmond->device.setCompilerFlags("-cl-opt-disable"); - kernelInfo["compiler_flags"] += "-cl-opt-disable"; - } - - if(parAlmond->device.mode()=="CUDA"){ // add backend compiler optimization for CUDA - kernelInfo["compiler_flags"] += "--ftz=true"; - kernelInfo["compiler_flags"] += "--prec-div=false"; - kernelInfo["compiler_flags"] += "--prec-sqrt=false"; - kernelInfo["compiler_flags"] += "--use_fast_math"; - kernelInfo["compiler_flags"] += "--fmad=true"; // compiler option for cuda - } - - if (rank==0) printf("Compiling parALMOND Kernels \n"); - - for (int r=0;rellAXPYKernel = parAlmond->device.buildKernel(DPWD "/okl/ellAXPY.okl", - "ellAXPY", kernelInfo); - - parAlmond->ellZeqAXPYKernel = parAlmond->device.buildKernel(DPWD "/okl/ellAXPY.okl", - "ellZeqAXPY", kernelInfo); - - parAlmond->ellJacobiKernel = parAlmond->device.buildKernel(DPWD "/okl/ellAXPY.okl", - "ellJacobi", kernelInfo); - - parAlmond->cooAXKernel = parAlmond->device.buildKernel(DPWD "/okl/cooAX.okl", - "cooAXKernel", kernelInfo); - - parAlmond->scaleVectorKernel = parAlmond->device.buildKernel(DPWD "/okl/scaleVector.okl", - "scaleVectorKernel", kernelInfo); - - parAlmond->sumVectorKernel = parAlmond->device.buildKernel(DPWD "/okl/sumVector.okl", - "sumVectorKernel", kernelInfo); - - parAlmond->addScalarKernel = parAlmond->device.buildKernel(DPWD "/okl/addScalar.okl", - "addScalarKernel", kernelInfo); - - parAlmond->vectorAddKernel = parAlmond->device.buildKernel(DPWD "/okl/vectorAdd.okl", - "vectorAddKernel", kernelInfo); - - parAlmond->vectorAddKernel2 = parAlmond->device.buildKernel(DPWD "/okl/vectorAdd.okl", - "vectorAddKernel2", kernelInfo); - - parAlmond->setVectorKernel = parAlmond->device.buildKernel(DPWD "/okl/setVector.okl", - "setVectorKernel", kernelInfo); - - parAlmond->dotStarKernel = parAlmond->device.buildKernel(DPWD "/okl/dotStar.okl", - "dotStarKernel", kernelInfo); - - parAlmond->simpleDotStarKernel = parAlmond->device.buildKernel(DPWD "/okl/dotStar.okl", - "simpleDotStarKernel", kernelInfo); - - parAlmond->haloExtract = parAlmond->device.buildKernel(DPWD "/okl/haloExtract.okl", - "haloExtract", kernelInfo); - - parAlmond->agg_interpolateKernel = parAlmond->device.buildKernel(DPWD "/okl/agg_interpolate.okl", - "agg_interpolate", kernelInfo); - - parAlmond->innerProdKernel = parAlmond->device.buildKernel(DPWD "/okl/innerProduct.okl", - "innerProductKernel", kernelInfo); - - parAlmond->vectorAddInnerProdKernel = parAlmond->device.buildKernel(DPWD "/okl/vectorAddInnerProduct.okl", - "vectorAddInnerProductKernel", kernelInfo); - - parAlmond->kcycleCombinedOp1Kernel = parAlmond->device.buildKernel(DPWD "/okl/kcycleCombinedOp.okl", - "kcycleCombinedOp1Kernel", kernelInfo); - - parAlmond->kcycleCombinedOp2Kernel = parAlmond->device.buildKernel(DPWD "/okl/kcycleCombinedOp.okl", - "kcycleCombinedOp2Kernel", kernelInfo); - - parAlmond->vectorAddWeightedInnerProdKernel = parAlmond->device.buildKernel(DPWD "/okl/vectorAddInnerProduct.okl", - "vectorAddWeightedInnerProductKernel", kernelInfo); - - parAlmond->kcycleWeightedCombinedOp1Kernel = parAlmond->device.buildKernel(DPWD "/okl/kcycleCombinedOp.okl", - "kcycleWeightedCombinedOp1Kernel", kernelInfo); - - parAlmond->kcycleWeightedCombinedOp2Kernel = parAlmond->device.buildKernel(DPWD "/okl/kcycleCombinedOp.okl", - "kcycleWeightedCombinedOp2Kernel", kernelInfo); - } - MPI_Barrier(agmg::comm); - } -} diff --git a/solvers/parALMOND/src/parAlmond.c b/solvers/parALMOND/src/parAlmond.c deleted file mode 100644 index 5e3cca31d..000000000 --- a/solvers/parALMOND/src/parAlmond.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "agmg.h" - -void parAlmondPrecon(parAlmond_t *parAlmond, occa::memory o_x, occa::memory o_rhs) { - - agmgLevel *baseLevel = parAlmond->levels[0]; - setupAide options = parAlmond->options; - - if (baseLevel->gatherLevel==true) {// gather rhs - baseLevel->device_gather(baseLevel->gatherArgs, o_rhs, baseLevel->o_rhs); - } else { - baseLevel->o_rhs.copyFrom(o_rhs); - } - - if (options.compareArgs("PARALMOND CYCLE", "HOST")) { - //host versions - baseLevel->o_rhs.copyTo(baseLevel->rhs); - if(options.compareArgs("PARALMOND CYCLE", "EXACT")) { - if(parAlmond->ktype == PCG) { - pcg(parAlmond,1000,1e-8); - } else if(parAlmond->ktype == GMRES) { - pgmres(parAlmond,1000,1e-8); - } - } else if(options.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - kcycle(parAlmond, 0); - } else if(options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - vcycle(parAlmond, 0); - } - baseLevel->o_x.copyFrom(baseLevel->x); - } else { - if(options.compareArgs("PARALMOND CYCLE", "EXACT")){ - if(parAlmond->ktype == PCG) { - device_pcg(parAlmond,1000,1e-8); - } else if(parAlmond->ktype == GMRES) { - device_pgmres(parAlmond,1000,1e-8); - } - } else if(options.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - device_kcycle(parAlmond, 0); - } else if(options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - device_vcycle(parAlmond, 0); - } - } - - if (baseLevel->gatherLevel==true) {// scatter solution - baseLevel->device_scatter(baseLevel->scatterArgs, baseLevel->o_x, o_x); - } else { - baseLevel->o_x.copyTo(o_x,baseLevel->Nrows*sizeof(dfloat)); - } -} - -parAlmond_t *parAlmondInit(mesh_t *mesh, setupAide options) { - - parAlmond_t *parAlmond = (parAlmond_t *) calloc(1,sizeof(parAlmond_t)); - - parAlmond->device = mesh->device; - parAlmond->defaultStream = mesh->defaultStream; - parAlmond->dataStream = mesh->dataStream; - parAlmond->options = options; - - parAlmond->levels = (agmgLevel **) calloc(MAX_LEVELS,sizeof(agmgLevel *)); - parAlmond->numLevels = 0; - - if (options.compareArgs("PARALMOND CYCLE", "NONSYM")) { - parAlmond->ktype = GMRES; - } else { - parAlmond->ktype = PCG; - } - - agmg::rank = mesh->rank; - agmg::size = mesh->size; - MPI_Comm_dup(mesh->comm, &(agmg::comm)); - - buildAlmondKernels(parAlmond); - - return parAlmond; -} - -void parAlmondAgmgSetup(parAlmond_t *parAlmond, - hlong* globalRowStarts, //global partition - dlong nnz, //-- - hlong* Ai, //-- Local A matrix data (globally indexed, COO storage, row sorted) - hlong* Aj, //-- - dfloat* Avals, //-- - bool nullSpace, - dfloat nullSpacePenalty){ - - int size, rank; - size = agmg::size; - rank = agmg::rank; - - hlong TotalRows = globalRowStarts[size]; - dlong numLocalRows = (dlong) (globalRowStarts[rank+1]-globalRowStarts[rank]); - - if(rank==0) printf("Setting up AMG...");fflush(stdout); - - csr *A = newCSRfromCOO(numLocalRows,globalRowStarts,nnz, Ai, Aj, Avals); - - //record if there is null space - parAlmond->nullSpace = nullSpace; - parAlmond->nullSpacePenalty = nullSpacePenalty; - - //populate null space vector - dfloat *nullA = (dfloat *) calloc(numLocalRows, sizeof(dfloat)); - for (dlong i=0;ioptions); - - if(rank==0) printf("done.\n"); - - if (parAlmond->options.compareArgs("VERBOSE","TRUE")) - parAlmondReport(parAlmond); -} - -//TODO code this -int parAlmondFree(void* A) { - return 0; -} - diff --git a/solvers/parALMOND/src/pcg.c b/solvers/parALMOND/src/pcg.c deleted file mode 100644 index 921e46bcf..000000000 --- a/solvers/parALMOND/src/pcg.c +++ /dev/null @@ -1,283 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "agmg.h" - - -void pcg(parAlmond_t *parAlmond, - int maxIt, - dfloat tol){ - - csr *A = parAlmond->levels[0]->A; - - const dlong m = A->Nrows; - // const dlong n = A->Ncols; - - parAlmond->ktype = PCG; - - // use parAlmond's buffers - dfloat *r = parAlmond->levels[0]->rhs; - dfloat *z = parAlmond->levels[0]->x; - - // initial residual - dfloat rdotr0Local = innerProd(m, r, r); - dfloat rdotr0 = 0; - MPI_Allreduce(&rdotr0Local,&rdotr0,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - - dfloat *x, *p, *Ap; - - x = (dfloat *) calloc(m,sizeof(dfloat)); - Ap = (dfloat *) calloc(m,sizeof(dfloat)); - p = (dfloat *) calloc(m,sizeof(dfloat)); - - // x = 0; - setVector(m, x, 0.0); - - //sanity check - if (rdotr0<=(tol*tol)) { - for (dlong i=0;ilevels[0]->x[i] = x[i]; - - free(x); free(p); free(Ap); - return; - } - - // Precondition, z = M^{-1}*r - if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - kcycle(parAlmond, 0); - } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - vcycle(parAlmond, 0); - } - for (dlong i=0;i(tol*tol)){ - // Ap = A*p; - axpy(A, 1.0, p, 0.0, Ap,parAlmond->nullSpace,parAlmond->nullSpacePenalty); - - dfloat pApLocal = innerProd(m, p, Ap); - pAp = 0; - MPI_Allreduce(&pApLocal,&pAp,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - - alpha = rdotz0/pAp; - - // update solution - // x = x + alpha * p; - vectorAdd(m, alpha, p, 1.0, x); - - // update residual - // r = r - alpha * Ap; - vectorAdd(m, -alpha, Ap, 1.0, r); - - - dfloat rdotr1Local = innerProd(m, r, r); - rdotr1 = 0; - MPI_Allreduce(&rdotr1Local,&rdotr1,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - - if(rdotr1 < tol*tol) { - rdotr0 = rdotr1; - break; - } - - // Precondition, z = M^{-1}*r - if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - kcycle(parAlmond, 0); - } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - vcycle(parAlmond, 0); - } - - dfloat rdotz1Local = innerProd(m, r, z); - rdotz1 = 0; - MPI_Allreduce(&rdotz1Local,&rdotz1,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - - #if 1 - // flexible pcg beta = (z.(-alpha*Ap))/zdotz0 - dfloat zdotApLocal = innerProd(m, z, Ap); - dfloat zdotAp = 0; - MPI_Allreduce(&zdotApLocal,&zdotAp,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - beta = -alpha*zdotAp/rdotz0; - #else - beta = rdotz1/rdotz0; - #endif - - // p = z + beta*p - vectorAdd(m, 1.0, z, beta, p); - - // switch rdotz0 <= rdotz1 - rdotz0 = rdotz1; - - // switch rdotz0,rdotr0 <= rdotz1,rdotr1 - rdotr0 = rdotr1; - - Niter++; - - printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0)); - - if(Niter==maxIt) break; - } - - //copy result back to parAlmond's x storage - for (dlong i=0;ilevels[0]->x[i] = x[i]; - - free(x); free(p); free(Ap); -} - -void device_pcg(parAlmond_t *parAlmond, int maxIt, dfloat tol){ - - hyb* A = parAlmond->levels[0]->deviceA; - - const dlong m = A->Nrows; - const dlong n = A->Ncols; - - parAlmond->ktype = PCG; - - // use parAlmond's buffers - occa::memory &o_r = parAlmond->levels[0]->o_rhs; - occa::memory &o_z = parAlmond->levels[0]->o_x; - - // initial residual - dfloat rdotr0Local = innerProd(parAlmond, m, o_r, o_r); - dfloat rdotr0 = 0; - MPI_Allreduce(&rdotr0Local,&rdotr0,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - - occa::memory o_x, o_p, o_Ap; - - o_x = parAlmond->device.malloc(n*sizeof(dfloat),parAlmond->levels[0]->x); - o_Ap = parAlmond->device.malloc(n*sizeof(dfloat),parAlmond->levels[0]->x); - o_p = parAlmond->device.malloc(n*sizeof(dfloat),parAlmond->levels[0]->x); - - // x = 0; - setVector(parAlmond, m, o_x, 0.0); - - //sanity check - if (rdotr0<=(tol*tol)) { - parAlmond->levels[0]->o_x.copyFrom(o_x); - printf("Almond PCG iter %d, res = %g\n", 0, sqrt(rdotr0)); - o_x.free(); o_p.free(); o_Ap.free(); - return; - } - - // Precondition, z = M^{-1}*r - if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - device_kcycle(parAlmond, 0); - } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - device_vcycle(parAlmond, 0); - } - o_p.copyFrom(o_z); - - dfloat rdotz0Local = innerProd(parAlmond, m, o_r, o_z); - dfloat rdotz0 = 0; - MPI_Allreduce(&rdotz0Local,&rdotz0,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - - dfloat rdotr1 = 0; - dfloat rdotz1 = 0; - dfloat alpha, beta, pAp; - - int Niter = 0; - while(rdotr0>(tol*tol)){ - // Ap = A*p; - axpy(parAlmond, A, 1.0, o_p, 0.0, o_Ap,parAlmond->nullSpace,parAlmond->nullSpacePenalty); - - dfloat pApLocal = innerProd(parAlmond, m, o_p, o_Ap); - pAp = 0; - MPI_Allreduce(&pApLocal,&pAp,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - - alpha = rdotz0/pAp; - - // update solution - // x = x + alpha * p; - vectorAdd(parAlmond, m, alpha, o_p, 1.0, o_x); - - // update residual - // r = r - alpha * Ap; - vectorAdd(parAlmond, m, -alpha, o_Ap, 1.0, o_r); - - - dfloat rdotr1Local = innerProd(parAlmond, m, o_r, o_r); - rdotr1 = 0.; - MPI_Allreduce(&rdotr1Local,&rdotr1,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - - if(rdotr1 < tol*tol) { - rdotr0 = rdotr1; - break; - } - - // Precondition, z = M^{-1}*r - if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) { - device_kcycle(parAlmond, 0); - } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) { - device_vcycle(parAlmond, 0); - } - - dfloat rdotz1Local = innerProd(parAlmond, m, o_r, o_z); - rdotz1 = 0; - MPI_Allreduce(&rdotz1Local,&rdotz1,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - - #if 1 - // flexible pcg beta = (z.(-alpha*Ap))/zdotz0 - dfloat zdotApLocal = innerProd(parAlmond, m, o_z, o_Ap); - dfloat zdotAp = 0; - MPI_Allreduce(&zdotApLocal,&zdotAp,1,MPI_DFLOAT,MPI_SUM,agmg::comm); - beta = -alpha*zdotAp/rdotz0; - #else - beta = rdotz1/rdotz0; - #endif - - // p = z + beta*p - vectorAdd(parAlmond, m, 1.0, o_z, beta, o_p); - - // switch rdotz0 <= rdotz1 - rdotz0 = rdotz1; - - // switch rdotz0,rdotr0 <= rdotz1,rdotr1 - rdotr0 = rdotr1; - - Niter++; - - //printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0)); - - if(Niter==maxIt) break; - } - - //copy result back to parAlmond's x storage - parAlmond->levels[0]->o_x.copyFrom(o_x); - - printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0)); - - o_x.free(); o_p.free(); o_Ap.free(); -} - - diff --git a/solvers/parALMOND/src/vectorPrimitives.c b/solvers/parALMOND/src/vectorPrimitives.c deleted file mode 100644 index 7d2f6d036..000000000 --- a/solvers/parALMOND/src/vectorPrimitives.c +++ /dev/null @@ -1,337 +0,0 @@ -/* - -The MIT License (MIT) - -Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - -*/ - -#include "agmg.h" - -dfloat norm(dlong n, dfloat *a){ - dfloat result = 0.; - #pragma omp parallel for reduction(+:result) - for(dlong i=0; iscaleVectorKernel(N, alpha, o_a); -} - -void setVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a, dfloat alpha){ - if (N) parAlmond->setVectorKernel(N, alpha, o_a); -} - -dfloat sumVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a){ - dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD; - if(!numBlocks) numBlocks = 1; - - if (N) parAlmond->sumVectorKernel(numBlocks,N,o_a,parAlmond->o_rho); - parAlmond->o_rho.copyTo(parAlmond->rho,numBlocks*sizeof(dfloat),0); - - dfloat alpha =0.; - #pragma omp parallel for reduction(+:alpha) - for (dlong i=0; irho[i]; - } - - return alpha; -} - -void addScalar(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_a){ - if (N) parAlmond->addScalarKernel(N, alpha, o_a); -} - -void dotStar(parAlmond_t *parAlmond, dlong N, occa::memory o_a, occa::memory o_b){ - if (N) parAlmond->simpleDotStarKernel(N, o_a, o_b); -} - -void dotStar(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_a, - occa::memory o_b, dfloat beta, occa::memory o_c){ - if (N) parAlmond->dotStarKernel(N, alpha, beta, o_a, o_b, o_c); -} - -dfloat innerProd(parAlmond_t *parAlmond, dlong N, - occa::memory o_x, occa::memory o_y){ - - dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD; - if(!numBlocks) numBlocks = 1; - - parAlmond->innerProdKernel(numBlocks,N,o_x,o_y,parAlmond->o_rho); - parAlmond->o_rho.copyTo(parAlmond->rho,numBlocks*sizeof(dfloat),0); - - dfloat result =0.; - #pragma omp parallel for reduction(+:result) - for (dlong i=0; irho[i]; - } - - return result; -} - -// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b, -void kcycleCombinedOp1(parAlmond_t *parAlmond, dlong N, dfloat *aDotbc, - occa::memory o_a, occa::memory o_b, - occa::memory o_c, occa::memory o_w, bool weighted) { - dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD; - if(!numBlocks) numBlocks = 1; - - if (weighted) { - parAlmond->kcycleWeightedCombinedOp1Kernel(numBlocks,N,o_a,o_b,o_c,o_w,parAlmond->o_rho); - } else { - parAlmond->kcycleCombinedOp1Kernel(numBlocks,N,o_a,o_b,o_c,parAlmond->o_rho); - } - parAlmond->o_rho.copyTo(parAlmond->rho,3*numBlocks*sizeof(dfloat),0); - - dfloat aDotb = 0., aDotc = 0., bDotb = 0.; - #pragma omp parallel for reduction(+:aDotb) reduction(+:aDotc) reduction(+:bDotb) - for(dlong i=0; irho[3*i+0]; - aDotc += parAlmond->rho[3*i+1]; - bDotb += parAlmond->rho[3*i+2]; - } - - aDotbc[0] = aDotb; - aDotbc[1] = aDotc; - aDotbc[2] = bDotb; -} - -// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d, -void kcycleCombinedOp2(parAlmond_t *parAlmond, dlong N, dfloat *aDotbcd, - occa::memory o_a, occa::memory o_b, - occa::memory o_c, occa::memory o_d, - occa::memory o_w, bool weighted) { - - dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD; - if(!numBlocks) numBlocks = 1; - - if (weighted) { - parAlmond->kcycleWeightedCombinedOp2Kernel(numBlocks,N,o_a,o_b,o_c,o_d,o_w,parAlmond->o_rho); - } else { - parAlmond->kcycleCombinedOp2Kernel(numBlocks,N,o_a,o_b,o_c,o_d,parAlmond->o_rho); - } - parAlmond->o_rho.copyTo(parAlmond->rho,3*numBlocks*sizeof(dfloat),0); - - dfloat aDotb = 0., aDotc = 0., aDotd = 0.; - #pragma omp parallel for reduction(+:aDotb) reduction(+:aDotc) reduction(+:aDotd) - for(dlong i=0; irho[3*i+0]; - aDotc += parAlmond->rho[3*i+1]; - aDotd += parAlmond->rho[3*i+2]; - } - - aDotbcd[0] = aDotb; - aDotbcd[1] = aDotc; - aDotbcd[2] = aDotd; -} - -// y = beta*y + alpha*x, and return y\dot y -dfloat vectorAddInnerProd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x, - dfloat beta, occa::memory o_y, - occa::memory o_w, bool weighted){ - dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD; - if(!numBlocks) numBlocks = 1; - - if (weighted) { - parAlmond->vectorAddWeightedInnerProdKernel(numBlocks,N,alpha,beta,o_x,o_y,o_w,parAlmond->o_rho); - } else { - parAlmond->vectorAddInnerProdKernel(numBlocks,N,alpha,beta,o_x,o_y,parAlmond->o_rho); - } - parAlmond->o_rho.copyTo(parAlmond->rho,numBlocks*sizeof(dfloat),0); - - dfloat result =0.; - #pragma omp parallel for reduction(+:result) - for (dlong i=0; irho[i]; - } - - return result; -} - - -void vectorAdd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y){ - parAlmond->vectorAddKernel(N, alpha, beta, o_x, o_y); -} - -void vectorAdd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x, - dfloat beta, occa::memory o_y, occa::memory o_z){ - parAlmond->vectorAddKernel2(N, alpha, beta, o_x, o_y, o_z); -}