diff --git a/3rdParty/gslib/src/gs.c b/3rdParty/gslib/src/gs.c
index 7b91607ca..1582f4a11 100644
--- a/3rdParty/gslib/src/gs.c
+++ b/3rdParty/gslib/src/gs.c
@@ -1114,11 +1114,15 @@ static void auto_setup(struct gs_remote *r, struct gs_topology *top,
     struct gs_remote r_alt;
     double time[2][3];
 
+    /* Verbose variant of DRY_RUN, disabled so that setup does not print timings:
+     *   #define DRY_RUN(i,gsr,str) do { \
+     *     if(comm->id==0) printf("   " str ": "); \
+     *     dry_run_time(time[i],gsr,comm,buf); \
+     *     if(comm->id==0) printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \
+     *   } while(0) */
+
     #define DRY_RUN(i,gsr,str) do { \
-      if(comm->id==0) printf("   " str ": "); \
       dry_run_time(time[i],gsr,comm,buf); \
-      if(comm->id==0) \
-        printf("%g %g %g\n",time[i][0],time[i][1],time[i][2]); \
     } while(0)
 
     #define DRY_RUN_CHECK(str,new_name) do { \
@@ -1143,7 +1147,7 @@ static void auto_setup(struct gs_remote *r, struct gs_topology *top,
     #undef DRY_RUN_CHECK
     #undef DRY_RUN
 
-    if(comm->id==0) printf("   used all_to_all method: %s\n",name);
+    // if(comm->id==0) printf("   used all_to_all method: %s\n",name);
   }
 }
 
diff --git a/include/ogs_t.h b/include/ogs_t.h
deleted file mode 100644
index 2864fc0ed..000000000
--- a/include/ogs_t.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-// OCCA+gslib gather scatter
-typedef struct {
-  
-  dlong         Ngather;     //  total number of gather nodes
-  dlong         NtotalGather;     //  total number of gather nodes
-  dlong         NnonHaloGather;       //  number of local gathered nodes 
-  dlong         NhaloGather;          //  number of gathered nodes on halo
-
-  dlong       *nonHaloGatherOffsets;
-  int         *nonHaloGatherHaloFlags;
-  int         *nonHaloGatherBaseRanks;
-  dlong       *nonHaloGatherLocalIds;
-  hlong       *nonHaloGatherBaseIds;
-
-  dlong       *haloGatherOffsets;
-  int         *haloGatherHaloFlags;
-  int         *haloGatherBaseRanks;
-  dlong       *haloGatherLocalIds;
-  hlong       *haloGatherBaseIds;
-
-  dlong    *ownedHaloGatherIds;
-
-  dfloat * haloGatherTmp;
-  occa::memory o_nonHaloGatherOffsets;  //  start of local bases
-  occa::memory o_nonHaloGatherLocalIds; //  base connected nodes
-  occa::memory o_nonHaloGatherTmp;      //  DEVICE gather buffer
-
-  occa::memory o_haloGatherOffsets;  //  start of local bases
-  occa::memory o_haloGatherLocalIds; //  base connected nodes
-  occa::memory o_haloGatherTmp;      //  DEVICE gather buffer
-  
-  occa::memory o_ownedHaloGatherIds;
-
-  void         *haloGsh;       // gslib gather 
-  dlong         Nhalo;            //  number of halo nodes
-  dlong         NownedHalo;       //  number of owned halo nodes
-  
-  //degree vectors
-  dfloat *invDegree, *gatherInvDegree;
-  occa::memory o_invDegree;
-  occa::memory o_gatherInvDegree;
-
-}ogs_t;
diff --git a/include/parAlmond.h b/include/parAlmond.h
deleted file mode 100644
index 09542cd4c..000000000
--- a/include/parAlmond.h
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef PARALMOND_H
-#define PARALMOND_H 1
-
-typedef struct csr_t {
-
-  dlong Nrows;
-  dlong Ncols;
-
-  dlong NlocalCols;
-
-  //local
-  dlong diagNNZ;
-  dlong   *diagRowStarts;
-  dlong   *diagCols;
-  dfloat *diagCoefs;
-
-  //non-local
-  dlong offdNNZ;
-  dlong   *offdRowStarts;
-  dlong   *offdCols;
-  dfloat *offdCoefs;
-
-  dfloat *diagInv;
-
-  dfloat *null;
-
-  //storage for smoothing
-  dfloat *scratch;
-
-  hlong   *colMap;
-
-  // MPI halo exchange info
-  dlong  NHalo;
-  int  NrecvTotal;  // number of elements to be sent in halo exchange
-  int  NsendTotal;
-  dlong  totalHaloPairs;
-  dlong *haloElementList; // sorted list of elements to be sent in halo exchange
-  int *NsendPairs;      // number of elements worth of data to send
-  int *NrecvPairs;      // number of elements worth of data to recv
-  int  NsendMessages;   // number of messages to send
-  int  NrecvMessages;   // number of messages to recv
-  dfloat *sendBuffer;
-
-  void *haloSendRequests;
-  void *haloRecvRequests;
-
-} csr;
-
-
-typedef struct ell_t {
-
-  dlong Nrows;
-  dlong Ncols;
-  int nnzPerRow;
-  dlong strideLength;
-  dlong actualNNZ;
-
-  occa::memory o_cols;
-  occa::memory o_coefs;
-
-} ell;
-
-typedef struct coo_t {
-
-  dlong Nrows;
-  dlong Ncols;
-  dlong nnz;
-
-  // device memory
-  occa::memory o_offsets;
-  occa::memory o_cols;
-  occa::memory o_coefs;
-
-} coo;
-
-typedef struct hyb_t {
-
-  dlong Nrows;
-  dlong Ncols;
-
-  dlong NlocalCols;
-
-  coo *C;
-  ell *E;
-
-  occa::memory o_diagInv;
-
-  occa::memory o_null;
-
-  // MPI halo exchange info
-  dlong  NHalo;
-  hlong *colMap;
-  int  NrecvTotal;  // number of elements to be sent in halo exchange
-  int  NsendTotal;
-  dlong *haloElementList; // sorted list of elements to be sent in halo exchange
-  occa::memory o_haloElementList;
-  int *NsendPairs;      // number of elements worth of data to send
-  int *NrecvPairs;      // number of elements worth of data to recv
-  int  NsendMessages;   // number of messages to send
-  int  NrecvMessages;   // number of messages to recv
-  dfloat   *sendBuffer;
-  dfloat   *recvBuffer;
-  occa::memory o_haloBuffer;
-
-  void *haloSendRequests;
-  void *haloRecvRequests;
-
-} hyb;
-
-
-typedef struct dcsr_t {
-
-  dlong Nrows;
-  dlong Ncols;
-
-  dlong NlocalCols;
-
-  //local
-  dlong diagNNZ;
-  occa::memory o_diagRows;
-  occa::memory o_diagCols;
-  occa::memory o_diagCoefs;
-
-  //non-local
-  dlong offdNNZ;
-  occa::memory o_offdRows;
-  occa::memory o_offdCols;
-  occa::memory o_offdCoefs;
-
-  // MPI halo exchange info
-  dlong  NHalo;
-  int  NrecvTotal;  // number of elements to be sent in halo exchange
-  int  NsendTotal;
-  dlong  totalHaloPairs;
-  dlong *haloElementList; // sorted list of elements to be sent in halo exchange
-  int *NsendPairs;      // number of elements worth of data to send
-  int *NrecvPairs;      // number of elements worth of data to recv
-  int  NsendMessages;   // number of messages to send
-  int  NrecvMessages;   // number of messages to recv
-  dfloat   *sendBuffer;
-  dfloat   *recvBuffer;
-
-  occa::memory o_haloElementList;
-  occa::memory o_haloBuffer;
-
-  void *haloSendRequests;
-  void *haloRecvRequests;
-
-} dcoo;
-
-typedef enum {PCG=0,GMRES=1}KrylovType;
-typedef enum {JACOBI=0,DAMPED_JACOBI=1,CHEBYSHEV=2}SmoothType;
-
-typedef struct agmgLevel_t {
-  dlong Nrows;
-  dlong Ncols;
-
-  hlong *globalRowStarts; //global partitioning of fine level
-  hlong *globalAggStarts; //global partitioning of coarse level
-
-  bool gatherLevel;
-  bool weightedInnerProds;
-
-  void **AxArgs;
-  void **smoothArgs;
-  void **smootherArgs;
-  void **coarsenArgs;
-  void **prolongateArgs;
-  void **gatherArgs;
-  void **scatterArgs;
-
-  //operator call-backs
-  void (*device_Ax)        (void **args, occa::memory &o_x, occa::memory &o_Ax);
-  void (*device_smooth)    (void **args, occa::memory &o_r, occa::memory &o_x, bool x_is_zero);
-  void (*device_smoother)  (void **args, occa::memory &o_r, occa::memory &o_Sr);
-  void (*device_coarsen)   (void **args, occa::memory &o_x, occa::memory &o_Rx);
-  void (*device_prolongate)(void **args, occa::memory &o_x, occa::memory &o_Px);
-  void (*device_gather)    (void **args, occa::memory &o_x, occa::memory &o_Gx);
-  void (*device_scatter)   (void **args, occa::memory &o_x, occa::memory &o_Sx);
-
-  //host versions
-  void (*Ax)        (void **args, dfloat *x, dfloat *Ax);
-  void (*smooth)    (void **args, dfloat *r, dfloat *x, bool x_is_zero);
-  void (*smoother)  (void **args, dfloat *r, dfloat *Sr);
-  void (*coarsen)   (void **args, dfloat *x, dfloat *Rx);
-  void (*prolongate)(void **args, dfloat *x, dfloat *Px);
-  void (*gather)    (void **args, dfloat *x, dfloat *Gx);
-  void (*scatter)   (void **args, dfloat *x, dfloat *Sx);
-
-  //agmg operators
-  csr *A;
-  csr *P;
-  csr *R;
-
-  hyb  *deviceA;
-  dcoo  *dcsrP;
-  hyb  *deviceR;
-
-  dfloat *rhs, *res, *x;
-
-  dfloat *Srhs, *Sx; //scatter copies
-
-  dfloat *ckp1, *vkp1, *wkp1;
-
-  dfloat *weight;
-
-  occa::memory o_rhs, o_res, o_x;
-  occa::memory o_Srhs, o_Sx;
-  occa::memory o_ckp1, o_vkp1, o_wkp1;
-
-  occa::memory o_weight;
-
-  dfloat *smoother_params;
-  dfloat *smootherResidual;
-  dfloat *smootherResidual2;
-  dfloat *smootherUpdate;
-  occa::memory o_smootherResidual;
-  occa::memory o_smootherResidual2;
-  occa::memory o_smootherUpdate;
-  int ChebyshevIterations;
-
-  dfloat threshold;
-  dlong numAggregates;
-  SmoothType stype;
-
-} agmgLevel;
-
-typedef struct {
-  agmgLevel **levels;
-  int numLevels;
-
-  KrylovType ktype;
-
-  setupAide options;
-
-  //Matrix Free args
-  void (*MatFreeAx)(void **args, occa::memory o_q, occa::memory o_Aq,const char* options);
-  void **MatFreeArgs;
-
-  //Coarse solver
-  void *ExactSolve;
-  int coarseTotal;
-  int coarseOffset;
-  int *coarseOffsets;
-  int *coarseCounts;
-  dfloat *invCoarseA;
-  dfloat *xCoarse, *rhsCoarse;
-
-  bool nullSpace;
-  dfloat nullSpacePenalty;
-
-  occa::device device;
-  occa::stream defaultStream;
-  occa::stream dataStream;  
-
-  occa::memory o_x;
-  occa::memory o_Ax;
-
-  dfloat *rho;
-  occa::memory o_rho;
-
-  occa::kernel ellAXPYKernel;
-  occa::kernel ellZeqAXPYKernel;
-  occa::kernel ellJacobiKernel;
-  occa::kernel cooAXKernel;
-  occa::kernel scaleVectorKernel;
-  occa::kernel vectorAddKernel;
-  occa::kernel vectorAddKernel2;
-  occa::kernel setVectorKernel;
-  occa::kernel sumVectorKernel;
-  occa::kernel addScalarKernel;
-  occa::kernel dotStarKernel;
-  occa::kernel simpleDotStarKernel;
-  occa::kernel haloExtract;
-  occa::kernel agg_interpolateKernel;
-  occa::kernel innerProdKernel;
-  occa::kernel vectorAddInnerProdKernel;
-  occa::kernel kcycleCombinedOp1Kernel;
-  occa::kernel kcycleCombinedOp2Kernel;
-  occa::kernel vectorAddWeightedInnerProdKernel;
-  occa::kernel kcycleWeightedCombinedOp1Kernel;
-  occa::kernel kcycleWeightedCombinedOp2Kernel;
-
-} parAlmond_t;
-
-parAlmond_t *parAlmondInit(mesh_t *mesh, setupAide options);
-
-void parAlmondAgmgSetup(parAlmond_t* parAlmond,
-                       hlong* rowStarts,
-                       dlong nnz,
-                       hlong* Ai,
-                       hlong* Aj,
-                       dfloat* Avals,
-                       bool nullSpace,
-                       dfloat nullSpacePenalty);
-
-void parAlmondPrecon(parAlmond_t* parAlmond, occa::memory o_x, occa::memory o_rhs);
-
-int parAlmondFree(void* A);
-
-#endif
diff --git a/libs/gatherScatter/src/ogsKernels.cpp b/libs/gatherScatter/src/ogsKernels.cpp
index 954a5114a..ec0eb20b6 100644
--- a/libs/gatherScatter/src/ogsKernels.cpp
+++ b/libs/gatherScatter/src/ogsKernels.cpp
@@ -158,7 +158,7 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) {
   int rank, size;
   MPI_Comm_rank(comm, &rank);
   MPI_Comm_size(comm, &size);
-  
+
   ogs::defaultStream = device.getStream();
   ogs::dataStream    = device.createStream();
 
@@ -197,7 +197,7 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) {
    kernelInfo["compiler_flags"] += "--fmad=true"; // compiler option for cuda
   }
 
-  if (rank==0) printf("Compiling GatherScatter Kernels \n");
+  if (rank==0) printf("Compiling GatherScatter Kernels...");fflush(stdout);
 
   for (int r=0;r<size;r++) {
     if (r==rank) {
@@ -260,7 +260,7 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) {
       ogs::gatherScatterManyKernel_longMul = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longMul", kernelInfo);
       ogs::gatherScatterManyKernel_longMin = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longMin", kernelInfo);
       ogs::gatherScatterManyKernel_longMax = device.buildKernel(DOGS "/okl/gatherScatterMany.okl", "gatherScatterMany_longMax", kernelInfo);
-      
+
 
 
       ogs::gatherKernel_floatAdd = device.buildKernel(DOGS "/okl/gather.okl", "gather_floatAdd", kernelInfo);
@@ -323,7 +323,7 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) {
       ogs::gatherManyKernel_longMin = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longMin", kernelInfo);
       ogs::gatherManyKernel_longMax = device.buildKernel(DOGS "/okl/gatherMany.okl", "gatherMany_longMax", kernelInfo);
 
-      
+
 
       ogs::scatterKernel_float = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_float", kernelInfo);
       ogs::scatterKernel_double = device.buildKernel(DOGS "/okl/scatter.okl", "scatter_double", kernelInfo);
@@ -342,6 +342,7 @@ void ogs::initKernels(MPI_Comm comm, occa::device device) {
     }
     MPI_Barrier(comm);
   }
+  if(rank==0) printf("done.\n");
 }
 
 void ogs::freeKernels() {
diff --git a/libs/gatherScatter/src/ogsSetup.cpp b/libs/gatherScatter/src/ogsSetup.cpp
index 81ef56acb..8f5d9314d 100644
--- a/libs/gatherScatter/src/ogsSetup.cpp
+++ b/libs/gatherScatter/src/ogsSetup.cpp
@@ -32,7 +32,7 @@ typedef struct{
 
   dlong localId;    // local node id
   hlong baseId;     // original global index
-  
+
   dlong newId;         // new global id
   int owned;
 
@@ -65,12 +65,12 @@ int compareLocalId(const void *a, const void *b){
   return 0;
 }
 
-ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm, 
+ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
                 int verbose, occa::device device){
 
   ogs_t *ogs = (ogs_t*) calloc(1, sizeof(ogs_t));
 
-  //Keep track of how many gs handles we've created, and 
+  //Keep track of how many gs handles we've created, and
   // build kernels if this is the first
   if (!ogs::Nrefs) ogs::initKernels(comm, device);
   ogs::Nrefs++;
@@ -79,8 +79,8 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
   ogs->comm = comm;
 
   int rank, size;
-  MPI_Comm_rank(ogs->comm, &rank); 
-  MPI_Comm_size(ogs->comm, &size); 
+  MPI_Comm_rank(ogs->comm, &rank);
+  MPI_Comm_size(ogs->comm, &size);
 
   //make a host gs handle (calls gslib)
   ogs->hostGsh = ogsHostSetup(comm, N, ids, 0, 0);
@@ -114,10 +114,10 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
 
   //set up the local gatherScatter
   parallelNode_t *localNodes;
-  
+
   if (ogs->Nlocal) {
     localNodes = (parallelNode_t*) calloc(ogs->Nlocal,sizeof(parallelNode_t));
-  
+
     dlong cnt=0;
     for (dlong i=0;i<N;i++) {
       if (ids[i]==0) continue;
@@ -150,17 +150,17 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
     // sort based on local ids
     qsort(localNodes, ogs->Nlocal, sizeof(parallelNode_t), compareLocalId);
 
-    //tally up how many nodes are being gathered to each gatherNode and 
+    //tally up how many nodes are being gathered to each gatherNode and
     //  map to a local ordering
     dlong *localGatherCounts = (dlong*) calloc(ogs->NlocalGather,sizeof(dlong));
     dlong *localGatherMap    = (dlong*) calloc(ogs->NlocalGather,sizeof(dlong));
     cnt = 0;
     for (dlong i=0;i<ogs->Nlocal;i++) {
       dlong newId = localNodes[i].newId; //get the ordered id
-      
-      if (localNodes[i].owned) 
+
+      if (localNodes[i].owned)
         localGatherMap[newId] = cnt++; //record a new index if this is a new gatherNode
-      
+
       localNodes[i].newId = localGatherMap[newId]; //reorder
       localGatherCounts[localGatherMap[newId]]++;  //tally
     }
@@ -177,7 +177,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
       dlong gatherId = localNodes[i].newId;
       dlong offset = ogs->localGatherOffsets[gatherId];
       int index  = localGatherCounts[gatherId];
-      
+
       ogs->localGatherIds[offset+index] = localNodes[i].localId;
       localGatherCounts[gatherId]++;
     }
@@ -235,7 +235,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
     // sort based on local ids
     qsort(haloNodes, ogs->Nhalo, sizeof(parallelNode_t), compareLocalId);
 
-    //tally up how many nodes are being gathered to each gatherNode and 
+    //tally up how many nodes are being gathered to each gatherNode and
     //  map to a local ordering
     dlong *haloGatherCounts = (dlong*) calloc(ogs->NhaloGather,sizeof(dlong));
     dlong *haloGatherMap    = (dlong*) calloc(ogs->NhaloGather,sizeof(dlong));
@@ -246,19 +246,19 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
     dlong cnt2 = ogs->NownedHalo;
     for (dlong i=0;i<ogs->Nhalo;i++) {
       dlong newId = haloNodes[i].newId; //get the ordered id
-      
+
       if (haloNodes[i].owned) {
         dlong c;
-        if (haloNodes[i].baseId>0) 
+        if (haloNodes[i].baseId>0)
           c = cnt++;
-        else 
+        else
           c = cnt2++;
 
-        symIds[c]    = abs(haloNodes[i].baseId); //record the base id 
-        nonSymIds[c] = haloNodes[i].baseId;      //record the base id 
+        symIds[c]    = abs(haloNodes[i].baseId); //record the base id
+        nonSymIds[c] = haloNodes[i].baseId;      //record the base id
         haloGatherMap[newId] = c; //record a new index if this is a new gatherNode
       }
-      
+
       haloNodes[i].newId = haloGatherMap[newId];  //reorder
       haloGatherCounts[haloGatherMap[newId]]++;  //tally
     }
@@ -275,7 +275,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
       dlong gatherId = haloNodes[i].newId;
       dlong offset = ogs->haloGatherOffsets[gatherId];
       int index  = haloGatherCounts[gatherId];
-      
+
       ogs->haloGatherIds[offset+index] = haloNodes[i].localId;
       haloGatherCounts[gatherId]++;
     }
@@ -294,7 +294,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
   free(minRank); free(maxRank); free(flagIds);
 
   //total number of owned gathered nodes
-  ogs->Ngather = ogs->NlocalGather+ogs->NownedHalo; 
+  ogs->Ngather = ogs->NlocalGather+ogs->NownedHalo;
 
   ogs->device = device;
 
@@ -305,7 +305,7 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
 
   ogs->o_invDegree = device.malloc(N*sizeof(dfloat), ogs->invDegree);
   ogs->o_gatherInvDegree = device.malloc(ogs->Ngather*sizeof(dfloat), ogs->gatherInvDegree);
-  
+
   ogsGather(ogs->o_gatherInvDegree, ogs->o_invDegree, ogsDfloat, ogsAdd, ogs);
 
   if(ogs->Ngather)
@@ -313,11 +313,11 @@ ogs_t *ogsSetup(dlong N, hlong *ids, MPI_Comm &comm,
 
   ogsScatter(ogs->o_invDegree, ogs->o_gatherInvDegree, ogsDfloat, ogsAdd, ogs);
 
-  ogs->o_invDegree.copyTo(ogs->invDegree);  
-  
-  for(dlong n=0;n<ogs->N;++n) 
+  if (N) ogs->o_invDegree.copyTo(ogs->invDegree);
+
+  for(dlong n=0;n<ogs->N;++n)
     ogs->invDegree[n] = 1./ogs->invDegree[n];
-  
+
   for(dlong n=0;n<ogs->Ngather;++n)
     ogs->gatherInvDegree[n] = 1./ogs->gatherInvDegree[n];
 
diff --git a/libs/parAlmond/include/agmg.hpp b/libs/parAlmond/include/agmg.hpp
new file mode 100644
index 000000000..06e2af214
--- /dev/null
+++ b/libs/parAlmond/include/agmg.hpp
@@ -0,0 +1,104 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARALMOND_AGMGLEVEL_HPP
+#define PARALMOND_AGMGLEVEL_HPP
+
+namespace parAlmond {
+
+class agmgLevel: public multigridLevel { // multigrid level specialised from multigridLevel
+
+public:
+  parCSR   *A,   *P,   *R; // NOTE(review): presumably level operator, prolongation, restriction - confirm
+  parHYB *o_A, *o_P, *o_R; // NOTE(review): presumably parHYB counterparts of A, P, R - confirm
+
+  SmoothType stype; // NOTE(review): multigridLevel also declares stype; this member hides it - confirm intended
+  dfloat lambda, lambda1, lambda0; //smoothing params
+
+  int ChebyshevIterations; // hides the namespace-level parAlmond::ChebyshevIterations inside this class
+
+  bool gatherLevel;
+  ogs_t *ogs;
+  dfloat *Gx, *Sx;
+  occa::memory o_Sx, o_Gx;
+
+  agmgLevel(parCSR *AA, KrylovType Ktype);
+  agmgLevel(parCSR *AA, parCSR *PP, parCSR *RR, KrylovType Ktype);
+  ~agmgLevel();
+
+  void Ax(dfloat        *x, dfloat        *Ax); // each operation has a dfloat* overload ...
+  void Ax(occa::memory o_x, occa::memory o_Ax); // ... and an occa::memory overload
+
+  void smooth(dfloat        *rhs, dfloat        *x, bool x_is_zero);
+  void smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero);
+
+  void residual(dfloat        *rhs, dfloat        *x, dfloat        *res);
+  void residual(occa::memory o_rhs, occa::memory o_x, occa::memory o_res);
+
+  void coarsen(dfloat        *x, dfloat        *Cx);
+  void coarsen(occa::memory o_x, occa::memory o_Cx);
+
+  void prolongate(dfloat        *x, dfloat        *Px);
+  void prolongate(occa::memory o_x, occa::memory o_Px);
+
+  void smoothJacobi(dfloat *r, dfloat *x, const bool x_is_zero); // one smoother per SmoothType value
+  void smoothDampedJacobi(dfloat *r, dfloat *x, const bool x_is_zero);
+  void smoothChebyshev(dfloat *r, dfloat *x, const bool x_is_zero);
+
+  void smoothJacobi(occa::memory o_r, occa::memory o_x, bool x_is_zero);
+  void smoothDampedJacobi(occa::memory o_r, occa::memory o_x, bool x_is_zero);
+  void smoothChebyshev(occa::memory o_r, occa::memory o_x, bool x_is_zero);
+
+  void Report();
+};
+
+
+agmgLevel *coarsenAgmgLevel(agmgLevel *level, KrylovType ktype, setupAide options);
+
+parCSR* strongGraph(parCSR *A);
+
+void formAggregates(parCSR *A, parCSR *C,
+                     hlong* FineToCoarse,
+                     hlong* globalAggStarts);
+
+parCSR *constructProlongation(parCSR *A, hlong *FineToCoarse,
+                            hlong *globalAggStarts, dfloat **nullCoarseA);
+
+parCSR *transpose(parCSR *A);
+
+parCSR *galerkinProd(parCSR *A, parCSR *P);
+
+
+
+void setupAgmgSmoother(agmgLevel *level, SmoothType s, int ChebIterations);
+
+void allocateAgmgVectors(agmgLevel *level, int k, int numLevels, CycleType ctype);
+
+void syncAgmgToDevice(agmgLevel *level, int k, int numLevels, CycleType ctype);
+
+}
+
+#endif
\ No newline at end of file
diff --git a/solvers/parALMOND/okl/cooAX.okl b/libs/parAlmond/include/coarse.hpp
similarity index 61%
rename from solvers/parALMOND/okl/cooAX.okl
rename to libs/parAlmond/include/coarse.hpp
index b909676b4..dd0e0aba4 100644
--- a/solvers/parALMOND/okl/cooAX.okl
+++ b/libs/parAlmond/include/coarse.hpp
@@ -24,29 +24,51 @@ SOFTWARE.
 
 */
 
+#ifndef PARALMOND_COARSESOLVE_HPP
+#define PARALMOND_COARSESOLVE_HPP
+
+namespace parAlmond {
+
+class coarseSolver { // coarse-problem solver: configured via setup(parCSR*), applied via solve()
+
+public:
+  int coarseTotal=0;  // NOTE(review): presumably the total coarse size over all ranks - confirm
+  int coarseOffset=0; // NOTE(review): presumably this rank's offset into the coarse system - confirm
+  int *coarseOffsets=NULL;
+  int *coarseCounts=NULL;
+
+  int N=0;
+  dfloat *invCoarseA=NULL;
+
+  dfloat *xLocal=NULL;
+  dfloat *rhsLocal=NULL;
+
+  dfloat *xCoarse=NULL;
+  dfloat *rhsCoarse=NULL;
+
+  bool gatherLevel=false; // defaulted so the flag is never read while indeterminate
+  ogs_t *ogs=NULL;        // NULL until assigned, matching the other pointer members
+  dfloat *Gx=NULL, *Sx=NULL;
+  occa::memory o_Sx, o_Gx;
+
+  MPI_Comm comm;
+  occa::device device;
+
+  setupAide options;
+
+  coarseSolver(setupAide options);
+  ~coarseSolver();
+
+  int getTargetSize();
+
+  void setup(parCSR *A);
+
+  void syncToDevice();
+
+  void solve(dfloat *rhs, dfloat *x);               // dfloat* overload
+  void solve(occa::memory o_rhs, occa::memory o_x); // occa::memory overload
+};
 
-// y += alpha*A*x
-
-@kernel void cooAXKernel(const dlong   numRows,
-			    const dfloat           alpha,
-			    @restrict const    dlong * offsets,
-			    @restrict const    dlong * cols,
-			    @restrict const  dfloat * coeffs,
-			    @restrict const  dfloat * x,
-			          @restrict dfloat * y){
-  
-  for(dlong n=0;n<numRows;++n;@tile(256,@outer,@inner)){
-    if (n<numRows) {
-      dlong start = offsets[n];
-      dlong end = offsets[n+1];
-      
-      dfloat res = y[n];
-      dfloat Axn = 0;
-      for(dlong i=start;i<end;++i){
-        Axn += coeffs[i]*x[cols[i]];
-      }
-      y[n] = res + alpha*Axn;
-    }
-  }
 }
-	
+
+#endif
\ No newline at end of file
diff --git a/solvers/parALMOND/okl/haloExtract.okl b/libs/parAlmond/include/defines.hpp
similarity index 69%
rename from solvers/parALMOND/okl/haloExtract.okl
rename to libs/parAlmond/include/defines.hpp
index 3966e1063..3335c38b3 100644
--- a/solvers/parALMOND/okl/haloExtract.okl
+++ b/libs/parAlmond/include/defines.hpp
@@ -24,17 +24,27 @@ SOFTWARE.
 
 */
 
+#ifndef PARALMOND_DEFINES_HPP
+#define PARALMOND_DEFINES_HPP
 
-@kernel void haloExtract(const dlong NhaloElements,
-			      const int              Nentries,
-			      @restrict const  dlong   * haloElements,
-			      @restrict const  dfloat * q,
-			            @restrict dfloat * haloq){
-
-  for(dlong e=0;e<NhaloElements;++e;@outer(0)){  // for all elements
-    for(int n=0;n<Nentries;++n;@inner(0)){     // for all entries in this element
-      const dlong id = haloElements[e];
-      haloq[n + Nentries*e] = q[n + Nentries*id];
-    }
-  }
-}
+#define BLOCKSIZE 512 // NOTE(review): unscoped macro with a generic name; visible to every includer
+#define NBLOCKS 128
+
+#define MAX_LEVELS 100 // NOTE(review): presumably an upper bound on multigrid levels - confirm
+#define GPU_CPU_SWITCH_SIZE 0 //host-device switch threshold
+
+#define NUMKCYCLES 3
+#define COARSENTHREASHOLD 0.5 // NOTE(review): name is misspelled (THRESHOLD); renaming needs all users updated
+#define KCYCLETOL 0.2
+
+namespace parAlmond {
+
+extern int ChebyshevIterations; // declaration only; the definition must live in exactly one source file
+
+typedef enum {VCYCLE=0,KCYCLE=1,EXACT=3} CycleType; // NOTE(review): value 2 is unused - confirm the gap is intentional
+typedef enum {PCG=0,GMRES=1} KrylovType;
+typedef enum {JACOBI=0,DAMPED_JACOBI=1,CHEBYSHEV=2} SmoothType;
+
+} //namespace parAlmond
+
+#endif
\ No newline at end of file
diff --git a/libs/parAlmond/include/kernels.hpp b/libs/parAlmond/include/kernels.hpp
new file mode 100644
index 000000000..6840b53de
--- /dev/null
+++ b/libs/parAlmond/include/kernels.hpp
@@ -0,0 +1,64 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARALMOND_KERNELS_HPP
+#define PARALMOND_KERNELS_HPP
+
+namespace parAlmond { // kernel handles shared across parAlmond; declared extern here, defined elsewhere
+
+  void buildParAlmondKernels(MPI_Comm comm, occa::device device);
+
+  void freeParAlmondKernels();
+
+  extern int Nrefs; // NOTE(review): presumably a reference count guarding kernel build/free - confirm
+
+  extern occa::kernel haloExtractKernel;
+
+  extern occa::kernel SpMVcsrKernel1; // SpMV kernels, one pair per matrix format (csr, ell, mcsr)
+  extern occa::kernel SpMVcsrKernel2;
+  extern occa::kernel SpMVellKernel1;
+  extern occa::kernel SpMVellKernel2;
+  extern occa::kernel SpMVmcsrKernel1;
+  extern occa::kernel SpMVmcsrKernel2;
+
+  extern occa::kernel vectorSetKernel; // vector kernels
+  extern occa::kernel vectorScaleKernel;
+  extern occa::kernel vectorAddScalarKernel;
+  extern occa::kernel vectorAddKernel1;
+  extern occa::kernel vectorAddKernel2;
+  extern occa::kernel vectorDotStarKernel1;
+  extern occa::kernel vectorDotStarKernel2;
+  extern occa::kernel vectorInnerProdKernel;
+  extern occa::kernel vectorAddInnerProdKernel;
+  extern occa::kernel vectorAddWeightedInnerProdKernel;
+  extern occa::kernel kcycleCombinedOp1Kernel; // NOTE(review): presumably fused operations for the k-cycle - confirm
+  extern occa::kernel kcycleCombinedOp2Kernel;
+  extern occa::kernel kcycleWeightedCombinedOp1Kernel;
+  extern occa::kernel kcycleWeightedCombinedOp2Kernel;
+
+} //namespace parAlmond
+
+#endif
\ No newline at end of file
diff --git a/libs/parAlmond/include/level.hpp b/libs/parAlmond/include/level.hpp
new file mode 100644
index 000000000..6e375dc5b
--- /dev/null
+++ b/libs/parAlmond/include/level.hpp
@@ -0,0 +1,86 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARALMOND_LEVEL_HPP
+#define PARALMOND_LEVEL_HPP
+
+namespace parAlmond {
+
+class multigridLevel {
+  // Abstract interface for one multigrid level; each operation has a host-pointer and an occa::memory overload.
+public:
+  dlong Nrows, Ncols;
+  // work vectors x, rhs, res as raw host pointers, plus occa::memory handles with matching o_* names
+  dfloat *x=NULL;
+  dfloat *rhs=NULL;
+  dfloat *res=NULL;
+  occa::memory o_x, o_rhs, o_res;
+
+  //extra storage for kcycle
+  dfloat *ck=NULL;
+  dfloat *vk=NULL;
+  dfloat *wk=NULL;
+  occa::memory o_ck, o_vk, o_wk;
+
+  //switch for weighted inner products
+  bool weighted;
+  dfloat *weight=NULL;
+  occa::memory o_weight;
+
+  KrylovType ktype;
+  SmoothType stype;
+
+  MPI_Comm comm;
+
+  multigridLevel(dlong N, dlong M, KrylovType Ktype, MPI_Comm comm);
+  virtual ~multigridLevel(); // virtual so that deleting a derived level through a multigridLevel* is well defined
+  // pure virtual operations that every concrete level has to provide
+  virtual void Ax(dfloat        *x, dfloat        *Ax)=0;
+  virtual void Ax(occa::memory o_x, occa::memory o_Ax)=0;
+
+  virtual void smooth(dfloat        *rhs, dfloat        *x, bool x_is_zero)=0;
+  virtual void smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero)=0;
+
+  virtual void residual(dfloat        *rhs, dfloat        *x, dfloat        *res)=0;
+  virtual void residual(occa::memory o_rhs, occa::memory o_x, occa::memory o_res)=0;
+
+  virtual void coarsen(dfloat        *x, dfloat        *Cx)=0;
+  virtual void coarsen(occa::memory o_x, occa::memory o_Cx)=0;
+
+  virtual void prolongate(dfloat        *x, dfloat        *Px)=0;
+  virtual void prolongate(occa::memory o_x, occa::memory o_Px)=0;
+
+  virtual void Report()=0;
+  // non-virtual k-cycle helpers, in host and device_ variants
+  void kcycleOp1(dfloat *alpha1, dfloat *rho1, dfloat *norm_rhs, dfloat *norm_rhstilde);
+  void kcycleOp2(const dfloat alpha1, const dfloat rho1);
+  void device_kcycleOp1(dfloat *alpha1, dfloat *rho1, dfloat *norm_rhs, dfloat *norm_rhstilde);
+  void device_kcycleOp2(const dfloat alpha1, const dfloat rho1);
+};
+
+}
+
+#endif
\ No newline at end of file
diff --git a/libs/parAlmond/include/matrix.hpp b/libs/parAlmond/include/matrix.hpp
new file mode 100644
index 000000000..d09f62dc8
--- /dev/null
+++ b/libs/parAlmond/include/matrix.hpp
@@ -0,0 +1,244 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARALMOND_MATRIX_HPP
+#define PARALMOND_MATRIX_HPP
+
+
+namespace parAlmond {
+
+class matrix_t {
+  // Abstract matrix interface: holds the dimensions and declares four SpMV overloads (host pointers / occa::memory).
+public:
+  dlong Nrows;
+  dlong Ncols;
+
+  matrix_t(dlong N=0, dlong M=0);
+  // NOTE(review): no virtual destructor is declared -- confirm nothing deletes a derived matrix through a matrix_t*.
+  virtual void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, dfloat *y)=0;
+  virtual void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, const dfloat *y, dfloat *z)=0;
+  virtual void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, const occa::memory o_y)=0;
+  virtual void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, occa::memory o_y, occa::memory o_z)=0;
+};
+
+class CSR: public matrix_t {
+  // Matrix with rowStarts/cols/vals arrays; the same array names are the inputs of the kernels in okl/SpMVcsr.okl.
+public:
+  dlong nnz;
+  dlong  *rowStarts=NULL;
+  dlong  *cols=NULL;
+  dfloat *vals=NULL;
+  // occa::memory handles named after the host arrays above
+  occa::memory o_rowStarts;
+  occa::memory o_cols;
+  occa::memory o_vals;
+
+  CSR(dlong N=0, dlong M=0);
+  ~CSR();
+  // presumably y = alpha*A*x + beta*y (y in/out) and z = alpha*A*x + beta*y (separate z), as in the SpMVcsr kernels -- TODO confirm
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, dfloat *y);
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, const dfloat *y, dfloat *z);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, const occa::memory o_y);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, occa::memory o_y, occa::memory o_z);
+};
+
+class ELL: public matrix_t {
+  // Matrix with cols/vals arrays and a per-row slot count (nnzPerRow), the inputs of the kernels in okl/SpMVell.okl.
+public:
+  int   nnzPerRow;
+  dlong actualNNZ; // presumably the number of slots holding real entries -- TODO confirm
+
+  dlong  *cols=NULL;
+  dfloat *vals=NULL;
+  // occa::memory handles named after the host arrays above
+  occa::memory o_cols;
+  occa::memory o_vals;
+
+  ELL(dlong N=0, dlong M=0);
+  ~ELL();
+
+  void syncToDevice(occa::device device);
+  // same four SpMV overloads as declared in matrix_t
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, dfloat *y);
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, const dfloat *y, dfloat *z);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, const occa::memory o_y);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, occa::memory o_y, occa::memory o_z);
+};
+
+class MCSR: public matrix_t {
+  // Like CSR but with an extra 'rows' array; in okl/SpMVmcsr.okl, rows[n] is the output index written for stored row n.
+public:
+  dlong nnz;
+  dlong actualRows; // presumably the number of rows actually stored -- TODO confirm
+  dlong  *rowStarts=NULL;
+  dlong  *rows=NULL;
+  dlong  *cols=NULL;
+  dfloat *vals=NULL;
+  // occa::memory handles named after the host arrays above
+  occa::memory o_rowStarts;
+  occa::memory o_rows;
+  occa::memory o_cols;
+  occa::memory o_vals;
+
+  MCSR(dlong N=0, dlong M=0);
+  ~MCSR();
+
+  void syncToDevice(occa::device device);
+  // same four SpMV overloads as declared in matrix_t
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, dfloat *y);
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, const dfloat *y, dfloat *z);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, const occa::memory o_y);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, occa::memory o_y, occa::memory o_z);
+};
+
+class parCSR: public matrix_t {
+  // Distributed matrix held as two CSR blocks: 'diag' (local part) and 'offd' (non-local part).
+public:
+  //local
+  CSR *diag;
+
+  //non-local
+  CSR *offd;
+
+  dfloat *diagA=NULL;
+  dfloat *diagInv=NULL;
+
+  occa::memory o_diagA;
+  occa::memory o_diagInv;
+
+  bool nullSpace;
+  dfloat nullSpacePenalty;
+  dfloat *null=NULL;
+  occa::memory o_null;
+
+  //partition info
+  MPI_Comm comm;
+  hlong *globalRowStarts=NULL;
+  hlong *globalColStarts=NULL;
+  hlong *colMap=NULL;
+
+  ogs_t *ogs=NULL;
+  ogs_t *ogsHalo=NULL;
+  dlong Nhalo;
+  dlong Nshared;
+  dlong NlocalCols;
+
+  dlong *haloIds=NULL;
+  occa::memory o_haloIds;
+
+  occa::device device;
+
+
+  parCSR(dlong N=0, dlong M=0);
+  parCSR(dlong N, dlong M, MPI_Comm Comm, occa::device Device);
+
+  //build a parCSR matrix from a distributed COO matrix
+  parCSR(dlong N,         // number of rows on this rank
+         hlong* starts,   // global partitioning
+         dlong nnz,       // number of nonzeros on this rank
+         hlong *Ai,       // global row ids
+         hlong *Aj,       // global column ids
+         dfloat *Avals,    // values
+         bool NullSpace,          //switch for nullspace
+         dfloat *Null,            //null vector (or low energy mode)
+         dfloat NullSpacePenalty, //penalty parameter for rank boost
+         MPI_Comm Comm,
+         occa::device Device);
+
+  ~parCSR();
+  // halo exchange is split into a Start and a Finish call, for host pointers and for occa::memory
+  void haloSetup(hlong *colIds);
+  void haloExchangeStart (dfloat *x);
+  void haloExchangeFinish(dfloat *x);
+  void haloExchangeStart (occa::memory o_x);
+  void haloExchangeFinish(occa::memory o_x);
+
+  dfloat rhoDinvA(); // presumably an estimate of the spectral radius of D^-1 A -- TODO confirm
+
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, dfloat *y);
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, const dfloat *y, dfloat *z);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, const occa::memory o_y);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, occa::memory o_y, occa::memory o_z);
+};
+
+
+class parHYB: public matrix_t {
+  // Distributed matrix held as an ELL part (E) plus an MCSR part (C); it can be constructed from a parCSR.
+public:
+
+  ELL  *E;
+  MCSR *C;
+
+  dfloat *diagA=NULL;
+  dfloat *diagInv=NULL;
+
+  occa::memory o_diagA;
+  occa::memory o_diagInv;
+
+  bool nullSpace;
+  dfloat nullSpacePenalty;
+  dfloat *null=NULL;
+  occa::memory o_null;
+
+  //partition info
+  MPI_Comm comm;
+  hlong *globalRowStarts=NULL;
+  hlong *globalColStarts=NULL;
+  hlong *colMap=NULL;
+
+  ogs_t *ogs=NULL;
+  ogs_t *ogsHalo=NULL;
+  dlong Nhalo;
+  dlong Nshared;
+  dlong NlocalCols;
+
+  dlong *haloIds=NULL;
+  occa::memory o_haloIds;
+
+  occa::device device;
+
+  parHYB(dlong N=0, dlong M=0);
+  parHYB(parCSR *A); //build from parCSR
+
+  ~parHYB();
+  // halo exchange is split into a Start and a Finish call, for host pointers and for occa::memory
+  void haloExchangeStart (dfloat *x);
+  void haloExchangeFinish(dfloat *x);
+  void haloExchangeStart (occa::memory o_x);
+  void haloExchangeFinish(occa::memory o_x);
+
+  void syncToDevice();
+
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, dfloat *y);
+  void SpMV(const dfloat alpha,        dfloat *x, const dfloat beta, const dfloat *y, dfloat *z);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, const occa::memory o_y);
+  void SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta, occa::memory o_y, occa::memory o_z);
+};
+
+
+} //namespace parAlmond
+
+#endif
\ No newline at end of file
diff --git a/solvers/parALMOND/okl/innerProduct.okl b/libs/parAlmond/include/solver.hpp
similarity index 55%
rename from solvers/parALMOND/okl/innerProduct.okl
rename to libs/parAlmond/include/solver.hpp
index 817e87b0b..fa3d6b9cd 100644
--- a/solvers/parALMOND/okl/innerProduct.okl
+++ b/libs/parAlmond/include/solver.hpp
@@ -24,34 +24,53 @@ SOFTWARE.
 
 */
 
-  
-// ip = x.y 
-@kernel void innerProductKernel(const dlong Nblocks,
-          const dlong   N,
-          @restrict const  dfloat * x,
-          @restrict const  dfloat * y,
-                @restrict dfloat * ip){
-
-  for(dlong b=0;b<Nblocks;++b;@outer(0)){
-
-    @shared volatile dfloat s_ip[p_RDIMY][p_RDIMX];
-    @shared volatile dfloat s_res[p_RDIMY];
-    @exclusive dfloat res;
-    
-    for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){
-      for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){
-        dlong i = tx + ty*p_RDIMX + b*p_RDIMX*p_RDIMY;
-        
-        res = 0;
-        while(i<N){ // scan through whole array
-          res += x[i]*y[i];
-          i += Nblocks*p_RDIMX*p_RDIMY; 
-        }
-      }
-    }
-    
-    twoPhaseReduction(res, s_ip, s_res, ip[b]);
-  }
-}
-
-    
+#ifndef PARALMOND_SOLVER_HPP
+#define PARALMOND_SOLVER_HPP
+
+namespace parAlmond {
+
+class solver_t {
+  // Top-level solver object: owns the array of multigrid levels and the coarse solver, and declares the cycle/Krylov drivers.
+public:
+  MPI_Comm comm;
+  int rank, size;
+
+  occa::device device;
+  setupAide options;
+
+  bool exact;
+  CycleType    ctype;
+  KrylovType   ktype;
+  SmoothType stype;
+
+  int numLevels;
+  int AMGstartLev, baseLevel;
+  multigridLevel **levels=NULL; // array of base-class pointers to the levels
+
+  coarseSolver *coarseLevel;
+
+  int ChebyshevIterations;
+
+  solver_t(occa::device otherdevice, MPI_Comm othercomm,
+                         setupAide otheroptions);
+
+  ~solver_t();
+
+  void AMGSetup(parCSR *A);
+
+  void Report();
+  // multigrid cycles starting at level k, in host and device_ variants
+  void kcycle(int k);
+  void vcycle(int k);
+  void device_kcycle(int k);
+  void device_vcycle(int k);
+  // Krylov drivers bounded by an iteration cap (maxIt) and a tolerance (tol), in host and device_ variants
+  void pcg(const int maxIt, const dfloat tol);
+  void pgmres(const int maxIt, const dfloat tol);
+  void device_pcg(const int maxIt, const dfloat tol);
+  void device_pgmres(const int maxIt, const dfloat tol);
+};
+
+} //namespace parAlmond
+
+#endif
\ No newline at end of file
diff --git a/libs/parAlmond/include/utils.hpp b/libs/parAlmond/include/utils.hpp
new file mode 100644
index 000000000..e1603d63d
--- /dev/null
+++ b/libs/parAlmond/include/utils.hpp
@@ -0,0 +1,105 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARALMOND_UTILS_HPP
+#define PARALMOND_UTILS_HPP
+
+namespace parAlmond {
+
+//scratch space: three buffers, each declared as a byte count, a host pointer and an occa::memory handle
+extern size_t scratchSpaceBytes;
+extern void *scratch;
+extern occa::memory o_scratch;
+
+extern size_t pinnedScratchSpaceBytes;
+extern void *pinnedScratch;
+extern occa::memory o_pinnedScratch;
+
+extern size_t reductionScratchBytes;
+extern void *reductionScratch;
+extern occa::memory o_reductionScratch;
+
+void allocateScratchSpace(size_t requiredBytes, occa::device device);
+void allocatePinnedScratchSpace(size_t requiredBytes, occa::device device);
+void freeScratchSpace();
+void freePinnedScratchSpace();
+// record pairing a local index with a global id, plus a 'newId' field
+typedef struct {
+
+  dlong localId;
+  hlong globalId;
+
+  dlong newId;
+
+} parallelId_t;
+
+// record tying a fine id to coarse ids, tagged with an origin rank and an owner rank
+typedef struct {
+
+  dlong fineId;
+  hlong coarseId;
+  hlong newCoarseId;
+
+  int originRank;
+  int ownerRank;
+
+} parallelAggregate_t;
+
+// one matrix entry stored as a (row, col, val) triple
+typedef struct {
+
+  hlong row;
+  hlong col;
+  dfloat val;
+
+} nonzero_t;
+
+// comparison callbacks with the qsort-style (const void*, const void*) -> int signature
+int CompareGlobalId(const void *a, const void *b);
+int CompareLocalId(const void *a, const void *b);
+
+int compareOwner(const void *a, const void *b);
+int compareAgg(const void *a, const void *b);
+int compareOrigin(const void *a, const void *b);
+
+int compareNonZeroByRow(const void *a, const void *b);
+
+bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong i);
+// C-linkage prototypes whose names match the LAPACK routines dgetrf, dgetri and dgeev (Fortran-style trailing underscore)
+extern "C"{
+  void dgetrf_(int* M, int *N, double* A, int* lda, int* IPIV, int* INFO);
+  void dgetri_(int* N, double* A, int* lda, int* IPIV, double* WORK, int* lwork, int* INFO);
+  void dgeev_(char *JOBVL, char *JOBVR, int *N, double *A, int *LDA, double *WR, double *WI,
+  double *VL, int *LDVL, double *VR, int *LDVR, double *WORK, int *LWORK, int *INFO );
+}
+// presumably fills WR/WI with the real and imaginary parts of the eigenvalues of A -- TODO confirm
+void eig(const int Nrows, double *A, double *WR, double *WI);
+// presumably inverts the N x N matrix A in place -- TODO confirm
+void matrixInverse(int N, dfloat *A);
+
+} //namespace parAlmond
+
+#endif
\ No newline at end of file
diff --git a/libs/parAlmond/include/vector.hpp b/libs/parAlmond/include/vector.hpp
new file mode 100644
index 000000000..872219b1d
--- /dev/null
+++ b/libs/parAlmond/include/vector.hpp
@@ -0,0 +1,133 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARALMOND_VECTOR_HPP
+#define PARALMOND_VECTOR_HPP
+
+namespace parAlmond {
+
+//------------------------------------------------------------------------
+//
+//  Host vector operations
+//  (operate on raw dfloat* arrays; routines taking an MPI_Comm presumably reduce across ranks -- TODO confirm)
+//------------------------------------------------------------------------
+
+void vectorSet(const dlong m, const dfloat alpha, dfloat *a);
+
+void vectorRandomize(const dlong m, dfloat *a);
+
+void vectorScale(const dlong m, const dfloat alpha, dfloat *a);
+
+void vectorAddScalar(const dlong m, const dfloat alpha, dfloat *a);
+
+// y = beta*y + alpha*x
+void vectorAdd(const dlong n, const dfloat alpha, const dfloat *x,
+               const dfloat beta, dfloat *y);
+
+// z = beta*y + alpha*x
+void vectorAdd(const dlong n, const dfloat alpha, const dfloat *x,
+               const dfloat beta, const dfloat *y, dfloat *z);
+
+// b = a*b
+void vectorDotStar(const dlong m, const dfloat *a, dfloat *b);
+
+// c = alpha*a*b + beta*c
+void vectorDotStar(const dlong m, const dfloat alpha, const dfloat *a,
+                   const dfloat *b, const dfloat beta,  dfloat *c);
+
+dfloat vectorNorm(const dlong n, const dfloat *a, MPI_Comm comm);
+
+dfloat vectorInnerProd(const dlong n, const dfloat *a, const dfloat *b,
+                       MPI_Comm comm);
+
+dfloat vectorMaxAbs(const dlong n, const dfloat *a, MPI_Comm comm);
+
+// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b.
+void kcycleCombinedOp1(const dlong n, dfloat *aDotbc, const dfloat *a,
+                      const dfloat *b, const dfloat *c, const dfloat* w,
+                      const bool weighted, MPI_Comm comm);
+
+// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d.
+void kcycleCombinedOp2(const dlong n, dfloat *aDotbcd, const dfloat *a,
+                       const dfloat *b, const dfloat *c, const dfloat* d,
+                       const dfloat *w, const bool weighted, MPI_Comm comm);
+
+// y = beta*y + alpha*x, and return y\dot y
+dfloat vectorAddInnerProd(const dlong n, const dfloat alpha, const dfloat *x,
+                          const dfloat beta, dfloat *y,
+                          const dfloat *w, const bool weighted, MPI_Comm comm);
+
+//------------------------------------------------------------------------
+//
+//  Device vector operations
+//  (occa::memory overloads of the host routines; the host formulas presumably carry over -- TODO confirm)
+//------------------------------------------------------------------------
+
+void vectorSet(const dlong N, const dfloat alpha, occa::memory o_a);
+
+void vectorRandomize(const dlong m, occa::memory o_a);
+
+void vectorScale(const dlong N, const dfloat alpha, occa::memory o_a);
+
+void vectorAddScalar(const dlong N, const dfloat alpha, occa::memory o_a);
+
+void vectorAdd(const dlong N, const dfloat alpha, occa::memory o_x,
+               const dfloat beta, occa::memory o_y);
+
+void vectorAdd(const dlong N, const dfloat alpha, occa::memory o_x,
+               const dfloat beta, occa::memory o_y, occa::memory o_z);
+
+void vectorDotStar(const dlong N, occa::memory o_a, occa::memory o_b);
+
+void vectorDotStar(const dlong N, const dfloat alpha, occa::memory o_a,
+                   occa::memory o_b, const dfloat beta, occa::memory o_c);
+
+dfloat vectorNorm(const dlong n, occa::memory o_a, MPI_Comm comm);
+
+dfloat vectorInnerProd(const dlong N, occa::memory o_x, occa::memory o_y,
+                       MPI_Comm comm);
+
+dfloat vectorMaxAbs(const dlong n, occa::memory o_a, MPI_Comm comm);
+
+// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b.
+void kcycleCombinedOp1(const dlong N, dfloat *aDotbc, occa::memory o_a,
+                       occa::memory o_b, occa::memory o_c, occa::memory o_w,
+                       const bool weighted, MPI_Comm comm);
+
+// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d.
+void kcycleCombinedOp2(const dlong N, dfloat *aDotbcd,
+                        occa::memory o_a, occa::memory o_b,
+                        occa::memory o_c, occa::memory o_d,
+                        occa::memory o_w, const bool weighted, MPI_Comm comm);
+
+// y = beta*y + alpha*x, and return y\dot y
+dfloat vectorAddInnerProd(const dlong N, const dfloat alpha, occa::memory o_x,
+                          const dfloat beta, occa::memory o_y,
+                          occa::memory o_w, const bool weighted, MPI_Comm comm);
+
+} //namespace parAlmond
+
+#endif
\ No newline at end of file
diff --git a/libs/parAlmond/makefile b/libs/parAlmond/makefile
new file mode 100644
index 000000000..77590446a
--- /dev/null
+++ b/libs/parAlmond/makefile
@@ -0,0 +1,75 @@
+ifndef OCCA_DIR
+ERROR:
+	@echo "Error, environment variable [OCCA_DIR] is not set"
+endif
+
+CXXFLAGS =
+
+include ${OCCA_DIR}/scripts/Makefile
+
+# define variables
+HDRDIR = ../../
+OGSDIR  = ../gatherScatter
+GSDIR  = ../../3rdParty/gslib
+
+# set options for this machine
+# specify which compilers to use for c, c++ and linking
+CC	= mpicc
+CXX	= mpic++
+LD	= mpic++
+
+# compiler flags to be used (set to compile with debugging on)
+CFLAGS = -I. -I./include/ $(compilerFlags) $(flags) -I$(HDRDIR)/include/ -I$(OGSDIR) -g -D DPARALMOND='"${CURDIR}"'
+
+# link flags to be used
+LDFLAGS	= $(compilerFlags) $(flags) -g
+
+# libraries to be linked in
+LIBS	=   -L$(OCCA_DIR)/lib  $(links) -L$(OGSDIR) -logs -L$(GSDIR)/lib -lgs \
+						$(links) -L../../3rdParty/BlasLapack -lBlasLapack -lgfortran
+
+INCLUDES = $(includes)
+DEPS = $(INCLUDES) \
+$(OGSDIR)/ogs.hpp \
+$(HDRDIR)/include/types.h
+
+# types of files we are going to construct rules for
+.SUFFIXES: .c .cpp
+
+# rule for .cpp files: a pattern rule, because prerequisites listed on a suffix rule are not applied to the objects
+%.o: %.cpp $(DEPS)
+	$(CXX) $(CFLAGS) -o $@ -c $< $(paths)
+
+# list of objects to be compiled
+OBJS = \
+./src/agmgLevel.o \
+./src/agmgSmoother.o \
+./src/coarseSolver.o \
+./src/kernels.o \
+./src/level.o \
+./src/matrix.o \
+./src/multigrid.o \
+./src/parAlmond.o \
+./src/pcg.o \
+./src/pgmres.o \
+./src/solver.o \
+./src/SpMV.o \
+./src/utils.o \
+./src/vector.o \
+./src/agmgSetup/agmgSetup.o \
+./src/agmgSetup/constructProlongation.o \
+./src/agmgSetup/formAggregates.o \
+./src/agmgSetup/galerkinProd.o \
+./src/agmgSetup/strongGraph.o \
+./src/agmgSetup/transpose.o \
+
+
+all: lib
+
+lib: $(OBJS) $(DEPS)
+	ar -cr libparAlmond.a $(OBJS)
+
+clean:
+	rm -f libparAlmond.a
+	rm -f ./src/*.o
+	rm -f ./src/agmgSetup/*.o
diff --git a/libs/parAlmond/okl/SpMVcsr.okl b/libs/parAlmond/okl/SpMVcsr.okl
new file mode 100644
index 000000000..f569b47da
--- /dev/null
+++ b/libs/parAlmond/okl/SpMVcsr.okl
@@ -0,0 +1,77 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+@kernel void SpMVcsr1(const dlong   Nrows,
+                      const dfloat  alpha,
+                      const dfloat  beta,
+                      @restrict const  dlong  * rowStarts,
+                      @restrict const  dlong  * cols,
+                      @restrict const  dfloat * vals,
+                      @restrict const  dfloat * x,
+                      @restrict        dfloat * y){
+
+  // in-place CSR product: y <- alpha*(A*x) + beta*y, one matrix row per loop iteration
+  for(dlong n=0;n<Nrows;++n;@tile(p_BLOCKSIZE,@outer,@inner)){
+    // the old y[n] is only read when beta is nonzero
+    dfloat scaledY = 0.;
+    if (beta)
+      scaledY = beta*y[n];
+
+    // sum vals[k]*x[cols[k]] over the entries k in [rowStarts[n], rowStarts[n+1])
+    dfloat rowSum = 0.;
+    const dlong rowEnd = rowStarts[n+1];
+
+    for(dlong k=rowStarts[n]; k<rowEnd; ++k){
+      rowSum += vals[k]*x[cols[k]];
+    }
+
+    y[n] = alpha*rowSum + scaledY;
+  }
+}
+
+@kernel void SpMVcsr2(const dlong  Nrows,
+                      const dfloat alpha,
+                      const dfloat beta,
+                      @restrict const  dlong  * rowStarts,
+                      @restrict const  dlong  * cols,
+                      @restrict const  dfloat * vals,
+                      @restrict const  dfloat * x,
+                      @restrict const  dfloat * y,
+                      @restrict        dfloat * z){
+
+  // out-of-place CSR product: z <- alpha*(A*x) + beta*y; y[n] is read for every row
+  for(dlong n=0;n<Nrows;++n;@tile(p_BLOCKSIZE,@outer,@inner)){
+    const dlong rowEnd = rowStarts[n+1];
+    dfloat rowSum = 0.;
+
+    // walk the stored entries of row n
+    for(dlong k=rowStarts[n]; k<rowEnd; ++k){
+      rowSum += vals[k]*x[cols[k]];
+    }
+
+    z[n] = alpha*rowSum + beta*y[n];
+  }
+}
diff --git a/libs/parAlmond/okl/SpMVell.okl b/libs/parAlmond/okl/SpMVell.okl
new file mode 100644
index 000000000..fc42821e6
--- /dev/null
+++ b/libs/parAlmond/okl/SpMVell.okl
@@ -0,0 +1,78 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+@kernel void SpMVell1(const dlong   Nrows,
+                      const int     nnzPerRow,
+                      const dfloat  alpha,
+                      const dfloat  beta,
+                      @restrict const  dlong  * cols,
+                      @restrict const  dfloat * vals,
+                      @restrict const  dfloat * x,
+                      @restrict        dfloat * y){
+
+  // in-place ELL product: y <- alpha*(A*x) + beta*y, one matrix row per loop iteration
+  for(dlong i=0;i<Nrows;++i;@tile(p_BLOCKSIZE,@outer,@inner)){
+    // the old y[i] is only read when beta is nonzero
+    dfloat scaledY = 0.;
+    if (beta)
+      scaledY = beta*y[i];
+    dfloat rowSum = 0.;
+    for(int slot=0; slot<nnzPerRow; ++slot){
+      // entry 'slot' of row i is stored at offset i + slot*Nrows
+      const dlong idx = i+slot*Nrows;
+      const dlong col = cols[idx];
+      // a negative column id marks a slot to skip: vals and x are not read for it
+      if (col >= 0)
+        rowSum += vals[idx]*x[col];
+    }
+    y[i] = alpha*rowSum + scaledY;
+  }
+}
+
+@kernel void SpMVell2(const dlong  Nrows,
+                      const int    nnzPerRow,
+                      const dfloat alpha,
+                      const dfloat beta,
+                      @restrict const  dlong  * cols,
+                      @restrict const  dfloat * vals,
+                      @restrict const  dfloat * x,
+                      @restrict const  dfloat * y,
+                      @restrict        dfloat * z){
+
+  // out-of-place ELL product: z <- alpha*(A*x) + beta*y; y[i] is read for every row
+  for(dlong i=0;i<Nrows;++i;@tile(p_BLOCKSIZE,@outer,@inner)){
+    dfloat rowSum = 0.;
+    for(int slot=0; slot<nnzPerRow; ++slot){
+      // entry 'slot' of row i is stored at offset i + slot*Nrows
+      const dlong idx = i+slot*Nrows;
+      const dlong col = cols[idx];
+      // a negative column id marks a slot to skip: vals and x are not read for it
+      if (col >= 0)
+        rowSum += vals[idx]*x[col];
+    }
+    z[i] = alpha*rowSum + beta*y[i];
+  }
+}
diff --git a/libs/parAlmond/okl/SpMVmcsr.okl b/libs/parAlmond/okl/SpMVmcsr.okl
new file mode 100644
index 000000000..f75f08878
--- /dev/null
+++ b/libs/parAlmond/okl/SpMVmcsr.okl
@@ -0,0 +1,83 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+@kernel void SpMVmcsr1(const dlong   Nrows,
+                       const dfloat  alpha,
+                       const dfloat  beta,
+                       @restrict const  dlong  * rowStarts,
+                       @restrict const  dlong  * rows,
+                       @restrict const  dlong  * cols,
+                       @restrict const  dfloat * vals,
+                       @restrict const  dfloat * x,
+                       @restrict        dfloat * y){
+
+  // in-place product over the stored rows: y[rows[n]] <- alpha*(row n of A)*x + beta*y[rows[n]]
+  for(dlong n=0;n<Nrows;++n;@tile(p_BLOCKSIZE,@outer,@inner)){
+    // stored row n reads and writes entry rows[n] of y
+    const dlong outRow = rows[n];
+
+    // the old y value is only read when beta is nonzero
+    dfloat scaledY = 0.;
+    if (beta)
+      scaledY = beta*y[outRow];
+
+    dfloat rowSum = 0.;
+    const dlong rowEnd = rowStarts[n+1];
+
+    for(dlong k=rowStarts[n]; k<rowEnd; ++k){
+      rowSum += vals[k]*x[cols[k]];
+    }
+
+    y[outRow] = alpha*rowSum + scaledY;
+  }
+}
+
+@kernel void SpMVmcsr2(const dlong  Nrows,
+                       const dfloat alpha,
+                       const dfloat beta,
+                       @restrict const  dlong  * rowStarts,
+                       @restrict const  dlong  * rows,
+                       @restrict const  dlong  * cols,
+                       @restrict const  dfloat * vals,
+                       @restrict const  dfloat * x,
+                       @restrict const  dfloat * y,
+                       @restrict        dfloat * z){
+
+  // out-of-place product over the stored rows: z[rows[n]] <- alpha*(row n of A)*x + beta*y[rows[n]]
+  for(dlong n=0;n<Nrows;++n;@tile(p_BLOCKSIZE,@outer,@inner)){
+    const dlong outRow = rows[n];
+    const dlong rowEnd = rowStarts[n+1];
+
+    // dot product of stored row n with x
+    dfloat rowSum = 0.;
+    for(dlong k=rowStarts[n]; k<rowEnd; ++k){
+      rowSum += vals[k]*x[cols[k]];
+    }
+
+    // y is read at the same mapped index that z is written
+    z[outRow] = alpha*rowSum + beta*y[outRow];
+  }
+}
diff --git a/libs/parAlmond/okl/haloExtract.okl b/libs/parAlmond/okl/haloExtract.okl
new file mode 100644
index 000000000..b41f778e4
--- /dev/null
+++ b/libs/parAlmond/okl/haloExtract.okl
@@ -0,0 +1,37 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+
+// Gather: haloq[k] = q[haloIds[k]] for k in [0, Nhalo).
+@kernel void haloExtract(const dlong Nhalo,
+                         @restrict const dlong  * haloIds,
+                         @restrict const dfloat * q,
+                         @restrict       dfloat * haloq){
+  for(dlong k=0;k<Nhalo;++k;@tile(p_BLOCKSIZE,@outer,@inner)){
+    // one independent copy per halo entry
+    haloq[k] = q[haloIds[k]];
+  }
+}
diff --git a/libs/parAlmond/okl/kcycleCombinedOp.okl b/libs/parAlmond/okl/kcycleCombinedOp.okl
new file mode 100644
index 000000000..1ad1dd415
--- /dev/null
+++ b/libs/parAlmond/okl/kcycleCombinedOp.okl
@@ -0,0 +1,535 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+
+// a.b, a.c, b.b
+@kernel void kcycleCombinedOp1(const dlong Nblocks,
+                               const dlong N,
+                               @restrict const dfloat * a,
+                               @restrict const dfloat * b,
+                               @restrict const dfloat * c,
+                               @restrict       dfloat * ips){
+  // Outer iteration n leaves three partial sums in ips[3*n+0], ips[3*n+1], ips[3*n+2].
+  for(dlong n=0;n<Nblocks;++n;@outer(0)){
+    // s_ip is used as 3 segments of p_BLOCKSIZE partial sums: [0] a.b, [1] a.c, [2] b.b
+    @shared volatile dfloat s_ip[3*p_BLOCKSIZE];
+    // Accumulation: inner index t starts at entry t + n*p_BLOCKSIZE and steps by p_BLOCKSIZE*Nblocks.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)){
+      dlong id = t + n*p_BLOCKSIZE;
+      s_ip[0*p_BLOCKSIZE+t] = 0.0;
+      s_ip[1*p_BLOCKSIZE+t] = 0.0;
+      s_ip[2*p_BLOCKSIZE+t] = 0.0;
+      while (id<N) {
+        const dfloat ai =  a[id];
+        const dfloat bi =  b[id];
+        const dfloat ci =  c[id];
+
+        s_ip[0*p_BLOCKSIZE+t] += ai*bi;
+        s_ip[1*p_BLOCKSIZE+t] += ai*ci;
+        s_ip[2*p_BLOCKSIZE+t] += bi*bi;
+        id += p_BLOCKSIZE*Nblocks;
+      }
+    }
+    @barrier("local");
+    // Tree reduction: each step folds the upper half of the active range of every segment onto the lower half.
+#if p_BLOCKSIZE>512
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<512) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+512];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+512];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+512];
+      }
+    }
+    @barrier("local");
+#endif
+
+#if p_BLOCKSIZE>256
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<256) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+256];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+256];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256];
+      }
+    }
+    @barrier("local");
+#endif
+    // NOTE(review): this step is not wrapped in #if and reads offset t+128 of each segment; looks like p_BLOCKSIZE >= 256 is assumed -- confirm.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<128) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+128];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+128];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+128];
+      }
+    }
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<64) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+64];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+64];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+64];
+      }
+    }
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<32) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+32];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+32];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+32];
+      }
+    }
+    @barrier("local");
+    // NOTE(review): from here on the barriers are commented out; all accesses stay within the first 32 slots of a segment. Presumably this relies on those inner iterations executing in lockstep -- confirm per backend.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<16) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+16];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+16];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+16];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<8) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+8];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+8];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+8];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<4) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+4];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+4];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+4];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<2) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+2];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+2];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+2];
+      }
+    }
+    //    @barrier("local");
+    // Last step: inner index 0 adds the two remaining partials of each segment and writes the result for outer iteration n.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<1) {
+        ips[3*n+0] = s_ip[0*p_BLOCKSIZE+0] + s_ip[0*p_BLOCKSIZE+t+1];
+        ips[3*n+1] = s_ip[1*p_BLOCKSIZE+0] + s_ip[1*p_BLOCKSIZE+t+1];
+        ips[3*n+2] = s_ip[2*p_BLOCKSIZE+0] + s_ip[2*p_BLOCKSIZE+t+1];
+      }
+    }
+  }
+}
+
+// a.b, a.c, a.d
+@kernel void kcycleCombinedOp2(const dlong Nblocks,
+                               const dlong N,
+                               @restrict const dfloat * a,
+                               @restrict const dfloat * b,
+                               @restrict const dfloat * c,
+                               @restrict const dfloat * d,
+                               @restrict       dfloat * ips){
+  // Outer iteration n leaves three partial sums in ips[3*n+0], ips[3*n+1], ips[3*n+2].
+  for(dlong n=0;n<Nblocks;++n;@outer(0)){
+    // s_ip is used as 3 segments of p_BLOCKSIZE partial sums: [0] a.b, [1] a.c, [2] a.d
+    @shared volatile dfloat s_ip[3*p_BLOCKSIZE];
+    // Accumulation: inner index t starts at entry t + n*p_BLOCKSIZE and steps by p_BLOCKSIZE*Nblocks.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)){
+      dlong id = t + n*p_BLOCKSIZE;
+      s_ip[0*p_BLOCKSIZE+t] = 0.0;
+      s_ip[1*p_BLOCKSIZE+t] = 0.0;
+      s_ip[2*p_BLOCKSIZE+t] = 0.0;
+      while (id<N) {
+        const dfloat ai =  a[id];
+        const dfloat bi =  b[id];
+        const dfloat ci =  c[id];
+        const dfloat di =  d[id];
+
+        s_ip[0*p_BLOCKSIZE+t] += ai*bi;
+        s_ip[1*p_BLOCKSIZE+t] += ai*ci;
+        s_ip[2*p_BLOCKSIZE+t] += ai*di;
+        id += p_BLOCKSIZE*Nblocks;
+      }
+    }
+    @barrier("local");
+    // Tree reduction: each step folds the upper half of the active range of every segment onto the lower half.
+    #if p_BLOCKSIZE>512
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<512) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+512];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+512];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+512];
+      }
+    }
+    @barrier("local");
+#endif
+
+#if p_BLOCKSIZE>256
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<256) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+256];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+256];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256];
+      }
+    }
+    @barrier("local");
+#endif
+    // NOTE(review): this step is not wrapped in #if and reads offset t+128 of each segment; looks like p_BLOCKSIZE >= 256 is assumed -- confirm.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<128) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+128];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+128];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+128];
+      }
+    }
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<64) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+64];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+64];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+64];
+      }
+    }
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<32) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+32];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+32];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+32];
+      }
+    }
+    @barrier("local");
+    // NOTE(review): from here on the barriers are commented out; all accesses stay within the first 32 slots of a segment. Presumably this relies on those inner iterations executing in lockstep -- confirm per backend.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<16) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+16];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+16];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+16];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<8) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+8];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+8];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+8];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<4) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+4];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+4];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+4];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<2) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+2];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+2];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+2];
+      }
+    }
+    //    @barrier("local");
+    // Last step: inner index 0 adds the two remaining partials of each segment and writes the result for outer iteration n.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<1) {
+        ips[3*n+0] = s_ip[0*p_BLOCKSIZE+0] + s_ip[0*p_BLOCKSIZE+t+1];
+        ips[3*n+1] = s_ip[1*p_BLOCKSIZE+0] + s_ip[1*p_BLOCKSIZE+t+1];
+        ips[3*n+2] = s_ip[2*p_BLOCKSIZE+0] + s_ip[2*p_BLOCKSIZE+t+1];
+      }
+    }
+  }
+}
+
+
+// w.a.b, w.a.c, w.b.b
+@kernel void kcycleWeightedCombinedOp1(const dlong Nblocks,
+                                       const dlong N,
+                                       @restrict const dfloat * a,
+                                       @restrict const dfloat * b,
+                                       @restrict const dfloat * c,
+                                       @restrict const dfloat * w,
+                                       @restrict       dfloat * ips){
+  // Outer iteration n leaves three partial sums in ips[3*n+0], ips[3*n+1], ips[3*n+2].
+  for(dlong n=0;n<Nblocks;++n;@outer(0)){
+    // s_ip is used as 3 segments of p_BLOCKSIZE partial sums: [0] sum w*a*b, [1] sum w*a*c, [2] sum w*b*b
+    @shared volatile dfloat s_ip[3*p_BLOCKSIZE];
+    // Accumulation: inner index t starts at entry t + n*p_BLOCKSIZE and steps by p_BLOCKSIZE*Nblocks.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)){
+      dlong id = t + n*p_BLOCKSIZE;
+      s_ip[0*p_BLOCKSIZE+t] = 0.0;
+      s_ip[1*p_BLOCKSIZE+t] = 0.0;
+      s_ip[2*p_BLOCKSIZE+t] = 0.0;
+      while (id<N) {
+        const dfloat ai =  a[id];
+        const dfloat bi =  b[id];
+        const dfloat ci =  c[id];
+        const dfloat wi =  w[id];
+
+        s_ip[0*p_BLOCKSIZE+t] += wi*ai*bi;
+        s_ip[1*p_BLOCKSIZE+t] += wi*ai*ci;
+        s_ip[2*p_BLOCKSIZE+t] += wi*bi*bi;
+        id += p_BLOCKSIZE*Nblocks;
+      }
+    }
+    @barrier("local");
+    // Tree reduction: each step folds the upper half of the active range of every segment onto the lower half.
+#if p_BLOCKSIZE>512
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<512) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+512];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+512];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+512];
+      }
+    }
+    @barrier("local");
+#endif
+
+#if p_BLOCKSIZE>256
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<256) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+256];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+256];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256];
+      }
+    }
+    @barrier("local");
+#endif
+    // NOTE(review): this step is not wrapped in #if and reads offset t+128 of each segment; looks like p_BLOCKSIZE >= 256 is assumed -- confirm.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<128) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+128];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+128];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+128];
+      }
+    }
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<64) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+64];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+64];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+64];
+      }
+    }
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<32) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+32];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+32];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+32];
+      }
+    }
+    @barrier("local");
+    // NOTE(review): from here on the barriers are commented out; all accesses stay within the first 32 slots of a segment. Presumably this relies on those inner iterations executing in lockstep -- confirm per backend.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<16) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+16];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+16];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+16];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<8) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+8];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+8];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+8];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<4) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+4];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+4];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+4];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<2) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+2];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+2];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+2];
+      }
+    }
+    //    @barrier("local");
+    // Last step: inner index 0 adds the two remaining partials of each segment and writes the result for outer iteration n.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<1) {
+        ips[3*n+0] = s_ip[0*p_BLOCKSIZE+0] + s_ip[0*p_BLOCKSIZE+t+1];
+        ips[3*n+1] = s_ip[1*p_BLOCKSIZE+0] + s_ip[1*p_BLOCKSIZE+t+1];
+        ips[3*n+2] = s_ip[2*p_BLOCKSIZE+0] + s_ip[2*p_BLOCKSIZE+t+1];
+      }
+    }
+  }
+}
+
+// w.a.b, w.a.c, w.a.d
+@kernel void kcycleWeightedCombinedOp2(const dlong Nblocks,
+                                       const dlong N,
+                                       @restrict const dfloat * a,
+                                       @restrict const dfloat * b,
+                                       @restrict const dfloat * c,
+                                       @restrict const dfloat * d,
+                                       @restrict const dfloat * w,
+                                       @restrict       dfloat * ips){
+  // Outer iteration n leaves three partial sums in ips[3*n+0], ips[3*n+1], ips[3*n+2].
+  for(dlong n=0;n<Nblocks;++n;@outer(0)){
+    // s_ip is used as 3 segments of p_BLOCKSIZE partial sums: [0] sum w*a*b, [1] sum w*a*c, [2] sum w*a*d
+    @shared volatile dfloat s_ip[3*p_BLOCKSIZE];
+    // Accumulation: inner index t starts at entry t + n*p_BLOCKSIZE and steps by p_BLOCKSIZE*Nblocks.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)){
+      dlong id = t + n*p_BLOCKSIZE;
+      s_ip[0*p_BLOCKSIZE+t] = 0.0;
+      s_ip[1*p_BLOCKSIZE+t] = 0.0;
+      s_ip[2*p_BLOCKSIZE+t] = 0.0;
+      while (id<N) {
+        const dfloat ai =  a[id];
+        const dfloat bi =  b[id];
+        const dfloat ci =  c[id];
+        const dfloat di =  d[id];
+        const dfloat wi =  w[id];
+
+        s_ip[0*p_BLOCKSIZE+t] += wi*ai*bi;
+        s_ip[1*p_BLOCKSIZE+t] += wi*ai*ci;
+        s_ip[2*p_BLOCKSIZE+t] += wi*ai*di;
+        id += p_BLOCKSIZE*Nblocks;
+      }
+    }
+    @barrier("local");
+    // Tree reduction: each step folds the upper half of the active range of every segment onto the lower half.
+    #if p_BLOCKSIZE>512
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<512) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+512];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+512];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+512];
+      }
+    }
+    @barrier("local");
+#endif
+
+#if p_BLOCKSIZE>256
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<256) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+256];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+256];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+256];
+      }
+    }
+    @barrier("local");
+#endif
+    // NOTE(review): this step is not wrapped in #if and reads offset t+128 of each segment; looks like p_BLOCKSIZE >= 256 is assumed -- confirm.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<128) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+128];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+128];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+128];
+      }
+    }
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<64) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+64];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+64];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+64];
+      }
+    }
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<32) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+32];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+32];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+32];
+      }
+    }
+    @barrier("local");
+    // NOTE(review): from here on the barriers are commented out; all accesses stay within the first 32 slots of a segment. Presumably this relies on those inner iterations executing in lockstep -- confirm per backend.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<16) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+16];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+16];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+16];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<8) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+8];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+8];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+8];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<4) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+4];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+4];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+4];
+      }
+    }
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<2) {
+        s_ip[0*p_BLOCKSIZE+t] += s_ip[0*p_BLOCKSIZE+t+2];
+        s_ip[1*p_BLOCKSIZE+t] += s_ip[1*p_BLOCKSIZE+t+2];
+        s_ip[2*p_BLOCKSIZE+t] += s_ip[2*p_BLOCKSIZE+t+2];
+      }
+    }
+    //    @barrier("local");
+    // Last step: inner index 0 adds the two remaining partials of each segment and writes the result for outer iteration n.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) {
+      if(t<1) {
+        ips[3*n+0] = s_ip[0*p_BLOCKSIZE+0] + s_ip[0*p_BLOCKSIZE+t+1];
+        ips[3*n+1] = s_ip[1*p_BLOCKSIZE+0] + s_ip[1*p_BLOCKSIZE+t+1];
+        ips[3*n+2] = s_ip[2*p_BLOCKSIZE+0] + s_ip[2*p_BLOCKSIZE+t+1];
+      }
+    }
+  }
+}
diff --git a/solvers/parALMOND/okl/vectorAdd.okl b/libs/parAlmond/okl/vectorAdd.okl
similarity index 61%
rename from solvers/parALMOND/okl/vectorAdd.okl
rename to libs/parAlmond/okl/vectorAdd.okl
index f2c8d9bcc..0b3b9211f 100644
--- a/solvers/parALMOND/okl/vectorAdd.okl
+++ b/libs/parAlmond/okl/vectorAdd.okl
@@ -25,28 +25,30 @@ SOFTWARE.
 */
 
 // y = beta*y + alpha*x
-@kernel void vectorAddKernel(const dlong   N,
-				  const dfloat   alpha,
-				  const dfloat   beta,
-				  @restrict const  dfloat * x,
-				        @restrict dfloat * y){
-
-  for(dlong i=0;i<N;++i;@tile(256,@outer,@inner)){
-    if(i<N)
-      y[i] = beta*y[i] + alpha*x[i];
+@kernel void vectorAdd1(const dlong   N,
+                        const dfloat  alpha,
+                        const dfloat  beta,
+                        @restrict const dfloat * x,
+                        @restrict       dfloat * y){
+
+  for(dlong i=0;i<N;++i;@tile(p_BLOCKSIZE,@outer,@inner)){
+    // with beta == 0 the old y[i] is never read
+    dfloat scaledY;
+    if (beta) scaledY = beta*y[i];
+    else      scaledY = 0.0;
+    y[i] = scaledY + alpha*x[i];
   }
 }
 
 // z = alpha*x + beta*y
-@kernel void vectorAddKernel2(const dlong   N,
-				   const dfloat   alpha,
-				   const dfloat   beta,
-				   @restrict const  dfloat * x,
-				   @restrict const  dfloat * y,
-				         @restrict dfloat * z){
-
-  for(dlong i=0;i<N;++i;@tile(256,@outer,@inner)){
-    if(i<N)
-      z[i] = beta*y[i] + alpha*x[i];
+@kernel void vectorAdd2(const dlong  N,
+                        const dfloat alpha,
+                        const dfloat beta,
+                        @restrict const dfloat * x,
+                        @restrict const dfloat * y,
+                        @restrict       dfloat * z){
+  // elementwise combination written to z; x and y are only read
+  for(dlong k=0;k<N;++k;@tile(p_BLOCKSIZE,@outer,@inner)){
+    z[k] = beta*y[k] + alpha*x[k];
   }
 }
diff --git a/libs/parAlmond/okl/vectorAddInnerProd.okl b/libs/parAlmond/okl/vectorAddInnerProd.okl
new file mode 100644
index 000000000..4ae7b72e3
--- /dev/null
+++ b/libs/parAlmond/okl/vectorAddInnerProd.okl
@@ -0,0 +1,164 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+
+// y = beta*y + alpha*x
+// ip = y.y
+@kernel void vectorAddInnerProd(const dlong Nblocks,
+                                const dlong N,
+                                const dfloat alpha,
+                                const dfloat beta,
+                                @restrict const dfloat * x,
+                                @restrict       dfloat * y,
+                                @restrict       dfloat * ip){
+  // Fused update of y plus sum of squares of the updated entries; outer iteration b writes its partial sum to ip[b].
+  for(dlong b=0;b<Nblocks;++b;@outer(0)){
+
+    @shared volatile dfloat s_ip[p_BLOCKSIZE];
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)){
+      dlong id = t + b*p_BLOCKSIZE;
+      // inner index t handles entries id, id + p_BLOCKSIZE*Nblocks, ... while id < N
+      s_ip[t] = 0.0;
+      while (id<N) {
+        dfloat yi;
+        if (beta) yi = y[id];
+        else      yi = 0.0;
+        // y[id] is read only for nonzero beta (yi is 0.0 otherwise); r is the updated entry
+        const dfloat r = beta*yi + alpha*x[id];
+
+        y[id] = r;
+
+        s_ip[t] += (r*r);
+        id += p_BLOCKSIZE*Nblocks;
+      }
+    }
+    @barrier("local");
+    // Tree reduction: each step folds the upper half of the active range of s_ip onto the lower half.
+#if p_BLOCKSIZE>512
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<512) s_ip[t] += s_ip[t+512];
+    @barrier("local");
+#endif
+
+#if p_BLOCKSIZE>256
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<256) s_ip[t] += s_ip[t+256];
+    @barrier("local");
+#endif
+    // NOTE(review): this step is not wrapped in #if and reads s_ip[t+128]; looks like p_BLOCKSIZE >= 256 is assumed -- confirm.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<128) s_ip[t] += s_ip[t+128];
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 64) s_ip[t] += s_ip[t+ 64];
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 32) s_ip[t] += s_ip[t+ 32];
+    @barrier("local");
+    // NOTE(review): from here on the barriers are commented out; all accesses stay within s_ip[0..31]. Presumably this relies on those inner iterations executing in lockstep -- confirm per backend.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 16) s_ip[t] += s_ip[t+ 16];
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  8) s_ip[t] += s_ip[t+  8];
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  4) s_ip[t] += s_ip[t+  4];
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  2) s_ip[t] += s_ip[t+  2];
+    //    @barrier("local");
+    // Last step: inner index 0 adds the two remaining partials and stores the result for outer iteration b.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  1) ip[b] = s_ip[0] + s_ip[1];
+  }
+}
+
+// y = beta*y + alpha*x
+// ip = w.y.y
+@kernel void vectorAddWeightedInnerProd(const dlong Nblocks,
+                                        const dlong N,
+                                        const dfloat alpha,
+                                        const dfloat beta,
+                                        @restrict const dfloat * x,
+                                        @restrict       dfloat * y,
+                                        @restrict const dfloat * w,
+                                        @restrict       dfloat * ip){
+  // Fused update of y plus w-weighted sum of squares of the updated entries; outer iteration b writes its partial sum to ip[b].
+  for(dlong b=0;b<Nblocks;++b;@outer(0)){
+
+    @shared volatile dfloat s_ip[p_BLOCKSIZE];
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)){
+      dlong id = t + b*p_BLOCKSIZE;
+      // inner index t handles entries id, id + p_BLOCKSIZE*Nblocks, ... while id < N
+      s_ip[t] = 0.0;
+      while (id<N) {
+        dfloat yi;
+        if (beta) yi = y[id];
+        else      yi = 0.0;
+        // y[id] is read only for nonzero beta (yi is 0.0 otherwise); r is the updated entry
+        const dfloat r = beta*yi + alpha*x[id];
+
+        y[id] = r;
+
+        s_ip[t] += (w[id]*r*r);
+        id += p_BLOCKSIZE*Nblocks;
+      }
+    }
+    @barrier("local");
+    // Tree reduction: each step folds the upper half of the active range of s_ip onto the lower half.
+#if p_BLOCKSIZE>512
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<512) s_ip[t] += s_ip[t+512];
+    @barrier("local");
+#endif
+
+#if p_BLOCKSIZE>256
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<256) s_ip[t] += s_ip[t+256];
+    @barrier("local");
+#endif
+    // NOTE(review): this step is not wrapped in #if and reads s_ip[t+128]; looks like p_BLOCKSIZE >= 256 is assumed -- confirm.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<128) s_ip[t] += s_ip[t+128];
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 64) s_ip[t] += s_ip[t+ 64];
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 32) s_ip[t] += s_ip[t+ 32];
+    @barrier("local");
+    // NOTE(review): from here on the barriers are commented out; all accesses stay within s_ip[0..31]. Presumably this relies on those inner iterations executing in lockstep -- confirm per backend.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 16) s_ip[t] += s_ip[t+ 16];
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  8) s_ip[t] += s_ip[t+  8];
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  4) s_ip[t] += s_ip[t+  4];
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  2) s_ip[t] += s_ip[t+  2];
+    //    @barrier("local");
+    // Last step: inner index 0 adds the two remaining partials and stores the result for outer iteration b.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  1) ip[b] = s_ip[0] + s_ip[1];
+  }
+}
+
diff --git a/solvers/parALMOND/okl/addScalar.okl b/libs/parAlmond/okl/vectorAddScalar.okl
similarity index 83%
rename from solvers/parALMOND/okl/addScalar.okl
rename to libs/parAlmond/okl/vectorAddScalar.okl
index 4b6e310b5..9d01f2469 100644
--- a/solvers/parALMOND/okl/addScalar.okl
+++ b/libs/parAlmond/okl/vectorAddScalar.okl
@@ -25,12 +25,11 @@ SOFTWARE.
 */
 
 // x = x + alpha
-@kernel void addScalarKernel(const dlong   n,
-          const dfloat alpha,
-                @restrict dfloat * x){
+@kernel void vectorAddScalar(const dlong   n,
+                             const dfloat alpha,
+                             @restrict dfloat * x){
 
-  for(dlong i=0;i<n;++i;@tile(256,@outer,@inner)){
-    if(i<n)
-      x[i] += alpha;
+  for(dlong k=0;k<n;++k;@tile(p_BLOCKSIZE,@outer,@inner)){
+    x[k] = x[k] + alpha; // shift entry k by the scalar
   }
 }
diff --git a/solvers/parALMOND/okl/dotStar.okl b/libs/parAlmond/okl/vectorDotStar.okl
similarity index 63%
rename from solvers/parALMOND/okl/dotStar.okl
rename to libs/parAlmond/okl/vectorDotStar.okl
index 7801b906f..e0597e9c2 100644
--- a/solvers/parALMOND/okl/dotStar.okl
+++ b/libs/parAlmond/okl/vectorDotStar.okl
@@ -25,26 +25,28 @@ SOFTWARE.
 */
 
 // b = b.*a
-@kernel void simpleDotStarKernel(const dlong   N,
-				      @restrict const  dfloat * a,
-				            @restrict dfloat * b){
+@kernel void vectorDotStar1(const dlong N,
+                            @restrict const dfloat * a,
+                            @restrict       dfloat * b){
 
-  for(dlong i=0;i<N;++i;@tile(256,@outer,@inner)){
-    if(i<N)
-      b[i] *= a[i];
+  for(dlong k=0;k<N;++k;@tile(p_BLOCKSIZE,@outer,@inner)){
+    b[k] = b[k]*a[k]; // in-place entrywise product
   }
 }
 
 // c = alpha*(a.*b) + beta *c
-@kernel void dotStarKernel(const dlong   N,
-			        const dfloat   alpha,
-			        const dfloat   beta,
-			        @restrict const  dfloat * a,
-			        @restrict const  dfloat * b,
-			              @restrict dfloat * c){
-
-  for(dlong i=0;i<N;++i;@tile(256,@outer,@inner)){
-    if(i<N)
-      c[i] = alpha * a[i] * b[i] + beta * c[i];
+@kernel void vectorDotStar2(const dlong  N,
+                            const dfloat alpha,
+                            const dfloat beta,
+                            @restrict const dfloat * a,
+                            @restrict const dfloat * b,
+                            @restrict       dfloat * c){
+
+  for(dlong k=0;k<N;++k;@tile(p_BLOCKSIZE,@outer,@inner)){
+    // with beta == 0 the old c[k] is never read
+    dfloat scaledC;
+    if (beta) scaledC = beta*c[k];
+    else      scaledC = 0.0;
+    c[k] = (alpha * a[k] * b[k]) + scaledC;
   }
 }
diff --git a/libs/parAlmond/okl/vectorInnerProd.okl b/libs/parAlmond/okl/vectorInnerProd.okl
new file mode 100644
index 000000000..b3bd2c3eb
--- /dev/null
+++ b/libs/parAlmond/okl/vectorInnerProd.okl
@@ -0,0 +1,85 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+
+// ip = x.y
+@kernel void vectorInnerProd(const dlong Nblocks,
+                             const dlong N,
+                             @restrict const  dfloat * x,
+                             @restrict const  dfloat * y,
+                             @restrict        dfloat * ip){
+  // Partial dot products of x and y: outer iteration b writes its partial sum to ip[b].
+  for(dlong b=0;b<Nblocks;++b;@outer(0)){
+
+    @shared volatile dfloat s_ip[p_BLOCKSIZE];
+    // inner index t accumulates entries t + b*p_BLOCKSIZE, stepping by p_BLOCKSIZE*Nblocks, into s_ip[t]
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)){
+      dlong id = t + b*p_BLOCKSIZE;
+      s_ip[t] = 0.0;
+      while (id<N) {
+        s_ip[t] += x[id]*y[id];
+        id += p_BLOCKSIZE*Nblocks;
+      }
+    }
+
+    @barrier("local");
+    // Tree reduction: each step folds the upper half of the active range of s_ip onto the lower half.
+#if p_BLOCKSIZE>512
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<512) s_ip[t] += s_ip[t+512];
+    @barrier("local");
+#endif
+
+#if p_BLOCKSIZE>256
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<256) s_ip[t] += s_ip[t+256];
+    @barrier("local");
+#endif
+    // NOTE(review): this step is not wrapped in #if and reads s_ip[t+128]; looks like p_BLOCKSIZE >= 256 is assumed -- confirm.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<128) s_ip[t] += s_ip[t+128];
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 64) s_ip[t] += s_ip[t+ 64];
+    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 32) s_ip[t] += s_ip[t+ 32];
+    @barrier("local");
+    // NOTE(review): from here on the barriers are commented out; all accesses stay within s_ip[0..31]. Presumably this relies on those inner iterations executing in lockstep -- confirm per backend.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t< 16) s_ip[t] += s_ip[t+ 16];
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  8) s_ip[t] += s_ip[t+  8];
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  4) s_ip[t] += s_ip[t+  4];
+    //    @barrier("local");
+
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  2) s_ip[t] += s_ip[t+  2];
+    //    @barrier("local");
+    // Last step: inner index 0 adds the two remaining partials and stores the result for outer iteration b.
+    for(int t=0;t<p_BLOCKSIZE;++t;@inner(0)) if(t<  1) ip[b] = s_ip[0] + s_ip[1];
+  }
+}
+
+
diff --git a/solvers/parALMOND/okl/scaleVector.okl b/libs/parAlmond/okl/vectorScale.okl
similarity index 84%
rename from solvers/parALMOND/okl/scaleVector.okl
rename to libs/parAlmond/okl/vectorScale.okl
index 20465c358..9f72ce8d7 100644
--- a/solvers/parALMOND/okl/scaleVector.okl
+++ b/libs/parAlmond/okl/vectorScale.okl
@@ -25,12 +25,11 @@ SOFTWARE.
 */
 
 // x = alpha*x
-@kernel void scaleVectorKernel(const dlong   n,
-				  const dfloat alpha,
-				        @restrict dfloat * x){
+@kernel void vectorScale(const dlong  n,          // number of leading entries of x to scale
+                         const dfloat alpha,      // scale factor
+                         @restrict dfloat * x){   // scaled in place: x[i] *= alpha for i<n
 
-  for(dlong i=0;i<n;++i;@tile(256,@outer,@inner)){
-    if(i<n)
-      x[i] *= alpha;
+  for(dlong i=0;i<n;++i;@tile(p_BLOCKSIZE,@outer,@inner)){ // tile width comes from the p_BLOCKSIZE define
+    x[i] *= alpha; // NOTE(review): explicit i<n guard removed; assumes @tile generates its own bounds check — confirm
   }
 }
diff --git a/solvers/parALMOND/okl/setVector.okl b/libs/parAlmond/okl/vectorSet.okl
similarity index 84%
rename from solvers/parALMOND/okl/setVector.okl
rename to libs/parAlmond/okl/vectorSet.okl
index e09048b66..549183dac 100644
--- a/solvers/parALMOND/okl/setVector.okl
+++ b/libs/parAlmond/okl/vectorSet.okl
@@ -25,12 +25,11 @@ SOFTWARE.
 */
 
 
-@kernel void setVectorKernel(const dlong N, 
-              const dfloat alpha, 
-              dfloat *x){
+@kernel void vectorSet(const dlong  N,          // number of leading entries of x to fill
+                       const dfloat alpha,      // value stored into each entry
+                       @restrict dfloat *x){    // written only: x[n] = alpha for n<N
 
-  for(dlong n=0;n<N;++n;@tile(256,@outer,@inner)){
-    if(n<N)
-      x[n] = alpha;
+  for(dlong n=0;n<N;++n;@tile(p_BLOCKSIZE,@outer,@inner)){ // tile width comes from the p_BLOCKSIZE define
+    x[n] = alpha; // NOTE(review): explicit n<N guard removed; assumes @tile generates its own bounds check — confirm
   }
 }
diff --git a/libs/parAlmond/parAlmond.hpp b/libs/parAlmond/parAlmond.hpp
new file mode 100644
index 000000000..d80aca45e
--- /dev/null
+++ b/libs/parAlmond/parAlmond.hpp
@@ -0,0 +1,71 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef PARALMOND_HPP
+#define PARALMOND_HPP
+
+#include <math.h>
+#include <stdlib.h>
+#include <occa.hpp>
+
+#include "mpi.h"
+#include "types.h"
+#include "ogs.hpp"
+#include "setupAide.hpp"
+
+#include "include/defines.hpp"
+#include "include/utils.hpp"
+#include "include/kernels.hpp"
+#include "include/vector.hpp"
+#include "include/matrix.hpp"
+#include "include/level.hpp"
+#include "include/agmg.hpp"
+#include "include/coarse.hpp"
+#include "include/solver.hpp"
+
+
+namespace parAlmond {
+// Free-function interface of the library; this header carries the declarations only.
+solver_t *Init(occa::device device, MPI_Comm comm, setupAide options);
+// NOTE(review): Ai/Aj/Avals look like nnz coordinate-format entries, rowStarts a per-rank row partition — confirm.
+void AMGSetup(solver_t* M,
+             hlong* rowStarts,
+             dlong nnz,
+             hlong* Ai,
+             hlong* Aj,
+             dfloat* Avals,
+             bool nullSpace,
+             dfloat nullSpacePenalty);
+// NOTE(review): presumably applies the preconditioner held by M to o_rhs and stores the result in o_x — confirm.
+void Precon(solver_t* M, occa::memory o_x, occa::memory o_rhs);
+// NOTE(review): by its name, prints a summary of the solver M — confirm against the definition.
+void Report(solver_t *M);
+// NOTE(review): presumably releases M; ownership of what M references is not visible here.
+void Free(solver_t* M);
+
+} //namespace parAlmond
+
+#endif
diff --git a/libs/parAlmond/src/SpMV.cpp b/libs/parAlmond/src/SpMV.cpp
new file mode 100644
index 000000000..053cf8887
--- /dev/null
+++ b/libs/parAlmond/src/SpMV.cpp
@@ -0,0 +1,398 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+//------------------------------------------------------------------------
+//
+//  CSR matrix
+//
+//------------------------------------------------------------------------
+void CSR::SpMV(const dfloat alpha, dfloat *x,
+               const dfloat beta, dfloat *y) {
+  // Host, in place: y[i] = beta*y[i] + alpha*(sum_j A_ij*x[j]) for each of the Nrows rows.
+  //   alpha, beta : scalar weights of the product and of the previous y
+  //   x           : input vector, read at the column indices stored in cols
+  //   y           : overwritten in place; when beta is zero its old values are never read
+  const bool accumulate = (beta != 0);
+
+  for(dlong row=0; row<Nrows; row++){
+    // entries of this row occupy [rowStarts[row], rowStarts[row+1]) in cols/vals
+    dfloat rowSum = 0.0;
+    for(dlong k=rowStarts[row]; k<rowStarts[row+1]; k++){
+      rowSum += vals[k]*x[cols[k]];
+    }
+
+    // the old y[row] enters the update only when beta is nonzero
+    if (accumulate) {
+      y[row] = alpha*rowSum + beta*y[row];
+    } else {
+      y[row] = alpha*rowSum;
+    }
+  }
+}
+
+void CSR::SpMV(const dfloat alpha, dfloat *x,
+               const dfloat beta, const dfloat *y, dfloat *z) {
+  // Host, out of place: z[i] = beta*y[i] + alpha*(sum_j A_ij*x[j]) for each of the Nrows rows.
+  // y is only read, and it is read for every row whatever the value of beta.
+  for(dlong row=0; row<Nrows; row++){
+    dfloat rowSum = 0.0;
+    for(dlong k=rowStarts[row]; k<rowStarts[row+1]; k++)
+      rowSum += vals[k]*x[cols[k]];
+
+    z[row] = alpha*rowSum + beta*y[row];
+  }
+}
+
+void CSR::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+              occa::memory o_y){
+  // Device, in place: o_y[i] = beta*o_y[i] + alpha*(sum_j A_ij*o_x[j]) through SpMVcsrKernel1.
+  // Nothing is launched for a matrix without rows.
+  if (!Nrows) return;
+
+  SpMVcsrKernel1(Nrows, alpha, beta, o_rowStarts, o_cols, o_vals,
+                 o_x, o_y);
+}
+
+void CSR::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+              occa::memory o_y, occa::memory o_z){
+  // Device, out of place: o_z[i] = beta*o_y[i] + alpha*(sum_j A_ij*o_x[j]) through SpMVcsrKernel2.
+  // Nothing is launched for a matrix without rows.
+  if (!Nrows) return;
+
+  SpMVcsrKernel2(Nrows, alpha, beta, o_rowStarts, o_cols, o_vals,
+                 o_x, o_y, o_z);
+}
+
+
+//------------------------------------------------------------------------
+//
+//  ELL matrix
+//
+//------------------------------------------------------------------------
+void ELL::SpMV(const dfloat alpha, dfloat *x,
+               const dfloat beta, dfloat *y) {
+  // Host, in place: y[i] = beta*y[i] + alpha*(sum_j A_ij*x[j]) for each of the Nrows rows.
+  // Row i owns the nnzPerRow consecutive slots of cols/vals that start at nnzPerRow*i;
+  // a slot with a negative column index is padding and is skipped.
+  //   alpha, beta : scalar weights of the product and of the previous y
+  //   x           : input vector, read at the stored column indices
+  //   y           : overwritten in place; when beta is zero its old values are never read
+
+  const bool accumulate = (beta != 0);
+
+  for(dlong row=0; row<Nrows; row++){
+    dfloat rowSum = 0.0;
+    // walk every slot of the row
+    for(dlong c=0; c<nnzPerRow; c++){
+      const dlong col = cols[c+nnzPerRow*row];
+      if (col<0) continue; //padding slot
+
+      rowSum += vals[c+nnzPerRow*row]*x[col];
+    }
+
+    // the old y[row] enters the update only when beta is nonzero
+    if (accumulate) {
+      y[row] = alpha*rowSum + beta*y[row];
+    } else {
+      y[row] = alpha*rowSum;
+    }
+  }
+}
+
+void ELL::SpMV(const dfloat alpha, dfloat *x,
+               const dfloat beta, const dfloat *y, dfloat *z) {
+  // Host, out of place: z[i] = beta*y[i] + alpha*(sum_j A_ij*x[j]); y is only read.
+  // Same slot layout as the in-place overload: negative column indices mark padding.
+  for(dlong row=0; row<Nrows; row++){
+    dfloat rowSum = 0.0;
+    for(dlong c=0; c<nnzPerRow; c++){
+      const dlong col = cols[c+nnzPerRow*row];
+      if (col<0) continue; //padding slot
+
+      rowSum += vals[c+nnzPerRow*row]*x[col];
+    }
+    z[row] = alpha*rowSum + beta*y[row];
+  }
+}
+
+void ELL::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+                occa::memory o_y) {
+  // Device, in place: o_y[i] = beta*o_y[i] + alpha*(sum_j A_ij*o_x[j]) through SpMVellKernel1.
+  // With nnzPerRow == 0 nothing is launched, so o_y is left as it is (not scaled by beta).
+  // NOTE(review): the host overload still applies beta in that case — confirm the difference is intended.
+  if (!nnzPerRow) return;
+
+  SpMVellKernel1(Nrows, nnzPerRow,
+                 alpha, beta, o_cols, o_vals, o_x, o_y);
+}
+
+void ELL::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+                occa::memory o_y, occa::memory o_z) {
+  // Device, out of place: o_z[i] = beta*o_y[i] + alpha*(sum_j A_ij*o_x[j]) through SpMVellKernel2.
+  // With nnzPerRow == 0 nothing is launched, so o_z is not written at all.
+  // NOTE(review): the host overload still writes z = beta*y in that case — confirm the difference is intended.
+  if (!nnzPerRow) return;
+
+  SpMVellKernel2(Nrows, nnzPerRow,
+                 alpha, beta, o_cols, o_vals, o_x, o_y, o_z);
+}
+
+//------------------------------------------------------------------------
+//
+//  MCSR matrix
+//
+//------------------------------------------------------------------------
+void MCSR::SpMV(const dfloat alpha, dfloat *x,
+                const dfloat beta, dfloat *y){
+  // Host, in place: y[row] = beta*y[row] + alpha*(sum_j A_row,j*x[j]), but only for the
+  // actualRows rows listed in rows[]; every other entry of y is left untouched.
+  // Stored row i (CSR range [rowStarts[i], rowStarts[i+1])) maps to vector entry rows[i].
+  //   alpha, beta : scalar weights of the product and of the previous y
+  //   x           : input vector, read at the column indices stored in cols
+  //   y           : updated in place; when beta is zero the old y[row] is never read
+  const bool accumulate = (beta != 0);
+
+  for(dlong i=0; i<actualRows; i++){
+    const dlong row = rows[i];
+
+    dfloat rowSum = 0.0;
+    for(dlong k=rowStarts[i]; k<rowStarts[i+1]; k++){
+      rowSum += vals[k]*x[cols[k]];
+    }
+
+    if (accumulate) {
+      y[row] = alpha*rowSum + beta*y[row];
+    } else {
+      y[row] = alpha*rowSum;
+    }
+  }
+}
+
+void MCSR::SpMV(const dfloat alpha, dfloat *x,
+                const dfloat beta, const dfloat *y, dfloat *z){
+  // Host, out of place: z[row] = beta*y[row] + alpha*(sum_j A_row,j*x[j]) for the rows in rows[].
+  // Entries of z whose index is not listed in rows[] are not written.
+  for(dlong i=0; i<actualRows; i++){
+    const dlong row = rows[i];
+    dfloat rowSum = 0.0;
+    for(dlong k=rowStarts[i]; k<rowStarts[i+1]; k++){
+      rowSum += vals[k]*x[cols[k]];
+    }
+    z[row] = alpha*rowSum + beta*y[row];
+  }
+}
+
+void MCSR::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+                occa::memory o_y) {
+  // Device, in place: o_y[i] = beta*o_y[i] + alpha*(sum_j A_ij*o_x[j]) through SpMVmcsrKernel1.
+  // Nothing is launched when the matrix stores no rows.
+  if (!actualRows) return;
+
+  SpMVmcsrKernel1(actualRows, alpha, beta,
+                  o_rowStarts, o_rows, o_cols, o_vals,
+                  o_x, o_y);
+}
+
+void MCSR::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+                occa::memory o_y, occa::memory o_z) {
+  // Device, out of place: o_z[i] = beta*o_y[i] + alpha*(sum_j A_ij*o_x[j]) through SpMVmcsrKernel2.
+  // Nothing is launched when the matrix stores no rows.
+  if (!actualRows) return;
+
+  SpMVmcsrKernel2(actualRows, alpha, beta,
+                  o_rowStarts, o_rows, o_cols, o_vals,
+                  o_x, o_y, o_z);
+}
+
+
+//------------------------------------------------------------------------
+//
+//  parCSR matrix
+//
+//------------------------------------------------------------------------
+void parCSR::SpMV(const dfloat alpha, dfloat *x,
+                  const dfloat beta, dfloat *y) {
+  // Host, in place: y = beta*y + alpha*A*x, with A applied as diag followed by offd.
+  // diag runs between the start and the finish of the halo exchange of x; offd runs
+  // after the finish and adds onto y (its beta is fixed to 1.0).
+
+  haloExchangeStart(x);
+  diag->SpMV(alpha, x, beta, y);
+  haloExchangeFinish(x);
+  offd->SpMV(alpha, x, 1.0, y);
+
+  if (!nullSpace) return;
+
+  //rank 1 correction if there is a nullspace: vectorAdd receives null with the
+  // coefficient alpha*nullSpacePenalty*vectorInnerProd(null, x) and y with 1.0
+  const dfloat penalizedDot = vectorInnerProd(Nrows, null, x, comm)*nullSpacePenalty;
+  vectorAdd(Nrows, alpha*penalizedDot, null, 1.0, y);
+}
+
+void parCSR::SpMV(const dfloat alpha, dfloat *x,
+                  const dfloat beta, const dfloat *y, dfloat *z) {
+  // Host, out of place: z = beta*y + alpha*A*x, with A applied as diag followed by offd.
+  // diag writes z from y between the start and the finish of the halo exchange of x;
+  // offd runs after the finish and adds onto z (its beta is fixed to 1.0).
+
+  haloExchangeStart(x);
+  diag->SpMV(alpha, x, beta, y, z);
+  haloExchangeFinish(x);
+  offd->SpMV(alpha, x, 1.0, z);
+
+  if (!nullSpace) return;
+
+  //rank 1 correction if there is a nullspace: vectorAdd receives null with the
+  // coefficient alpha*nullSpacePenalty*vectorInnerProd(null, x) and z with 1.0
+  const dfloat penalizedDot = vectorInnerProd(Nrows, null, x, comm)*nullSpacePenalty;
+  vectorAdd(Nrows, alpha*penalizedDot, null, 1.0, z);
+}
+
+void parCSR::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+                  occa::memory o_y) {
+  // Device, in place: o_y = beta*o_y + alpha*A*o_x, with A applied as diag followed by offd.
+  // diag runs between the start and the finish of the halo exchange of o_x; offd runs
+  // after the finish and adds onto o_y (its beta is fixed to 1.0).
+
+  haloExchangeStart(o_x);
+  diag->SpMV(alpha, o_x, beta, o_y);
+  haloExchangeFinish(o_x);
+  offd->SpMV(alpha, o_x, 1.0, o_y);
+
+  if (!nullSpace) return;
+
+  //rank 1 correction if there is a nullspace: vectorAdd receives o_null with the
+  // coefficient alpha*nullSpacePenalty*vectorInnerProd(o_null, o_x) and o_y with 1.0
+  const dfloat penalizedDot = vectorInnerProd(Nrows, o_null, o_x, comm)*nullSpacePenalty;
+  vectorAdd(Nrows, alpha*penalizedDot, o_null, 1.0, o_y);
+}
+
+void parCSR::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+                  occa::memory o_y, occa::memory o_z) {
+  // Device, out of place: o_z = beta*o_y + alpha*A*o_x, with A applied as diag followed by offd.
+  // diag writes o_z from o_y between the start and the finish of the halo exchange of o_x;
+  // offd runs after the finish and adds onto o_z (its beta is fixed to 1.0).
+
+  haloExchangeStart(o_x);
+  diag->SpMV(alpha, o_x, beta, o_y, o_z);
+  haloExchangeFinish(o_x);
+  offd->SpMV(alpha, o_x, 1.0, o_z);
+
+  if (!nullSpace) return;
+
+  //rank 1 correction if there is a nullspace: vectorAdd receives o_null with the
+  // coefficient alpha*nullSpacePenalty*vectorInnerProd(o_null, o_x) and o_z with 1.0
+  const dfloat penalizedDot = vectorInnerProd(Nrows, o_null, o_x, comm)*nullSpacePenalty;
+  vectorAdd(Nrows, alpha*penalizedDot, o_null, 1.0, o_z);
+}
+
+//------------------------------------------------------------------------
+//
+//  parHYB matrix
+//
+//------------------------------------------------------------------------
+void parHYB::SpMV(const dfloat alpha, dfloat *x,
+                  const dfloat beta, dfloat *y) {
+  // Host, in place: y = beta*y + alpha*A*x, with A applied as the E part followed by the C part.
+  // E runs between the start and the finish of the halo exchange of x; C runs
+  // after the finish and adds onto y (its beta is fixed to 1.0).
+
+  haloExchangeStart(x);
+  E->SpMV(alpha, x, beta, y);
+  haloExchangeFinish(x);
+  C->SpMV(alpha, x, 1.0, y);
+
+  if (!nullSpace) return;
+
+  //rank 1 correction if there is a nullspace: vectorAdd receives null with the
+  // coefficient alpha*nullSpacePenalty*vectorInnerProd(null, x) and y with 1.0
+  const dfloat penalizedDot = vectorInnerProd(Nrows, null, x, comm)*nullSpacePenalty;
+  vectorAdd(Nrows, alpha*penalizedDot, null, 1.0, y);
+}
+
+void parHYB::SpMV(const dfloat alpha, dfloat *x,
+                  const dfloat beta, const dfloat *y, dfloat *z) {
+  // Host, out of place: z = beta*y + alpha*A*x, with A applied as the E part followed by the C part.
+  // E writes z from y between the start and the finish of the halo exchange of x;
+  // C runs after the finish and adds onto z (its beta is fixed to 1.0).
+
+  haloExchangeStart(x);
+  E->SpMV(alpha, x, beta, y, z);
+  haloExchangeFinish(x);
+  C->SpMV(alpha, x, 1.0, z);
+
+  if (!nullSpace) return;
+
+  //rank 1 correction if there is a nullspace: vectorAdd receives null with the
+  // coefficient alpha*nullSpacePenalty*vectorInnerProd(null, x) and z with 1.0
+  const dfloat penalizedDot = vectorInnerProd(Nrows, null, x, comm)*nullSpacePenalty;
+  vectorAdd(Nrows, alpha*penalizedDot, null, 1.0, z);
+}
+
+void parHYB::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+                  occa::memory o_y) {
+  // Device, in place: o_y = beta*o_y + alpha*A*o_x, with A applied as the E part followed by the C part.
+  // E runs between the start and the finish of the halo exchange of o_x; C runs
+  // after the finish and adds onto o_y (its beta is fixed to 1.0).
+
+  haloExchangeStart(o_x);
+  E->SpMV(alpha, o_x, beta, o_y);
+  haloExchangeFinish(o_x);
+  C->SpMV(alpha, o_x, 1.0, o_y);
+
+  if (!nullSpace) return;
+
+  //rank 1 correction if there is a nullspace: vectorAdd receives o_null with the
+  // coefficient alpha*nullSpacePenalty*vectorInnerProd(o_null, o_x) and o_y with 1.0
+  const dfloat penalizedDot = vectorInnerProd(Nrows, o_null, o_x, comm)*nullSpacePenalty;
+  vectorAdd(Nrows, alpha*penalizedDot, o_null, 1.0, o_y);
+}
+
+void parHYB::SpMV(const dfloat alpha, occa::memory o_x, const dfloat beta,
+                  occa::memory o_y, occa::memory o_z) {
+  // Device, out of place: o_z = beta*o_y + alpha*A*o_x, with A applied as the E part followed by the C part.
+  // E writes o_z from o_y between the start and the finish of the halo exchange of o_x;
+  // C runs after the finish and adds onto o_z (its beta is fixed to 1.0).
+
+  haloExchangeStart(o_x);
+  E->SpMV(alpha, o_x, beta, o_y, o_z);
+  haloExchangeFinish(o_x);
+  C->SpMV(alpha, o_x, 1.0, o_z);
+
+  if (!nullSpace) return;
+
+  //rank 1 correction if there is a nullspace: vectorAdd receives o_null with the
+  // coefficient alpha*nullSpacePenalty*vectorInnerProd(o_null, o_x) and o_z with 1.0
+  const dfloat penalizedDot = vectorInnerProd(Nrows, o_null, o_x, comm)*nullSpacePenalty;
+  vectorAdd(Nrows, alpha*penalizedDot, o_null, 1.0, o_z);
+}
+
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/agmgLevel.cpp b/libs/parAlmond/src/agmgLevel.cpp
new file mode 100644
index 000000000..5e99d5967
--- /dev/null
+++ b/libs/parAlmond/src/agmgLevel.cpp
@@ -0,0 +1,186 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+agmgLevel::agmgLevel(parCSR *A_, KrylovType ktype_):
+  multigridLevel(A_->Nrows, A_->Ncols, ktype_, A_->comm) {
+  // Level that holds only its operator A_. The pointer is kept and later deleted by ~agmgLevel.
+  weighted = false;
+  gatherLevel = false;
+  A = A_; P = NULL; R = NULL;          //P and R are not given here: NULL keeps the destructor's delete well-defined
+  o_A = NULL; o_P = NULL; o_R = NULL;  //no device operator is created by this constructor
+}
+
+agmgLevel::agmgLevel(parCSR *A_, parCSR *P_, parCSR *R_, KrylovType ktype_):
+  multigridLevel(A_->Nrows, A_->Ncols, ktype_, A_->comm) {
+  // Level with operator A_, prolongation P_ and restriction R_; all three are later deleted by ~agmgLevel.
+  //the level's column count is the larger of the column counts of A_ and P_
+  Ncols = (A_->Ncols>P_->Ncols) ? A_->Ncols : P_->Ncols;
+
+  weighted = false;
+  gatherLevel = false;
+
+  //host operators come from the caller, device operators start out empty
+  A = A_; P = P_; R = R_;
+  o_A = NULL; o_P = NULL; o_R = NULL;  //NULL keeps the destructor's delete well-defined
+}
+
+agmgLevel::~agmgLevel() {
+  // Deletes the operators A, P, R and then the o_-prefixed operators o_A, o_P, o_R, in this order.
+  delete   A; delete   P; delete   R;
+  delete o_A; delete o_P; delete o_R;
+  // NOTE(review): assumes every one of the six pointers is valid or NULL by now — confirm in the class definition.
+}
+
+void agmgLevel::Ax        (dfloat *x, dfloat *Ax){ A->SpMV(1.0, x, 0.0, Ax); } //Ax = A*x (alpha=1, beta=0), raw-pointer overload
+
+void agmgLevel::coarsen   (dfloat *r, dfloat *Rr){ //Rr = R*r, with r gathered first on a gather level
+  if (!gatherLevel) {
+    R->SpMV(1.0, r, 0.0, Rr);
+    return;
+  }
+  ogsGather(Gx, r, ogsDfloat, ogsAdd, ogs);              //gather r into Gx with ogsAdd
+  vectorDotStar(ogs->Ngather, ogs->gatherInvDegree, Gx); //combine Gx with gatherInvDegree over Ngather entries
+  R->SpMV(1.0, Gx, 0.0, Rr);                             //Rr = R*Gx
+}
+
+void agmgLevel::prolongate(dfloat *x, dfloat *Px){ //accumulates the prolongated x into Px (Px is not cleared)
+  if (gatherLevel) {
+    P->SpMV(1.0, x, 0.0, Gx);                   //Gx = P*x
+    ogsScatter(Sx, Gx, ogsDfloat, ogsAdd, ogs); //scatter Gx into Sx
+    vectorAdd(ogs->N, 1.0, Sx, 1.0, Px);        //add over all ogs->N scattered entries (not P->Nrows), as the device overload does
+  } else {
+    P->SpMV(1.0, x, 1.0, Px);                   //Px = Px + P*x
+  }
+}
+
+void agmgLevel::residual  (dfloat *rhs, dfloat *x, dfloat *res) { A->SpMV(-1.0, x, 1.0, rhs, res); } //res = rhs - A*x (alpha=-1, beta=1)
+
+void agmgLevel::Ax        (occa::memory o_x, occa::memory o_Ax){ o_A->SpMV(1.0, o_x, 0.0, o_Ax); } //o_Ax = o_A*o_x (alpha=1, beta=0), occa::memory overload
+
+void agmgLevel::coarsen   (occa::memory o_r, occa::memory o_Rr){ //o_Rr = o_R*o_r, with o_r gathered first on a gather level
+  if (!gatherLevel) {
+    o_R->SpMV(1.0, o_r, 0.0, o_Rr);
+    return;
+  }
+  ogsGather(o_Gx, o_r, ogsDfloat, ogsAdd, ogs);              //gather o_r into o_Gx with ogsAdd
+  vectorDotStar(ogs->Ngather, ogs->o_gatherInvDegree, o_Gx); //combine o_Gx with o_gatherInvDegree over Ngather entries
+  o_R->SpMV(1.0, o_Gx, 0.0, o_Rr);                           //o_Rr = o_R*o_Gx
+}
+
+void agmgLevel::prolongate(occa::memory o_x, occa::memory o_Px){ //accumulates the prolongated o_x into o_Px
+  if (!gatherLevel) {
+    o_P->SpMV(1.0, o_x, 1.0, o_Px);               //o_Px = o_Px + o_P*o_x
+    return;
+  }
+  o_P->SpMV(1.0, o_x, 0.0, o_Gx);                 //o_Gx = o_P*o_x
+  ogsScatter(o_Sx, o_Gx, ogsDfloat, ogsAdd, ogs); //scatter o_Gx into o_Sx
+  vectorAdd(ogs->N, 1.0, o_Sx, 1.0, o_Px);        //add the ogs->N scattered entries onto o_Px
+}
+
+void agmgLevel::residual  (occa::memory o_rhs, occa::memory o_x, occa::memory o_res) { o_A->SpMV(-1.0, o_x, 1.0, o_rhs, o_res); } //o_res = o_rhs - o_A*o_x (alpha=-1, beta=1)
+
+void agmgLevel::smooth(dfloat *rhs, dfloat *x, bool x_is_zero){
+  // Forwards to the smoother selected by stype; any other value of stype does nothing.
+  switch (stype) {
+    case JACOBI:        smoothJacobi(rhs, x, x_is_zero);       break;
+    case DAMPED_JACOBI: smoothDampedJacobi(rhs, x, x_is_zero); break;
+    case CHEBYSHEV:     smoothChebyshev(rhs, x, x_is_zero);    break;
+    default:            break;
+  }
+}
+
+void agmgLevel::smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero){
+  // occa::memory overload of the same dispatch; any other value of stype does nothing.
+  switch (stype) {
+    case JACOBI:        smoothJacobi(o_rhs, o_x, x_is_zero);       break;
+    case DAMPED_JACOBI: smoothDampedJacobi(o_rhs, o_x, x_is_zero); break;
+    case CHEBYSHEV:     smoothChebyshev(o_rhs, o_x, x_is_zero);    break;
+    default:            break;
+  }
+}
+
+void agmgLevel::Report() {
+  // Prints, on rank 0, the minimum / maximum / average over the active ranks of the local
+  // row count and of the local nonzeros per row of A, plus the smoother name on the first
+  // line. Collective over comm: every rank has to call it. No member is modified.
+
+  int rank;
+  MPI_Comm_rank(comm, &rank);
+
+  // a rank without rows is inactive: it is left out of the averages and of the minima
+  int active = (Nrows>0) ? 1:0;
+  int totalActive=0;
+  MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, comm);
+  const int activeDiv = (totalActive>0) ? totalActive : 1; //no division by zero if all ranks are empty
+
+  dlong lNrows = Nrows;         //reduction buffer, so that the member Nrows is never overwritten
+  hlong hNrows = (hlong) Nrows;
+  dlong minNrows=0, maxNrows=0;
+  hlong totalNrows=0;
+  MPI_Allreduce(&lNrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, comm);
+  MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, comm);
+  const dfloat avgNrows = (dfloat) totalNrows/activeDiv;
+
+  if (!active) lNrows = maxNrows; //set this so it's ignored for the global min
+  MPI_Allreduce(&lNrows, &minNrows, 1, MPI_DLONG, MPI_MIN, comm);
+
+  // local nonzeros per row from the true local sizes; 0 on an inactive rank so that it
+  // adds nothing to the sum behind the average
+  // (only per-row densities are printed, so the raw nnz counts are not reduced)
+  const long long int nnz = A->diag->nnz+A->offd->nnz;
+  dfloat nnzPerRow = active ? (dfloat) nnz/Nrows : 0;
+  dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0;
+  MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, comm);
+  MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, comm);
+  avgNnzPerRow /= activeDiv;
+
+  if (!active) nnzPerRow = maxNnzPerRow; //set this so it's ignored for the global min
+  MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, comm);
+
+  // every smoother type gets a label, so the string printed below is never uninitialized
+  const char *smootherString;
+  if (stype==JACOBI)
+    smootherString = "Jacobi          ";
+  else if (stype==DAMPED_JACOBI)
+    smootherString = "Damped Jacobi   ";
+  else if (stype==CHEBYSHEV)
+    smootherString = "Chebyshev       ";
+  else
+    smootherString = "Unknown         ";
+
+  // all counts are narrowed to int to match the %d conversions
+  if (rank==0){
+    printf(     "|  parAlmond |  %12d  | %13d   |   %s|\n", (int)minNrows, (int)minNnzPerRow, smootherString);
+    printf("     |            |  %12d  | %13d   |                   |\n", (int)maxNrows, (int)maxNnzPerRow);
+    printf("     |            |  %12d  | %13d   |                   |\n", (int)avgNrows, (int)avgNnzPerRow);
+  }
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/agmgSetup/adjustPartition.cpp b/libs/parAlmond/src/agmgSetup/adjustPartition.cpp
new file mode 100644
index 000000000..500234680
--- /dev/null
+++ b/libs/parAlmond/src/agmgSetup/adjustPartition.cpp
@@ -0,0 +1,277 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+void adjustPartition(agmgLevel *level, hlong* FineToCoarse, setupAide options) {
+  // Decides which rank owns each aggregate and renumbers the aggregates accordingly: on return
+  // FineToCoarse[i] holds the new global coarse id of local row i and level->globalAggStarts the
+  // new per-rank offsets. Returns at once for "PARALMOND PARTITION" = STRONGNODES. Collective over agmg::comm.
+  const int rank = agmg::rank, size = agmg::size;
+
+  dlong N = level->A->Nrows;
+
+  //Need to establish 'ownership' of aggregates
+
+  //Keep the current partitioning for STRONGNODES.
+  // The rank that had the strong node for each aggregate owns the aggregate
+  if (options.compareArgs("PARALMOND PARTITION", "STRONGNODES")) return;
+
+  //populate aggregate array
+  hlong gNumAggs = level->globalAggStarts[size]; //total number of aggregates
+
+  parallelAggregate_t *sendAggs;
+  if (N)
+    sendAggs = (parallelAggregate_t *) calloc(N,sizeof(parallelAggregate_t));
+  else
+    sendAggs = (parallelAggregate_t *) calloc(1,sizeof(parallelAggregate_t));
+
+  for (dlong i=0;i<N;i++) {
+    sendAggs[i].fineId = i;
+    sendAggs[i].originRank = rank;
+
+    sendAggs[i].coarseId = FineToCoarse[i];
+
+    //set a temporary owner. Evenly distribute aggregates amongst ranks (divide in hlong, then narrow)
+    sendAggs[i].ownerRank = (int) ((FineToCoarse[i]*size)/gNumAggs);
+  }
+
+  // Make the MPI_PARALLEL_AGGREGATE data type
+  MPI_Datatype MPI_PARALLEL_AGGREGATE;
+  MPI_Datatype dtype[5] = {MPI_DLONG, MPI_HLONG, MPI_HLONG, MPI_INT, MPI_INT};
+  int blength[5] = {1, 1, 1, 1, 1};
+  MPI_Aint addr[5], displ[5];
+  MPI_Get_address ( &(sendAggs[0]            ), addr+0);
+  MPI_Get_address ( &(sendAggs[0].coarseId   ), addr+1);
+  MPI_Get_address ( &(sendAggs[0].newCoarseId), addr+2);
+  MPI_Get_address ( &(sendAggs[0].originRank ), addr+3);
+  MPI_Get_address ( &(sendAggs[0].ownerRank  ), addr+4);
+  displ[0] = 0;
+  displ[1] = addr[1] - addr[0];
+  displ[2] = addr[2] - addr[0];
+  displ[3] = addr[3] - addr[0];
+  displ[4] = addr[4] - addr[0];
+  MPI_Type_create_struct (5, blength, displ, dtype, &MPI_PARALLEL_AGGREGATE);
+  MPI_Type_commit (&MPI_PARALLEL_AGGREGATE);
+
+  //sort by owning rank for the all-to-all exchange
+  qsort(sendAggs, N, sizeof(parallelAggregate_t), compareOwner);
+
+  int *sendCounts = (int *) calloc(size,sizeof(int));
+  int *recvCounts = (int *) calloc(size,sizeof(int));
+  int *sendOffsets = (int *) calloc(size+1,sizeof(int));
+  int *recvOffsets = (int *) calloc(size+1,sizeof(int));
+
+  for(dlong i=0;i<N;++i)
+    sendCounts[sendAggs[i].ownerRank]++;
+
+  // find how many nodes to expect (should use sparse version)
+  MPI_Alltoall(sendCounts, 1, MPI_INT, recvCounts, 1, MPI_INT, agmg::comm);
+
+  // find send and recv offsets for gather
+  dlong recvNtotal = 0;
+  for(int r=0;r<size;++r){
+    sendOffsets[r+1] = sendOffsets[r] + sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r] + recvCounts[r];
+    recvNtotal += recvCounts[r];
+  }
+  parallelAggregate_t *recvAggs = (parallelAggregate_t *) calloc(recvNtotal,sizeof(parallelAggregate_t));
+
+  MPI_Alltoallv(sendAggs, sendCounts, sendOffsets, MPI_PARALLEL_AGGREGATE,
+                recvAggs, recvCounts, recvOffsets, MPI_PARALLEL_AGGREGATE,
+                agmg::comm);
+
+  //sort by coarse aggregate number, and then by original rank
+  qsort(recvAggs, recvNtotal, sizeof(parallelAggregate_t), compareAgg);
+
+  //count the number of unique aggregates here
+  dlong NumUniqueAggs =0;
+  if (recvNtotal) NumUniqueAggs++;
+  for (dlong i=1;i<recvNtotal;i++)
+    if(recvAggs[i].coarseId!=recvAggs[i-1].coarseId) NumUniqueAggs++;
+
+  //get their locations in the array
+  // (always allocated with NumUniqueAggs+1 entries, so the end marker written below and the
+  //  later free stay valid even when this rank received no aggregates)
+  dlong *aggStarts = (dlong *) calloc(NumUniqueAggs+1,sizeof(dlong));
+  dlong cnt = 1;
+  for (dlong i=1;i<recvNtotal;i++)
+    if(recvAggs[i].coarseId!=recvAggs[i-1].coarseId) aggStarts[cnt++] = i;
+  aggStarts[NumUniqueAggs] = recvNtotal;
+
+
+  if (options.compareArgs("PARALMOND PARTITION", "DISTRIBUTED")) { //rank that contributes most to the aggregate owns it
+    //use a random dfloat for each rank to break ties.
+    dfloat tieBreak = (dfloat) drand48();
+    dfloat *gRands = (dfloat *) calloc(size,sizeof(dfloat));
+    MPI_Allgather(&tieBreak, 1, MPI_DFLOAT, gRands, 1, MPI_DFLOAT, agmg::comm);
+
+    //determine the aggregates majority owner (counts are kept as dfloat so the fractional tie-breakers survive)
+    dfloat *rankCounts = (dfloat *) calloc(size,sizeof(dfloat));
+    for (dlong n=0;n<NumUniqueAggs;n++) {
+      //populate randomizer
+      for (int r=0;r<size;r++)
+        rankCounts[r] = gRands[r];
+
+      //count the number of contributions to the aggregate from the separate ranks
+      for (dlong i=aggStarts[n];i<aggStarts[n+1];i++)
+        rankCounts[recvAggs[i].originRank] += 1.0;
+
+      //find which rank is contributing the most to this aggregate
+      int ownerRank = 0;
+      dfloat maxEntries = rankCounts[0];
+      for (int r=1;r<size;r++) {
+        if (rankCounts[r]>maxEntries) {
+          ownerRank = r;
+          maxEntries = rankCounts[r];
+        }
+      }
+
+      //set this aggregate's owner
+      for (dlong i=aggStarts[n];i<aggStarts[n+1];i++)
+        recvAggs[i].ownerRank = ownerRank;
+    }
+    free(gRands); free(rankCounts);
+  } else { //default SATURATE: always choose the lowest rank to own the aggregate
+    for (dlong n=0;n<NumUniqueAggs;n++) {
+
+      int minrank = size;
+
+      //find the lowest rank that contributes to this aggregate
+      for (dlong i=aggStarts[n];i<aggStarts[n+1];i++){
+
+        minrank = (recvAggs[i].originRank<minrank) ? recvAggs[i].originRank : minrank;
+      }
+
+      //set this aggregate's owner
+      for (dlong i=aggStarts[n];i<aggStarts[n+1];i++)
+        recvAggs[i].ownerRank = minrank;
+    }
+  }
+  free(aggStarts);
+
+  //sort by owning rank
+  qsort(recvAggs, recvNtotal, sizeof(parallelAggregate_t), compareOwner);
+
+  int *newSendCounts = (int *) calloc(size,sizeof(int));
+  int *newRecvCounts = (int *) calloc(size,sizeof(int));
+  int *newSendOffsets = (int *) calloc(size+1,sizeof(int));
+  int *newRecvOffsets = (int *) calloc(size+1,sizeof(int));
+
+  for(dlong i=0;i<recvNtotal;++i)
+    newSendCounts[recvAggs[i].ownerRank]++;
+
+  // find how many nodes to expect (should use sparse version)
+  MPI_Alltoall(newSendCounts, 1, MPI_INT, newRecvCounts, 1, MPI_INT, agmg::comm);
+
+  // find send and recv offsets for gather
+  dlong newRecvNtotal = 0;
+  for(int r=0;r<size;++r){
+    newSendOffsets[r+1] = newSendOffsets[r] + newSendCounts[r];
+    newRecvOffsets[r+1] = newRecvOffsets[r] + newRecvCounts[r];
+    newRecvNtotal += newRecvCounts[r];
+  }
+  parallelAggregate_t *newRecvAggs = (parallelAggregate_t *) calloc(newRecvNtotal,sizeof(parallelAggregate_t));
+
+  MPI_Alltoallv(   recvAggs, newSendCounts, newSendOffsets, MPI_PARALLEL_AGGREGATE,
+                newRecvAggs, newRecvCounts, newRecvOffsets, MPI_PARALLEL_AGGREGATE,
+                agmg::comm);
+
+  //sort by coarse aggregate number, and then by original rank
+  qsort(newRecvAggs, newRecvNtotal, sizeof(parallelAggregate_t), compareAgg);
+
+  //count the number of unique aggregates this rank owns
+  dlong numAggs = 0;
+  if (newRecvNtotal) numAggs++;
+  for (dlong i=1;i<newRecvNtotal;i++)
+    if(newRecvAggs[i].coarseId!=newRecvAggs[i-1].coarseId) numAggs++;
+
+  //determine a global numbering of the aggregates (send and receive types must both be MPI_DLONG)
+  dlong *lNumAggs = (dlong*) calloc(size,sizeof(dlong));
+  MPI_Allgather(&numAggs, 1, MPI_DLONG, lNumAggs, 1, MPI_DLONG, agmg::comm);
+
+  level->globalAggStarts[0] = 0;
+  for (int r=0;r<size;r++)
+    level->globalAggStarts[r+1] = level->globalAggStarts[r] + lNumAggs[r];
+  free(lNumAggs);
+  //set the new global coarse index (counted in hlong, the type gNumAggs is read into)
+  hlong newId = level->globalAggStarts[rank];
+  if (newRecvNtotal) newRecvAggs[0].newCoarseId = newId;
+  for (dlong i=1;i<newRecvNtotal;i++) {
+    if(newRecvAggs[i].coarseId!=newRecvAggs[i-1].coarseId) newId++;
+
+    newRecvAggs[i].newCoarseId = newId;
+  }
+
+  //sort by origin rank, to send every entry back to the rank it came from
+  qsort(newRecvAggs, newRecvNtotal, sizeof(parallelAggregate_t), compareOrigin);
+
+  for(int r=0;r<size;r++) sendCounts[r] = 0;
+  for(int r=0;r<=size;r++) {
+    sendOffsets[r] = 0;
+    recvOffsets[r] = 0;
+  }
+
+  for(dlong i=0;i<newRecvNtotal;++i)
+    sendCounts[newRecvAggs[i].originRank]++;
+
+  // find how many nodes to expect (should use sparse version)
+  MPI_Alltoall(sendCounts, 1, MPI_INT, recvCounts, 1, MPI_INT, agmg::comm);
+
+  // find send and recv offsets for gather
+  recvNtotal = 0;
+  for(int r=0;r<size;++r){
+    sendOffsets[r+1] = sendOffsets[r] + sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r] + recvCounts[r];
+    recvNtotal += recvCounts[r];
+  }
+
+  //send the aggregate data back
+  MPI_Alltoallv(newRecvAggs, sendCounts, sendOffsets, MPI_PARALLEL_AGGREGATE,
+                   sendAggs, recvCounts, recvOffsets, MPI_PARALLEL_AGGREGATE,
+                agmg::comm);
+
+  //clean up
+  MPI_Barrier(agmg::comm);
+  MPI_Type_free(&MPI_PARALLEL_AGGREGATE);
+
+  free(recvAggs);
+  free(sendCounts);  free(recvCounts);
+  free(sendOffsets); free(recvOffsets);
+  free(newRecvAggs);
+  free(newSendCounts);  free(newRecvCounts);
+  free(newSendOffsets); free(newRecvOffsets);
+
+  //record the new FineToCoarse map
+  for (dlong i=0;i<N;i++)
+    FineToCoarse[sendAggs[i].fineId] = sendAggs[i].newCoarseId;
+
+  free(sendAggs);
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/agmgSetup/agmgSetup.cpp b/libs/parAlmond/src/agmgSetup/agmgSetup.cpp
new file mode 100644
index 000000000..43403f350
--- /dev/null
+++ b/libs/parAlmond/src/agmgSetup/agmgSetup.cpp
@@ -0,0 +1,172 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+void solver_t::AMGSetup(parCSR *A){
+  // Builds the AMG hierarchy for A: append a fine level, coarsen until the coarse
+  // solver's target size is reached (or coarsening stalls), then finish every level.
+  coarseLevel = new coarseSolver(options);
+  const int targetCoarseSize = coarseLevel->getTargetSize();
+
+  AMGstartLev = numLevels; // AMG levels are appended after any existing levels
+  agmgLevel *fineLevel = new agmgLevel(A, ktype);
+  levels[numLevels] = fineLevel;
+  setupAgmgSmoother(fineLevel, stype, ChebyshevIterations);
+
+  hlong currentSize = fineLevel->A->globalRowStarts[size]; // global row count
+  // if the system is already small, don't create further MG levels
+  bool keepCoarsening = (currentSize > targetCoarseSize);
+  if (!keepCoarsening) {
+    coarseLevel->setup(A);
+    baseLevel = numLevels;
+  }
+  numLevels++;
+
+  while (keepCoarsening) {
+    agmgLevel *coarser = coarsenAgmgLevel((agmgLevel*)(levels[numLevels-1]), ktype, options);
+    levels[numLevels] = coarser;
+    numLevels++;
+    const hlong coarseSize = coarser->A->globalRowStarts[size];
+    // stop at the target size, or when a step does not even halve the problem
+    const bool smallEnough = (coarseSize <= targetCoarseSize);
+    const bool stalled     = (currentSize < 2*coarseSize);
+    if (smallEnough || stalled) {
+      coarseLevel->setup(coarser->A);
+      baseLevel = numLevels-1;
+      keepCoarsening = false;
+    }
+    currentSize = coarseSize;
+  }
+
+  size_t scratchBytes = 3*levels[AMGstartLev]->Ncols*sizeof(dfloat);
+  allocateScratchSpace(scratchBytes, device);
+
+  // smoother parameters, host work vectors and device copies for each AMG level
+  for (int lev=AMGstartLev; lev<numLevels; lev++) {
+    agmgLevel *lvl = (agmgLevel*)(levels[lev]);
+    setupAgmgSmoother(lvl, stype, ChebyshevIterations);
+    allocateAgmgVectors(lvl, lev, AMGstartLev, ctype);
+    syncAgmgToDevice(lvl, lev, AMGstartLev, ctype);
+  }
+  coarseLevel->syncToDevice();
+}
+
+//create coarsened problem: aggregate level->A and return a new level holding A_c = P^T A P with its P and R
+agmgLevel *coarsenAgmgLevel(agmgLevel *level, KrylovType ktype, setupAide options){
+  // side effect on the input: level->Ncols may grow so that it is at least R->Ncols
+  int size;
+  MPI_Comm_size(level->comm, &size);
+
+  parCSR *C = strongGraph(level->A); // strength graph; formAggregates deletes it
+
+  hlong *FineToCoarse = (hlong *) malloc(level->A->Ncols*sizeof(hlong));
+  hlong *globalAggStarts = (hlong *) calloc(size+1,sizeof(hlong)); // not freed here: P and the coarse A keep pointing at it
+
+  formAggregates(level->A, C, FineToCoarse, globalAggStarts);
+
+  // adjustPartition(FineToCoarse, options);
+
+  dfloat *nullCoarseA;
+  parCSR *P = constructProlongation(level->A, FineToCoarse, globalAggStarts, &nullCoarseA);
+  parCSR *R = transpose(P);
+  parCSR *A = galerkinProd(level->A, P);
+
+  free(FineToCoarse); // only read while building P; release it so it does not leak per level
+  A->null = nullCoarseA;
+
+  agmgLevel *coarseLevel = new agmgLevel(A,P,R, ktype);
+
+  //update the number of columns required for this level (from R)
+  level->Ncols = (level->Ncols > R->Ncols) ? level->Ncols : R->Ncols;
+
+  return coarseLevel;
+}
+
+void setupAgmgSmoother(agmgLevel *level, SmoothType s, int ChebIterations){
+  // Stores the smoother choice on the level and, for the smoothers that need one,
+  // derives their parameters from an estimate of rho(invD * A).
+  level->stype = s;
+  level->ChebyshevIterations = ChebIterations;
+
+  const bool isJacobi = (s == DAMPED_JACOBI);
+  const bool isCheby  = (s == CHEBYSHEV);
+  if (!isJacobi && !isCheby) return; // any other smoother needs no estimate
+  const dfloat rho = level->A->rhoDinvA();
+  if (isJacobi) {
+    level->lambda = (4./3.)/rho;
+  } else {
+    level->lambda1 = rho;
+    level->lambda0 = rho/10.;
+  }
+}
+
+void allocateAgmgVectors(agmgLevel *level, int k, int AMGstartLev, CycleType ctype) {
+  // Allocates the zero-initialised host work vectors of `level`, whose index is k.
+  if (k != 0) { // level 0 gets no x/rhs here (presumably supplied by the caller - confirm)
+    level->x   = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
+    level->rhs = (dfloat *) calloc(level->Nrows,sizeof(dfloat));
+  }
+  level->res = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
+
+  // K-cycle vectors exist only for a K-cycle on levels 1..NUMKCYCLES
+  if (ctype != KCYCLE) return;
+  if ((k <= 0) || (k >= NUMKCYCLES+1)) return;
+
+  level->ck = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
+  level->vk = (dfloat *) calloc(level->Nrows,sizeof(dfloat));
+  level->wk = (dfloat *) calloc(level->Nrows,sizeof(dfloat));
+}
+
+void syncAgmgToDevice(agmgLevel *level, int k, int AMGstartLev, CycleType ctype) {
+  // Creates the device-side copies of the operators and work vectors of level k.
+  occa::device dev = level->A->device;
+
+  level->o_A = new parHYB(level->A);
+  level->o_A->syncToDevice();
+  const bool hasTransferOps = (k > AMGstartLev); // R and P are synced only past the first AMG level
+  if (hasTransferOps) {
+    level->o_R = new parHYB(level->R);
+    level->o_P = new parHYB(level->P);
+    level->o_R->syncToDevice();
+    level->o_P->syncToDevice();
+  }
+
+  // host vectors that were never allocated are skipped
+  if (level->x  ) level->o_x   = dev.malloc(level->Ncols*sizeof(dfloat),level->x);
+  if (level->rhs) level->o_rhs = dev.malloc(level->Nrows*sizeof(dfloat),level->rhs);
+  if (level->res) level->o_res = dev.malloc(level->Ncols*sizeof(dfloat),level->res);
+
+  const bool kcycleLevel = (ctype==KCYCLE) && (k>0) && (k<NUMKCYCLES+1);
+  if (!kcycleLevel) return; // same condition under which ck/vk/wk are allocated on the host
+  level->o_ck = dev.malloc(level->Ncols*sizeof(dfloat),level->ck);
+  level->o_vk = dev.malloc(level->Nrows*sizeof(dfloat),level->vk);
+  level->o_wk = dev.malloc(level->Nrows*sizeof(dfloat),level->wk);
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/agmgSetup/constructProlongation.cpp b/libs/parAlmond/src/agmgSetup/constructProlongation.cpp
new file mode 100644
index 000000000..9887a6af3
--- /dev/null
+++ b/libs/parAlmond/src/agmgSetup/constructProlongation.cpp
@@ -0,0 +1,124 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+// Builds the tentative prolongator P for the aggregation FineToCoarse: one nonzero per fine row.
+parCSR *constructProlongation(parCSR *A, hlong *FineToCoarse, hlong *globalAggStarts, dfloat **nullCoarseA){
+  // Row i holds A->null[i] in column FineToCoarse[i]; each column is then divided by its 2-norm, returned in *nullCoarseA.
+  int rank, size; // MPI info
+  MPI_Comm_rank(A->comm, &rank);
+  MPI_Comm_size(A->comm, &size);
+
+  const dlong N = A->Nrows;
+
+  const hlong globalAggOffset = globalAggStarts[rank];
+  const dlong NCoarse = (dlong) (globalAggStarts[rank+1]-globalAggStarts[rank]); //local num agg
+
+  parCSR* P = new parCSR(N, NCoarse, A->comm, A->device);
+
+  P->globalRowStarts = A->globalRowStarts; // shared with A, not copied
+  P->globalColStarts = globalAggStarts;    // P keeps the caller's array, so the caller must not free it
+
+  P->diag->rowStarts = (dlong *) calloc(N+1, sizeof(dlong));
+  P->offd->rowStarts = (dlong *) calloc(N+1, sizeof(dlong));
+
+  // each row has exactly one nonzero: count it as diag if this rank owns the aggregate, offd otherwise
+  for(dlong i=0; i<N; i++) {
+    hlong col = FineToCoarse[i];
+    if ((col>globalAggOffset-1)&&(col<globalAggOffset+NCoarse)) {
+      P->diag->rowStarts[i+1]++;
+    } else {
+      P->offd->rowStarts[i+1]++;
+    }
+  }
+  for(dlong i=0; i<N; i++) { // prefix sums turn the per-row counts into row offsets
+    P->diag->rowStarts[i+1] += P->diag->rowStarts[i];
+    P->offd->rowStarts[i+1] += P->offd->rowStarts[i];
+  }
+  P->diag->nnz = P->diag->rowStarts[N];
+  P->offd->nnz = P->offd->rowStarts[N];
+
+  // Halo setup: collect the global ids of the off-rank aggregates, in row order
+  hlong *colIds = (hlong *) malloc(P->offd->nnz*sizeof(hlong));
+  dlong cnt=0;
+  for (dlong i=0;i<N;i++) {
+    hlong col = FineToCoarse[i];
+    if ((col<globalAggOffset)||(col>globalAggOffset+NCoarse-1))
+      colIds[cnt++] = col;
+  }
+  P->haloSetup(colIds); // presumably rewrites colIds to local halo column indices and extends P->Ncols - confirm
+
+  P->diag->cols = (dlong *)  calloc(P->diag->nnz, sizeof(dlong));
+  P->diag->vals = (dfloat *) calloc(P->diag->nnz, sizeof(dfloat));
+  P->offd->cols = (dlong *)  calloc(P->offd->nnz, sizeof(dlong));
+  P->offd->vals = (dfloat *) calloc(P->offd->nnz, sizeof(dfloat));
+
+  dlong diagCnt = 0;
+  dlong offdCnt = 0;
+  for(dlong i=0; i<N; i++) {
+    hlong col = FineToCoarse[i];
+    if ((col>globalAggStarts[rank]-1)&&(col<globalAggStarts[rank+1])) {
+      P->diag->cols[diagCnt  ] = (dlong) (col - globalAggOffset); //local index
+      P->diag->vals[diagCnt++] = A->null[i];
+    } else {
+      P->offd->cols[offdCnt  ] = colIds[offdCnt];
+      P->offd->vals[offdCnt++] = A->null[i];
+    }
+  }
+  free(colIds); // no longer needed once the offd columns are stored (galerkinProd frees its colIds the same way)
+  // normalize the columns of P
+  *nullCoarseA = (dfloat *) calloc(P->Ncols,sizeof(dfloat));
+
+  //add local nonzeros
+  for(dlong i=0; i<P->diag->nnz; i++)
+    (*nullCoarseA)[P->diag->cols[i]] += P->diag->vals[i] * P->diag->vals[i];
+
+  //add nonlocal nonzeros
+  for(dlong i=0; i<P->offd->nnz; i++)
+    (*nullCoarseA)[P->offd->cols[i]] += P->offd->vals[i] * P->offd->vals[i];
+
+  // sum the squared contributions of all ranks that touch each column
+  ogsGatherScatter((*nullCoarseA), ogsDfloat,  ogsAdd, P->ogs);
+
+  for(dlong i=0; i<NCoarse; i++)
+    (*nullCoarseA)[i] = sqrt((*nullCoarseA)[i]);
+
+  for(dlong i=NCoarse; i<P->Ncols; i++)
+    (*nullCoarseA)[i] = 0.;
+
+  // halo copies were zeroed above, so this add hands them the owner's norm
+  ogsGatherScatter((*nullCoarseA), ogsDfloat,  ogsAdd, P->ogs);
+
+  for(dlong i=0; i<P->diag->nnz; i++)
+    P->diag->vals[i] /= (*nullCoarseA)[P->diag->cols[i]];
+  for(dlong i=0; i<P->offd->nnz; i++)
+    P->offd->vals[i] /= (*nullCoarseA)[P->offd->cols[i]];
+
+  return P;
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/agmgSetup/formAggregates.cpp b/libs/parAlmond/src/agmgSetup/formAggregates.cpp
new file mode 100644
index 000000000..12401a579
--- /dev/null
+++ b/libs/parAlmond/src/agmgSetup/formAggregates.cpp
@@ -0,0 +1,297 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+// Groups the fine nodes of A into aggregates using the strength graph C; C is deleted before returning.
+void formAggregates(parCSR *A, parCSR *C,
+                    hlong* FineToCoarse, hlong* globalAggStarts){
+  // Outputs: FineToCoarse[i] = global aggregate id of local column i, globalAggStarts[r] = first aggregate id of rank r.
+  int rank, size;
+  MPI_Comm_rank(A->comm, &rank);
+  MPI_Comm_size(A->comm, &size);
+
+  const dlong N   = C->Nrows;
+  const dlong M   = C->Ncols; // entries N..M-1 are treated as the halo below
+  const dlong diagNNZ = C->diag->nnz;
+  const dlong offdNNZ = C->offd->nnz;
+
+  dfloat *rands = (dfloat *) calloc(M, sizeof(dfloat));
+  int   *states = (int *)    calloc(M, sizeof(int)); // 0 = undecided, 1 = MIS node, -1 = removed
+  // T* hold, per node, the (weight, state, id, aggregate) of the best candidate seen so far
+  dfloat *Tr = (dfloat *) calloc(M, sizeof(dfloat));
+  int    *Ts = (int *)    calloc(M, sizeof(int));
+  hlong  *Ti = (hlong *)  calloc(M, sizeof(hlong));
+  hlong  *Tc = (hlong *)  calloc(M, sizeof(hlong));
+
+  hlong *globalRowStarts = A->globalRowStarts;
+
+  for(dlong i=0; i<N; i++)
+    rands[i] = (dfloat) drand48();
+
+  // add the number of non-zeros in each column
+  int *colCnt = (int *) calloc(M,sizeof(int));
+  for(dlong i=0; i<diagNNZ; i++)
+    colCnt[C->diag->cols[i]]++;
+
+  for(dlong i=0; i<offdNNZ; i++)
+    colCnt[C->offd->cols[i]]++;
+
+  //gs for total column counts
+  ogsGatherScatter(colCnt, ogsInt, ogsAdd, A->ogs);
+
+  //node weight = strong-connection count + random perturbation
+  for(dlong i=0;i<N;++i)
+    rands[i] += colCnt[i];
+
+  //gs to fill halo region
+  ogsGatherScatter(rands, ogsDfloat, ogsAdd, A->ogs);
+
+  hlong done = 0;
+  while(!done){
+    // first neighbours
+    // #pragma omp parallel for
+    for(dlong i=0; i<N; i++){
+      // customLess is defined elsewhere; presumably it orders (state, weight, id) triples - confirm
+      int smax = states[i];
+      dfloat rmax = rands[i];
+      hlong imax = i + globalRowStarts[rank];
+
+      if(smax != 1){
+        //local entries
+        for(dlong jj=C->diag->rowStarts[i];jj<C->diag->rowStarts[i+1];jj++){
+          const dlong col = C->diag->cols[jj];
+          if (col==i) continue;
+          if(customLess(smax, rmax, imax, states[col], rands[col], col + globalRowStarts[rank])){
+            smax = states[col];
+            rmax = rands[col];
+            imax = col + globalRowStarts[rank];
+          }
+        }
+        //nonlocal entries
+        for(dlong jj=C->offd->rowStarts[i];jj<C->offd->rowStarts[i+1];jj++){
+          const dlong col = C->offd->cols[jj];
+          if(customLess(smax, rmax, imax, states[col], rands[col], A->colMap[col])) {
+            smax = states[col];
+            rmax = rands[col];
+            imax = A->colMap[col];
+          }
+        }
+      }
+      Ts[i] = smax;
+      Tr[i] = rmax;
+      Ti[i] = imax;
+    }
+
+    //share results (halo copies are zeroed so the add leaves the owner's value)
+    for (dlong n=N;n<M;n++) {
+      Tr[n] = 0.;
+      Ts[n] = 0;
+      Ti[n] = 0;
+    }
+    ogsGatherScatter(Tr, ogsDfloat, ogsAdd, A->ogs);
+    ogsGatherScatter(Ts, ogsInt,    ogsAdd, A->ogs);
+    ogsGatherScatter(Ti, ogsHlong,  ogsAdd, A->ogs);
+
+    // second neighbours
+    // #pragma omp parallel for
+    for(dlong i=0; i<N; i++){
+      int    smax = Ts[i];
+      dfloat rmax = Tr[i];
+      hlong  imax = Ti[i];
+
+      //local entries
+      for(dlong jj=C->diag->rowStarts[i];jj<C->diag->rowStarts[i+1];jj++){
+        const dlong col = C->diag->cols[jj];
+        if (col==i) continue;
+        if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
+          smax = Ts[col];
+          rmax = Tr[col];
+          imax = Ti[col];
+        }
+      }
+      //nonlocal entries
+      for(dlong jj=C->offd->rowStarts[i];jj<C->offd->rowStarts[i+1];jj++){
+        const dlong col = C->offd->cols[jj];
+        if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
+          smax = Ts[col];
+          rmax = Tr[col];
+          imax = Ti[col];
+        }
+      }
+
+      // if I am the strongest among all the 1 and 2 ring neighbours
+      // I am an MIS node
+      if((states[i] == 0) && (imax == (i + globalRowStarts[rank])))
+        states[i] = 1;
+
+      // if there is an MIS node within distance 2, I am removed
+      if((states[i] == 0) && (smax == 1))
+        states[i] = -1;
+    }
+
+    //share results
+    for (dlong n=N;n<M;n++) states[n] = 0;
+    ogsGatherScatter(states, ogsInt, ogsAdd, A->ogs);
+
+    // if number of undecided nodes = 0, algorithm terminates
+    hlong cnt = std::count(states, states+N, 0);
+    MPI_Allreduce(&cnt,&done,1,MPI_HLONG, MPI_SUM,A->comm);
+    done = (done == 0) ? 1 : 0;
+  }
+
+  dlong numAggs = 0;
+  dlong *gNumAggs = (dlong *) calloc(size,sizeof(dlong));
+
+  // count the coarse nodes/aggregates
+  for(dlong i=0; i<N; i++)
+    if(states[i] == 1) numAggs++;
+
+  MPI_Allgather(&numAggs,1,MPI_DLONG,gNumAggs,1,MPI_DLONG,A->comm);
+
+  globalAggStarts[0] = 0;
+  for (int r=0;r<size;r++)
+    globalAggStarts[r+1] = globalAggStarts[r] + gNumAggs[r];
+
+  numAggs = 0;
+  // enumerate the coarse nodes/aggregates (-1 marks a node not yet assigned)
+  for(dlong i=0; i<N; i++) {
+    if(states[i] == 1) {
+      FineToCoarse[i] = globalAggStarts[rank] + numAggs++;
+    } else {
+      FineToCoarse[i] = -1;
+    }
+  }
+  for(dlong i=N; i<M; i++) FineToCoarse[i] = 0;
+
+  //share the initial aggregate flags
+  ogsGatherScatter(FineToCoarse, ogsHlong, ogsAdd, A->ogs);
+
+  // form the aggregates
+  // #pragma omp parallel for
+  for(dlong i=0; i<N; i++){
+    int   smax = states[i];
+    dfloat rmax = rands[i];
+    hlong  imax = i + globalRowStarts[rank];
+    hlong  cmax = FineToCoarse[i];
+
+    if(smax != 1){
+      //local entries
+      for(dlong jj=C->diag->rowStarts[i];jj<C->diag->rowStarts[i+1];jj++){
+        const dlong col = C->diag->cols[jj];
+        if (col==i) continue;
+        if(customLess(smax, rmax, imax, states[col], rands[col], col + globalRowStarts[rank])){
+          smax = states[col];
+          rmax = rands[col];
+          imax = col + globalRowStarts[rank];
+          cmax = FineToCoarse[col];
+        }
+      }
+      //nonlocal entries
+      for(dlong jj=C->offd->rowStarts[i];jj<C->offd->rowStarts[i+1];jj++){
+        const dlong col = C->offd->cols[jj];
+        if(customLess(smax, rmax, imax, states[col], rands[col], A->colMap[col])){
+          smax = states[col];
+          rmax = rands[col];
+          imax = A->colMap[col];
+          cmax = FineToCoarse[col];
+        }
+      }
+    }
+    Ts[i] = smax;
+    Tr[i] = rmax;
+    Ti[i] = imax;
+    Tc[i] = cmax;
+    // a removed node joins the aggregate of the MIS node found among its direct neighbours
+    if((states[i] == -1) && (smax == 1) && (cmax > -1))
+      FineToCoarse[i] = cmax;
+  }
+
+  //share results
+  for (dlong n=N;n<M;n++) {
+    FineToCoarse[n] = 0;
+    Tr[n] = 0.;
+    Ts[n] = 0;
+    Ti[n] = 0;
+    Tc[n] = 0;
+  }
+  ogsGatherScatter(FineToCoarse, ogsHlong,  ogsAdd, A->ogs);
+  ogsGatherScatter(Tr,     ogsDfloat, ogsAdd, A->ogs);
+  ogsGatherScatter(Ts,     ogsInt,    ogsAdd, A->ogs);
+  ogsGatherScatter(Ti,     ogsHlong,  ogsAdd, A->ogs);
+  ogsGatherScatter(Tc,     ogsHlong,  ogsAdd, A->ogs);
+
+  // second neighbours
+  // #pragma omp parallel for
+  for(dlong i=0; i<N; i++){
+    int    smax = Ts[i];
+    dfloat rmax = Tr[i];
+    hlong  imax = Ti[i];
+    hlong  cmax = Tc[i];
+
+    //local entries
+    for(dlong jj=C->diag->rowStarts[i];jj<C->diag->rowStarts[i+1];jj++){
+      const dlong col = C->diag->cols[jj];
+      if (col==i) continue;
+      if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
+        smax = Ts[col];
+        rmax = Tr[col];
+        imax = Ti[col];
+        cmax = Tc[col];
+      }
+    }
+    //nonlocal entries
+    for(dlong jj=C->offd->rowStarts[i];jj<C->offd->rowStarts[i+1];jj++){
+      const dlong col = C->offd->cols[jj];
+      if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
+        smax = Ts[col];
+        rmax = Tr[col];
+        imax = Ti[col];
+        cmax = Tc[col];
+      }
+    }
+
+    if((states[i] == -1) && (smax == 1) && (cmax > -1))
+      FineToCoarse[i] = cmax;
+  }
+
+  //share results
+  for (dlong n=N;n<M;n++) FineToCoarse[n] = 0;
+  ogsGatherScatter(FineToCoarse, ogsHlong,  ogsAdd, A->ogs);
+
+  free(rands);
+  free(states);
+  free(Tr);
+  free(Ts);
+  free(Ti);
+  free(Tc);
+  free(colCnt);  free(gNumAggs); // counting buffers, released with the rest of the scratch arrays
+  delete C;
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/agmgSetup/galerkinProd.cpp b/libs/parAlmond/src/agmgSetup/galerkinProd.cpp
new file mode 100644
index 000000000..1bccf216a
--- /dev/null
+++ b/libs/parAlmond/src/agmgSetup/galerkinProd.cpp
@@ -0,0 +1,273 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+parCSR *galerkinProd(parCSR *A, parCSR *P){
+  // Returns a newly allocated coarse operator Ac = P^T A P whose rows and columns are partitioned by P->globalColStarts.
+  // MPI info
+  int rank, size;
+  MPI_Comm_rank(A->comm, &rank);
+  MPI_Comm_size(A->comm, &size);
+
+  hlong *globalAggStarts = P->globalColStarts;
+  hlong globalAggOffset = globalAggStarts[rank];
+
+  //The galerkin product can be computed as
+  // (P^T A P)_IJ = sum_{i in Agg_I} sum_{j in Agg_J} P_iI A_ij P_jJ
+  // Since each row of P has only one entry, we can share the necessary
+  // P entries, form the products, and send them to their destination rank
+
+  const dlong N = A->Nrows;
+  const dlong M = A->Ncols;
+
+  //printf("Level has %d rows, and is making %d aggregates\n", N, globalAggStarts[rank+1]-globalAggStarts[rank]);
+  // Pcols/Pvals: global aggregate id and P value of each fine node, indexed like A's local columns
+  hlong  *Pcols = (hlong  *) calloc(M,sizeof(hlong));
+  dfloat *Pvals = (dfloat *) calloc(M,sizeof(dfloat));
+
+  //record the entries of P that this rank has (one per row, so cnt advances with the row index)
+  dlong cnt =0;
+  for (dlong i=0;i<N;i++) {
+    for (dlong j=P->diag->rowStarts[i];j<P->diag->rowStarts[i+1];j++) {
+      Pcols[cnt] = P->diag->cols[j] + globalAggOffset; //global ID
+      Pvals[cnt] = P->diag->vals[j];
+      cnt++;
+    }
+    for (dlong j=P->offd->rowStarts[i];j<P->offd->rowStarts[i+1];j++) {
+      Pcols[cnt] = P->colMap[P->offd->cols[j]]; //global ID
+      Pvals[cnt] = P->offd->vals[j];
+      cnt++;
+    }
+  }
+
+  //fill the halo region (entries N..M-1 are still zero from calloc, so the add copies the owner's value)
+  ogsGatherScatter(Pcols, ogsHlong,  ogsAdd, A->ogs);
+  ogsGatherScatter(Pvals, ogsDfloat, ogsAdd, A->ogs);
+
+  // every stored entry of A produces exactly one (coarse row, coarse col, value) contribution
+
+  dlong sendNtotal = A->diag->nnz+A->offd->nnz;
+  nonzero_t *sendPTAP = (nonzero_t *) calloc(sendNtotal,sizeof(nonzero_t));
+
+  // Make the MPI_NONZERO_T data type; displacements come from a live struct so padding is respected
+  nonzero_t NZ;
+  MPI_Datatype MPI_NONZERO_T;
+  MPI_Datatype dtype[3] = {MPI_HLONG, MPI_HLONG, MPI_DFLOAT};
+  int blength[3] = {1, 1, 1};
+  MPI_Aint addr[3], displ[3];
+  MPI_Get_address ( &(NZ.row), addr+0);
+  MPI_Get_address ( &(NZ.col), addr+1);
+  MPI_Get_address ( &(NZ.val), addr+2);
+  displ[0] = 0;
+  displ[1] = addr[1] - addr[0];
+  displ[2] = addr[2] - addr[0];
+  MPI_Type_create_struct (3, blength, displ, dtype, &MPI_NONZERO_T);
+  MPI_Type_commit (&MPI_NONZERO_T);
+
+  //form the fine PTAP products: A_ij contributes P_i * A_ij * P_j to entry (agg(i), agg(j))
+  cnt =0;
+  for (dlong i=0;i<N;i++) {
+    dlong start = A->diag->rowStarts[i];
+    dlong end   = A->diag->rowStarts[i+1];
+    for (dlong j=start;j<end;j++) {
+      const dlong  col = A->diag->cols[j];
+      const dfloat val = A->diag->vals[j];
+
+      sendPTAP[cnt].row = Pcols[i];
+      sendPTAP[cnt].col = Pcols[col];
+      sendPTAP[cnt].val = val*Pvals[i]*Pvals[col];
+      cnt++;
+    }
+    start = A->offd->rowStarts[i];
+    end   = A->offd->rowStarts[i+1];
+    for (dlong j=start;j<end;j++) {
+      const dlong  col = A->offd->cols[j];
+      const dfloat val = A->offd->vals[j];
+
+      sendPTAP[cnt].row = Pcols[i];
+      sendPTAP[cnt].col = Pcols[col];
+      sendPTAP[cnt].val = val*Pvals[i]*Pvals[col];
+      cnt++;
+    }
+  }
+
+  free(Pcols);
+  free(Pvals);
+
+  //sort entries by the coarse row and col
+  qsort(sendPTAP, sendNtotal, sizeof(nonzero_t), compareNonZeroByRow);
+
+  //count number of non-zeros we're sending
+  int *sendCounts = (int *) calloc(size,sizeof(int));
+  int *recvCounts = (int *) calloc(size,sizeof(int));
+  int *sendOffsets = (int *) calloc(size+1,sizeof(int));
+  int *recvOffsets = (int *) calloc(size+1,sizeof(int));
+  // the rows are sorted, so the destination rank r only ever moves forward
+  int r=0;
+  for(dlong i=0;i<sendNtotal;++i) {
+    hlong id = sendPTAP[i].row;
+    while(id>=globalAggStarts[r+1]) r++;
+    sendCounts[r]++;
+  }
+
+  // find how many nodes to expect (should use sparse version)
+  MPI_Alltoall(sendCounts, 1, MPI_INT,
+               recvCounts, 1, MPI_INT, A->comm);
+
+  // find send and recv offsets for gather
+  for(int r=0;r<size;++r){
+    sendOffsets[r+1] = sendOffsets[r] + sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r] + recvCounts[r];
+  }
+  dlong recvNtotal = recvOffsets[size];
+
+  nonzero_t *recvPTAP = (nonzero_t *) calloc(recvNtotal,sizeof(nonzero_t));
+  // deliver every contribution to the rank that owns its coarse row
+  MPI_Alltoallv(sendPTAP, sendCounts, sendOffsets, MPI_NONZERO_T,
+                recvPTAP, recvCounts, recvOffsets, MPI_NONZERO_T,
+                A->comm);
+
+  //clean up
+  MPI_Barrier(A->comm);
+  free(sendPTAP);
+  free(sendCounts); free(recvCounts);
+  free(sendOffsets); free(recvOffsets);
+
+  //sort entries by the coarse row and col
+  qsort(recvPTAP, recvNtotal, sizeof(nonzero_t), compareNonZeroByRow);
+
+  //count the distinct (row,col) pairs; duplicates are adjacent after the sort
+  dlong nnz =0;
+  if (recvNtotal) nnz++;
+  for (dlong i=1;i<recvNtotal;i++)
+    if ((recvPTAP[i].row!=recvPTAP[i-1].row)||
+        (recvPTAP[i].col!=recvPTAP[i-1].col)) nnz++;
+
+  nonzero_t *PTAP = (nonzero_t *) calloc(nnz,sizeof(nonzero_t));
+
+  //compress nonzeros: contributions to the same (row,col) are summed into one entry
+  nnz = 0;
+  if (recvNtotal) PTAP[nnz++] = recvPTAP[0];
+  for (dlong i=1;i<recvNtotal;i++) {
+    if ((recvPTAP[i].row!=recvPTAP[i-1].row)||
+        (recvPTAP[i].col!=recvPTAP[i-1].col)) {
+      PTAP[nnz++] = recvPTAP[i];
+    } else {
+      PTAP[nnz-1].val += recvPTAP[i].val;
+    }
+  }
+
+  //clean up
+  MPI_Barrier(A->comm);
+  free(recvPTAP);
+
+  dlong numAggs = (dlong) (globalAggStarts[rank+1]-globalAggStarts[rank]); //local number of aggregates
+
+  parCSR *Ac = new parCSR(numAggs, numAggs, A->comm, A->device);
+
+  Ac->globalRowStarts = globalAggStarts;
+  Ac->globalColStarts = globalAggStarts;
+
+  Ac->diag->rowStarts = (dlong *) calloc(numAggs+1, sizeof(dlong));
+  Ac->offd->rowStarts = (dlong *) calloc(numAggs+1, sizeof(dlong));
+  // per local row, count the entries whose column is owned by this rank (diag) or by another rank (offd)
+  for (dlong n=0;n<nnz;n++) {
+    dlong row = (dlong) (PTAP[n].row - globalAggOffset);
+    if ((PTAP[n].col > globalAggStarts[rank]-1)&&
+        (PTAP[n].col < globalAggStarts[rank+1])) {
+      Ac->diag->rowStarts[row+1]++;
+    } else {
+      Ac->offd->rowStarts[row+1]++;
+    }
+  }
+
+  // cumulative sum
+  for(dlong i=0; i<numAggs; i++) {
+    Ac->diag->rowStarts[i+1] += Ac->diag->rowStarts[i];
+    Ac->offd->rowStarts[i+1] += Ac->offd->rowStarts[i];
+  }
+  Ac->diag->nnz = Ac->diag->rowStarts[numAggs];
+  Ac->offd->nnz = Ac->offd->rowStarts[numAggs];
+
+  // Halo setup: gather the global column ids of the offd entries, in storage order
+  hlong *colIds = (hlong *) malloc(Ac->offd->nnz*sizeof(hlong));
+  cnt=0;
+  for (dlong n=0;n<nnz;n++) {
+    if ((PTAP[n].col <= (globalAggStarts[rank]-1))||
+        (PTAP[n].col >= globalAggStarts[rank+1])) {
+      colIds[cnt++] = PTAP[n].col;
+    }
+  }
+  Ac->haloSetup(colIds);
+
+  //fill the CSR matrices (colIds is read back below as the offd column index - presumably localised by haloSetup, confirm)
+  Ac->diagA   = (dfloat *) calloc(Ac->Ncols, sizeof(dfloat));
+  Ac->diagInv = (dfloat *) calloc(Ac->Ncols, sizeof(dfloat));
+  Ac->diag->cols = (dlong *)  calloc(Ac->diag->nnz, sizeof(dlong));
+  Ac->offd->cols = (dlong *)  calloc(Ac->offd->nnz, sizeof(dlong));
+  Ac->diag->vals = (dfloat *) calloc(Ac->diag->nnz, sizeof(dfloat));
+  Ac->offd->vals = (dfloat *) calloc(Ac->offd->nnz, sizeof(dfloat));
+  dlong diagCnt = 0;
+  dlong offdCnt = 0;
+  for (dlong n=0;n<nnz;n++) {
+    if ((PTAP[n].col > globalAggStarts[rank]-1)&&
+        (PTAP[n].col < globalAggStarts[rank+1])) {
+      Ac->diag->cols[diagCnt] = (dlong) (PTAP[n].col - globalAggOffset);
+      Ac->diag->vals[diagCnt] = PTAP[n].val;
+
+      //record the diagonal
+      dlong row = (dlong) (PTAP[n].row - globalAggOffset);
+      if (row==Ac->diag->cols[diagCnt])
+        Ac->diagA[row] = Ac->diag->vals[diagCnt];
+
+      diagCnt++;
+    } else {
+      Ac->offd->cols[offdCnt] = colIds[offdCnt];
+      Ac->offd->vals[offdCnt] = PTAP[n].val;
+      offdCnt++;
+    }
+  }
+
+  //compute the inverse diagonal; NOTE(review): a row with no stored diagonal keeps diagA == 0 and divides by zero here
+  for (dlong n=0;n<Ac->Nrows;n++) Ac->diagInv[n] = 1.0/Ac->diagA[n];
+
+  //propagate nullspace flag
+  Ac->nullSpace = A->nullSpace;
+  Ac->nullSpacePenalty = A->nullSpacePenalty;
+
+  //clean up
+  MPI_Barrier(A->comm);
+  MPI_Type_free(&MPI_NONZERO_T);
+  free(colIds);
+  free(PTAP);
+
+  return Ac;
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/agmgSetup/strongGraph.cpp b/libs/parAlmond/src/agmgSetup/strongGraph.cpp
new file mode 100644
index 000000000..89766b67f
--- /dev/null
+++ b/libs/parAlmond/src/agmgSetup/strongGraph.cpp
@@ -0,0 +1,147 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+parCSR* strongGraph(parCSR *A){
+  // Returns a new parCSR holding only the sparsity pattern of A's strong connections (plus the diagonal).
+  const dlong N = A->Nrows;
+  const dlong M = A->Ncols;
+
+  parCSR *C = new parCSR(N, M);
+
+  C->diag->rowStarts = (dlong *) calloc(N+1,sizeof(dlong));
+  C->offd->rowStarts = (dlong *) calloc(N+1,sizeof(dlong));
+
+  dfloat *maxOD = NULL; // per-row maximum of the strength measure; stays NULL when there are no rows
+  if (N) maxOD = (dfloat *) calloc(N,sizeof(dfloat));
+
+  dfloat *diagA = A->diagA;
+  // strength of (i,j): OD = -sign(A_ii)*A_ij/sqrt(|A_ii||A_jj|); strong when OD > COARSENTHREASHOLD*max_j OD
+  // #pragma omp parallel for
+  for(dlong i=0; i<N; i++){
+    const int sign = (diagA[i] >= 0) ? 1:-1;
+    const dfloat Aii = fabs(diagA[i]);
+
+    //find maxOD
+    //local entries
+    dlong Jstart = A->diag->rowStarts[i];
+    dlong Jend   = A->diag->rowStarts[i+1];
+    for(dlong jj= Jstart; jj<Jend; jj++){
+      dlong col = A->diag->cols[jj];
+      if (col==i) continue;
+      dfloat Ajj = fabs(diagA[col]);
+      dfloat OD = -sign*A->diag->vals[jj]/(sqrt(Aii)*sqrt(Ajj));
+      if(OD > maxOD[i]) maxOD[i] = OD;
+    }
+    //non-local entries
+    Jstart = A->offd->rowStarts[i];
+    Jend   = A->offd->rowStarts[i+1];
+    for(dlong jj= Jstart; jj<Jend; jj++){
+      dlong col = A->offd->cols[jj];
+      dfloat Ajj = fabs(diagA[col]); // assumes diagA also holds the halo diagonal values - TODO confirm
+      dfloat OD = -sign*A->offd->vals[jj]/(sqrt(Aii)*sqrt(Ajj));
+      if(OD > maxOD[i]) maxOD[i] = OD;
+    }
+
+    int diag_strong_per_row = 1; // diagonal entry (assumes every row stores its diagonal - confirm)
+    //local entries
+    Jstart = A->diag->rowStarts[i];
+    Jend   = A->diag->rowStarts[i+1];
+    for(dlong jj = Jstart; jj<Jend; jj++){
+      dlong col = A->diag->cols[jj];
+      if (col==i) continue;
+      dfloat Ajj = fabs(diagA[col]);
+      dfloat OD = -sign*A->diag->vals[jj]/(sqrt(Aii)*sqrt(Ajj));
+      if(OD > COARSENTHREASHOLD*maxOD[i]) diag_strong_per_row++;
+    }
+    int offd_strong_per_row = 0;
+    //non-local entries
+    Jstart = A->offd->rowStarts[i]; Jend = A->offd->rowStarts[i+1];
+    for(dlong jj= Jstart; jj<Jend; jj++){
+      dlong col = A->offd->cols[jj];
+      dfloat Ajj = fabs(diagA[col]);
+      dfloat OD = -sign*A->offd->vals[jj]/(sqrt(Aii)*sqrt(Ajj));
+      if(OD > COARSENTHREASHOLD*maxOD[i]) offd_strong_per_row++;
+    }
+
+    C->diag->rowStarts[i+1] = diag_strong_per_row;
+    C->offd->rowStarts[i+1] = offd_strong_per_row;
+  }
+
+  // cumulative sum
+  for(dlong i=1; i<N+1 ; i++) {
+    C->diag->rowStarts[i] += C->diag->rowStarts[i-1];
+    C->offd->rowStarts[i] += C->offd->rowStarts[i-1];
+  }
+  C->diag->nnz = C->diag->rowStarts[N];
+  C->offd->nnz = C->offd->rowStarts[N];
+
+  C->diag->cols = (dlong *) malloc(C->diag->nnz*sizeof(dlong));
+  C->offd->cols = (dlong *) malloc(C->offd->nnz*sizeof(dlong));
+  // only the sparsity pattern is recorded:
+  // diag->vals and offd->vals are not allocated here
+
+  // fill in the columns for strong connections
+  // #pragma omp parallel for
+  for(dlong i=0; i<N; i++){
+    const int sign = (diagA[i] >= 0) ? 1:-1;
+    const dfloat Aii = fabs(diagA[i]);
+
+    dlong diagCounter = C->diag->rowStarts[i];
+    dlong offdCounter = C->offd->rowStarts[i];
+
+    //local entries
+    dlong Jstart = A->diag->rowStarts[i];
+    dlong Jend   = A->diag->rowStarts[i+1];
+    for(dlong jj = Jstart; jj<Jend; jj++){
+      dlong col = A->diag->cols[jj];
+      if (col==i) {
+        C->diag->cols[diagCounter++] = col;// diag entry
+        continue;
+      }
+      dfloat Ajj = fabs(diagA[col]);
+      dfloat OD = -sign*A->diag->vals[jj]/(sqrt(Aii)*sqrt(Ajj));
+      if(OD > COARSENTHREASHOLD*maxOD[i])
+        C->diag->cols[diagCounter++] = col;
+    }
+    Jstart = A->offd->rowStarts[i]; Jend = A->offd->rowStarts[i+1];
+    for(dlong jj = Jstart; jj<Jend; jj++){
+      dlong col = A->offd->cols[jj];
+      dfloat Ajj = fabs(diagA[col]);
+      dfloat OD = -sign*A->offd->vals[jj]/(sqrt(Aii)*sqrt(Ajj));
+      if(OD > COARSENTHREASHOLD*maxOD[i])
+        C->offd->cols[offdCounter++] = col;
+    }
+  }
+  free(maxOD); // free(NULL) is a no-op, so no N guard is needed
+
+  return C;
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/agmgSetup/transpose.cpp b/libs/parAlmond/src/agmgSetup/transpose.cpp
new file mode 100644
index 000000000..cc1801808
--- /dev/null
+++ b/libs/parAlmond/src/agmgSetup/transpose.cpp
@@ -0,0 +1,188 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+parCSR *transpose(parCSR *A){
+  // Builds the transpose of A in a parCSR allocated with new and returns it.
+  // MPI info
+  int rank, size;
+  MPI_Comm_rank(A->comm, &rank);
+  MPI_Comm_size(A->comm, &size);
+
+  hlong *globalRowStarts = A->globalRowStarts;
+  hlong *globalColStarts = A->globalColStarts;
+
+  // the local row and column counts of A swap roles in the transpose
+  dlong Nrows = (dlong) (globalColStarts[rank+1]-globalColStarts[rank]);
+  dlong Ncols = (dlong) (globalRowStarts[rank+1]-globalRowStarts[rank]);
+
+  parCSR *At = new parCSR(Nrows, Ncols, A->comm, A->device);
+  // NOTE(review): At shares A's partition arrays (the pointers are copied, not the data)
+  At->globalRowStarts = globalColStarts;
+  At->globalColStarts = globalRowStarts;
+
+  At->diag = new CSR(At->Nrows, At->Ncols);
+  At->offd = new CSR(At->Nrows, At->Ncols);
+
+  At->diag->nnz = A->diag->nnz; //local entries remain local
+  At->diag->rowStarts = (dlong *) calloc(At->Nrows+1, sizeof(dlong));
+
+  //start with local entries
+  At->diag->cols = (dlong *)  calloc(At->diag->nnz, sizeof(dlong));
+  At->diag->vals = (dfloat *) calloc(At->diag->nnz, sizeof(dfloat));
+
+  // count the num of nonzeros per row for transpose
+  for(dlong i=0; i<A->diag->nnz; i++){
+    dlong row = A->diag->cols[i];
+    At->diag->rowStarts[row+1]++;
+  }
+
+  // cumulative sum for rows
+  for(dlong i=1; i<=At->Nrows; i++)
+    At->diag->rowStarts[i] += At->diag->rowStarts[i-1];
+  // per-row insertion cursors; dlong because they hold rowStarts values
+  dlong *counter = (dlong *) calloc(At->Nrows+1,sizeof(dlong));
+  for (dlong i=0; i<At->Nrows+1; i++)
+    counter[i] = At->diag->rowStarts[i];
+  // entry (i,col) of A's diag block becomes entry (col,i) of At's diag block
+  for(dlong i=0; i<A->Nrows; i++){
+    const dlong Jstart = A->diag->rowStarts[i];
+    const dlong Jend   = A->diag->rowStarts[i+1];
+
+    for(dlong jj=Jstart; jj<Jend; jj++){
+      dlong row = A->diag->cols[jj];
+      At->diag->cols[counter[row]] = i;
+      At->diag->vals[counter[row]] = A->diag->vals[jj];
+
+      counter[row]++;
+    }
+  }
+  free(counter);
+
+
+  // Make the MPI_NONZERO_T data type
+  nonzero_t NZ;
+  MPI_Datatype MPI_NONZERO_T;
+  MPI_Datatype dtype[3] = {MPI_HLONG, MPI_HLONG, MPI_DFLOAT};
+  int blength[3] = {1, 1, 1};
+  MPI_Aint addr[3], displ[3];
+  MPI_Get_address ( &(NZ.row), addr+0);
+  MPI_Get_address ( &(NZ.col), addr+1);
+  MPI_Get_address ( &(NZ.val), addr+2);
+  displ[0] = 0;
+  displ[1] = addr[1] - addr[0];
+  displ[2] = addr[2] - addr[0];
+  MPI_Type_create_struct (3, blength, displ, dtype, &MPI_NONZERO_T);
+  MPI_Type_commit (&MPI_NONZERO_T);
+
+  nonzero_t *sendNonZeros = (nonzero_t *) calloc(A->offd->nnz, sizeof(nonzero_t));
+
+  // copy data from nonlocal entries into send buffer
+  for(dlong i=0;i<A->Nrows;++i){
+    for (dlong j=A->offd->rowStarts[i];j<A->offd->rowStarts[i+1];j++) {
+      hlong col =  A->colMap[A->offd->cols[j]]; //global ids
+      sendNonZeros[j].row = col;
+      sendNonZeros[j].col = i + globalRowStarts[rank];     //global ids
+      sendNonZeros[j].val = A->offd->vals[j];
+    }
+  }
+
+  //sort by destination row
+  qsort(sendNonZeros, A->offd->nnz, sizeof(nonzero_t), compareNonZeroByRow);
+
+  //count number of non-zeros we're sending
+  int *sendCounts = (int*) calloc(size, sizeof(int));
+  int *recvCounts = (int*) calloc(size, sizeof(int));
+  int *sendOffsets = (int*) calloc(size+1, sizeof(int));
+  int *recvOffsets = (int*) calloc(size+1, sizeof(int));
+  // the buffer was just sorted by destination row, so the owning rank r only advances
+  int r=0;
+  for (dlong n=0;n<A->offd->nnz;n++) {
+    hlong row = sendNonZeros[n].row; // global id: keep the full hlong width
+    while(row>=globalColStarts[r+1]) r++;
+    sendCounts[r]++;
+  }
+
+  MPI_Alltoall(sendCounts, 1, MPI_INT,
+               recvCounts, 1, MPI_INT, A->comm);
+
+  for (r=0;r<size;r++) {
+    sendOffsets[r+1] = sendOffsets[r]+sendCounts[r];
+    recvOffsets[r+1] = recvOffsets[r]+recvCounts[r];
+  }
+  At->offd->nnz = recvOffsets[size]; //total nonzeros
+
+  nonzero_t *recvNonZeros = (nonzero_t *) calloc(At->offd->nnz, sizeof(nonzero_t));
+
+  MPI_Alltoallv(sendNonZeros, sendCounts, sendOffsets, MPI_NONZERO_T,
+                recvNonZeros, recvCounts, recvOffsets, MPI_NONZERO_T,
+                A->comm);
+
+  //clean up
+  MPI_Barrier(A->comm);
+  free(sendNonZeros);
+  free(sendCounts);
+  free(recvCounts);
+  free(sendOffsets);
+  free(recvOffsets);
+  MPI_Type_free(&MPI_NONZERO_T); // release the datatype committed above
+  //sort by row
+  qsort(recvNonZeros, At->offd->nnz, sizeof(nonzero_t), compareNonZeroByRow);
+
+  hlong globalRowOffset = At->globalRowStarts[rank];
+
+  hlong *colIds = (hlong *) malloc(At->offd->nnz*sizeof(hlong));
+  // haloSetup receives the global column ids (presumably remapped in place to local indices - TODO confirm)
+  for (dlong n=0;n<At->offd->nnz;n++) {
+    colIds[n] = recvNonZeros[n].col;
+  }
+  At->haloSetup(colIds);
+
+  //fill the CSR matrix
+  At->offd->rowStarts = (dlong *) calloc(At->Nrows+1, sizeof(dlong));
+  At->offd->cols = (dlong *)  calloc(At->offd->nnz, sizeof(dlong));
+  At->offd->vals = (dfloat *) calloc(At->offd->nnz, sizeof(dfloat));
+  for (dlong n=0;n<At->offd->nnz;n++) {
+    dlong row = (dlong) (recvNonZeros[n].row - globalRowOffset);
+    At->offd->rowStarts[row+1]++;
+    At->offd->cols[n] = colIds[n];
+    At->offd->vals[n] = recvNonZeros[n].val;
+  }
+
+  // cumulative sum for rows
+  for(dlong i=1; i<=At->Nrows; i++)
+    At->offd->rowStarts[i] += At->offd->rowStarts[i-1];
+
+  MPI_Barrier(A->comm);
+  free(recvNonZeros);
+  free(colIds);
+
+  return At;
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/agmgSmoother.cpp b/libs/parAlmond/src/agmgSmoother.cpp
new file mode 100644
index 000000000..4f1e3df62
--- /dev/null
+++ b/libs/parAlmond/src/agmgSmoother.cpp
@@ -0,0 +1,200 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+void agmgLevel::smoothJacobi(dfloat *r, dfloat *x,
+                             const bool x_is_zero) {
+  // One Jacobi sweep on host arrays; x_is_zero lets the caller skip the A*x product.
+  // x = x + inv(D)*(b-A*x)
+  if(x_is_zero){
+    vectorDotStar(Nrows,1.0,A->diagInv,r,0.0,x);
+    return;
+  }
+  // read scratch on every call: a function-local static is initialised once and shared by all levels
+  dfloat *res = (dfloat *) scratch;
+
+  A->SpMV(-1.0, x, 1.0, r, res);
+  vectorDotStar(Nrows, 1.0, A->diagInv, res, 1.0, x);
+}
+
+
+void agmgLevel::smoothDampedJacobi(dfloat *r, dfloat *x,
+                                   const bool x_is_zero) {
+  // One damped Jacobi sweep on host arrays; the damping factor is the member lambda.
+  // x = x + alpha*inv(D)*(b-A*x)
+  if(x_is_zero){
+    vectorDotStar(Nrows,lambda,A->diagInv,r,0.0,x);
+    return;
+  }
+  // read scratch on every call: a function-local static is initialised once and shared by all levels
+  dfloat *res = (dfloat *) scratch;
+
+  A->SpMV(-1.0, x, 1.0, r, res);
+  vectorDotStar(Nrows, lambda, A->diagInv, res, 1.0, x);
+}
+
+void agmgLevel::smoothChebyshev(dfloat *r, dfloat *x,
+                                const bool x_is_zero) {
+  // Chebyshev smoother on host arrays, ChebyshevIterations steps, driven by lambda0 and lambda1.
+  const dfloat theta = 0.5*(lambda1+lambda0);
+  const dfloat delta = 0.5*(lambda1-lambda0);
+  const dfloat invTheta = 1.0/theta;
+  const dfloat sigma = theta/delta;
+  dfloat rho_n = 1./sigma;
+  dfloat rho_np1;
+  // computed per call: function-local statics would freeze the first caller's scratch and Ncols offsets
+  dfloat *res = ((dfloat*) scratch) + 0*Ncols;
+  dfloat *Ad  = ((dfloat*) scratch) + 1*Ncols;
+  dfloat *d   = ((dfloat*) scratch) + 2*Ncols;
+
+  if(x_is_zero){ //skip the Ax if x is zero
+    //res = D^{-1}r
+    vectorDotStar(Nrows, 1.0, A->diagInv, r, 0.0, res);
+    vectorSet(Nrows, 0.0, x);
+    //d = invTheta*res
+    vectorAdd(Nrows, invTheta, res, 0.0, d);
+  } else {
+    //res = D^{-1}(r-Ax)
+    A->SpMV(-1.0, x, 1.0, r, res);
+    vectorDotStar(Nrows, A->diagInv, res);
+
+    //d = invTheta*res
+    vectorAdd(Nrows, invTheta, res, 0.0, d);
+  }
+
+  for (int k=0;k<ChebyshevIterations;k++) {
+    //x_k+1 = x_k + d_k
+    vectorAdd(Nrows, 1.0, d, 1.0, x);
+
+    //r_k+1 = r_k - D^{-1}Ad_k
+    A->SpMV(1.0, d, 0.0, Ad);
+    vectorDotStar(Nrows, -1.0, A->diagInv, Ad, 1.0, res);
+
+    rho_np1 = 1.0/(2.*sigma-rho_n);
+
+    //d_k+1 = rho_k+1*rho_k*d_k  + 2*rho_k+1*r_k+1/delta
+    vectorAdd(Nrows, 2.0*rho_np1/delta, res, rho_np1*rho_n, d);
+    rho_n = rho_np1;
+  }
+  //x_k+1 = x_k + d_k
+  vectorAdd(Nrows, 1.0, d, 1.0, x);
+}
+
+void agmgLevel::smoothJacobi(occa::memory o_r, occa::memory o_x,
+                             bool x_is_zero) {
+  // One Jacobi sweep on device buffers; x_is_zero lets the caller skip the A*x product.
+  // occaTimerTic(parAlmond->device,"device smoothJacobi");
+  if(x_is_zero){
+    vectorDotStar(Nrows, 1.0, o_A->o_diagInv, o_r, 0.0, o_x);
+    // occaTimerToc(parAlmond->device,"device smoothJacobi");
+    return;
+  }
+  // take the handle on every call: a function-local static is initialised once and shared by all levels
+  occa::memory o_res = o_scratch;
+
+  // res = r-A*x
+  o_A->SpMV(-1.0, o_x, 1.0, o_r, o_res);
+
+  // x = x + inv(D)*res
+  vectorDotStar(Nrows, 1.0, o_A->o_diagInv, o_res, 1.0, o_x);
+  // occaTimerToc(parAlmond->device,"hyb smoothJacobi");
+}
+
+void agmgLevel::smoothDampedJacobi(occa::memory o_r, occa::memory o_x,
+                                   bool x_is_zero){
+  // One damped Jacobi sweep on device buffers; the damping factor is the member lambda.
+  // occaTimerTic(parAlmond->device,"device smoothDampedJacobi");
+  if(x_is_zero){
+    vectorDotStar(Nrows, lambda, o_A->o_diagInv, o_r, 0.0, o_x);
+    // occaTimerToc(parAlmond->device,"device smoothDampedJacobi");
+    return;
+  }
+  // take the handle on every call: a function-local static is initialised once and shared by all levels
+  occa::memory o_res = o_scratch;
+
+  // res = r-A*x
+  o_A->SpMV(-1.0, o_x, 1.0, o_r, o_res);
+
+  // x = x + lambda*inv(D)*res
+  vectorDotStar(Nrows, lambda, o_A->o_diagInv, o_res, 1.0, o_x);
+  // occaTimerToc(parAlmond->device,"device smoothDampedJacobi");
+}
+
+void agmgLevel::smoothChebyshev(occa::memory o_r, occa::memory o_x,
+                                bool x_is_zero) {
+  // Chebyshev smoother on device buffers, ChebyshevIterations steps, driven by lambda0 and lambda1.
+  const dfloat theta = 0.5*(lambda1+lambda0);
+  const dfloat delta = 0.5*(lambda1-lambda0);
+  const dfloat invTheta = 1.0/theta;
+  const dfloat sigma = theta/delta;
+  dfloat rho_n = 1./sigma;
+  dfloat rho_np1;
+  // sliced per call: function-local statics would freeze the first caller's o_scratch and Ncols offsets
+  occa::memory o_res = o_scratch + 0*Ncols*sizeof(dfloat);
+  occa::memory o_Ad  = o_scratch + 1*Ncols*sizeof(dfloat);
+  occa::memory o_d   = o_scratch + 2*Ncols*sizeof(dfloat);
+
+  // occaTimerTic(parAlmond->device,"device smoothChebyshev");
+
+  if(x_is_zero){ //skip the Ax if x is zero
+    //res = D^{-1}r
+    vectorDotStar(Nrows, 1.0, o_A->o_diagInv, o_r, 0.0, o_res);
+    vectorSet(Nrows, 0.0, o_x);
+    //d = invTheta*res
+    vectorAdd(Nrows, invTheta, o_res, 0.0, o_d);
+  } else {
+    //res = D^{-1}(r-Ax)
+    o_A->SpMV(-1.0, o_x, 1.0, o_r, o_res);
+    vectorDotStar(Nrows, o_A->o_diagInv, o_res);
+
+    //d = invTheta*res
+    vectorAdd(Nrows, invTheta, o_res, 0.0, o_d);
+  }
+
+  for (int k=0;k<ChebyshevIterations;k++) {
+    //x_k+1 = x_k + d_k
+    vectorAdd(Nrows, 1.0, o_d, 1.0, o_x);
+
+    //r_k+1 = r_k - D^{-1}Ad_k
+    o_A->SpMV(1.0, o_d, 0.0, o_Ad);
+    vectorDotStar(Nrows, -1.0, o_A->o_diagInv, o_Ad, 1.0, o_res);
+
+    rho_np1 = 1.0/(2.*sigma-rho_n);
+
+    //d_k+1 = rho_k+1*rho_k*d_k  + 2*rho_k+1*r_k+1/delta
+    vectorAdd(Nrows, 2.0*rho_np1/delta, o_res, rho_np1*rho_n, o_d);
+    rho_n = rho_np1;
+  }
+  //x_k+1 = x_k + d_k
+  vectorAdd(Nrows, 1.0, o_d, 1.0, o_x);
+
+  // occaTimerToc(parAlmond->device,"device smoothChebyshev");
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/coarseSolver.cpp b/libs/parAlmond/src/coarseSolver.cpp
new file mode 100644
index 000000000..90630f04f
--- /dev/null
+++ b/libs/parAlmond/src/coarseSolver.cpp
@@ -0,0 +1,389 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+coarseSolver::coarseSolver(setupAide options_) {
+  this->options     = options_; // keep a copy of the caller's options
+  this->gatherLevel = false;    // solve() takes its ogs gather/scatter path only when this is true
+}
+
+int coarseSolver::getTargetSize() {
+  return 1000; // fixed value; NOTE(review): presumably the coarsest-level size the setup aims for - confirm against callers
+}
+
+//set up exact coarse solve: gather the full matrix on every rank and keep the local rows of its dense inverse
+void coarseSolver::setup(parCSR *A) {
+  // Gathers the whole coarse matrix on every rank, inverts it densely and keeps this rank's rows of the inverse.
+  comm = A->comm;
+
+  int rank, size;
+  MPI_Comm_rank(comm,&rank);
+  MPI_Comm_size(comm,&size);
+
+  //copy the global coarse partition as ints
+  coarseOffsets = (int* ) calloc(size+1,sizeof(int));
+  for (int r=0;r<size+1;r++) coarseOffsets[r] = (int) A->globalRowStarts[r];
+
+  coarseTotal   = coarseOffsets[size];
+  coarseOffset  = coarseOffsets[rank];
+
+  N = (int) A->Nrows;
+
+  int sendNNZ = (int) (A->diag->nnz+A->offd->nnz);
+  // every local entry is sent as a (global row, global col, value) triple:
+  // diag columns are shifted by coarseOffset, offd columns go through A->colMap
+
+
+  // if((rank==0)&&(options.compareArgs("VERBOSE","TRUE")))
+  //   printf("Setting up coarse solver...");fflush(stdout);
+
+  // Make the MPI_NONZERO_T data type
+  nonzero_t NZ;
+  MPI_Datatype MPI_NONZERO_T;
+  MPI_Datatype dtype[3] = {MPI_HLONG, MPI_HLONG, MPI_DFLOAT};
+  int blength[3] = {1, 1, 1};
+  MPI_Aint addr[3], displ[3];
+  MPI_Get_address ( &(NZ.row), addr+0);
+  MPI_Get_address ( &(NZ.col), addr+1);
+  MPI_Get_address ( &(NZ.val), addr+2);
+  displ[0] = 0;
+  displ[1] = addr[1] - addr[0];
+  displ[2] = addr[2] - addr[0];
+  MPI_Type_create_struct (3, blength, displ, dtype, &MPI_NONZERO_T);
+  MPI_Type_commit (&MPI_NONZERO_T);
+
+  nonzero_t *sendNonZeros = (nonzero_t *) calloc(sendNNZ, sizeof(nonzero_t));
+
+  //populate matrix
+  int cnt = 0;
+  for (int n=0;n<N;n++) {
+    int start = (int) A->diag->rowStarts[n];
+    int end   = (int) A->diag->rowStarts[n+1];
+    for (int m=start;m<end;m++) {
+      sendNonZeros[cnt].row = n + coarseOffset;
+      sendNonZeros[cnt].col = A->diag->cols[m] + coarseOffset;
+      sendNonZeros[cnt].val = A->diag->vals[m];
+      cnt++;
+    }
+    start = (int) A->offd->rowStarts[n];
+    end   = (int) A->offd->rowStarts[n+1];
+    for (dlong m=start;m<end;m++) {
+      sendNonZeros[cnt].row = n + coarseOffset;
+      sendNonZeros[cnt].col = A->colMap[A->offd->cols[m]];
+      sendNonZeros[cnt].val = A->offd->vals[m];
+      cnt++;
+    }
+  }
+
+  //get the nonzero counts from all ranks
+  int *recvNNZ    = (int*) calloc(size,sizeof(int));
+  int *NNZoffsets = (int*) calloc(size+1,sizeof(int));
+  MPI_Allgather(&sendNNZ, 1, MPI_INT,
+                 recvNNZ, 1, MPI_INT, comm);
+
+  int totalNNZ = 0;
+  for (int r=0;r<size;r++) {
+    totalNNZ += recvNNZ[r];
+    NNZoffsets[r+1] = NNZoffsets[r] + recvNNZ[r];
+  }
+
+  nonzero_t *recvNonZeros = (nonzero_t *) calloc(totalNNZ, sizeof(nonzero_t));
+
+  MPI_Allgatherv(sendNonZeros, sendNNZ,             MPI_NONZERO_T,
+                 recvNonZeros, recvNNZ, NNZoffsets, MPI_NONZERO_T, comm);
+
+  //gather null vector
+  dfloat *nullTotal = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
+
+  coarseCounts = (int*) calloc(size,sizeof(int));
+  for (int r=0;r<size;r++)
+    coarseCounts[r] = coarseOffsets[r+1]-coarseOffsets[r];
+
+  MPI_Allgatherv(  A->null,          N,                MPI_DFLOAT,
+                 nullTotal, coarseCounts, coarseOffsets, MPI_DFLOAT,
+                 comm);
+
+  //clean up
+  MPI_Barrier(comm);
+  MPI_Type_free(&MPI_NONZERO_T);
+  free(sendNonZeros);
+  free(NNZoffsets);
+  free(recvNNZ);
+
+
+  //assemble the full matrix (sizes and offsets are formed in size_t so coarseTotal*coarseTotal cannot overflow int)
+  dfloat *coarseA = (dfloat *) calloc((size_t) coarseTotal*coarseTotal,sizeof(dfloat));
+  for (int i=0;i<totalNNZ;i++) {
+    int n = recvNonZeros[i].row;
+    int m = recvNonZeros[i].col;
+    coarseA[(size_t) n*coarseTotal+m] = recvNonZeros[i].val;
+  }
+
+  if (A->nullSpace) { //A is dense due to nullspace augmentation
+    for (int n=0;n<coarseTotal;n++) {
+      for (int m=0;m<coarseTotal;m++) {
+        coarseA[(size_t) n*coarseTotal+m] += A->nullSpacePenalty*nullTotal[n]*nullTotal[m];
+      }
+    }
+  }
+
+  free(recvNonZeros);
+  free(nullTotal);
+
+  matrixInverse(coarseTotal, coarseA);
+
+  //store only the local rows of the full inverse
+  invCoarseA = (dfloat *) calloc((size_t) N*coarseTotal,sizeof(dfloat));
+  for (int n=0;n<N;n++) {
+    for (int m=0;m<coarseTotal;m++) {
+      invCoarseA[(size_t) n*coarseTotal+m] = coarseA[(size_t) (n+coarseOffset)*coarseTotal+m];
+    }
+  }
+
+  xLocal   = (dfloat*) calloc(N,sizeof(dfloat));
+  rhsLocal = (dfloat*) calloc(N,sizeof(dfloat));
+
+  xCoarse   = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
+  rhsCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
+
+  free(coarseA);
+
+  // if((rank==0)&&(options.compareArgs("VERBOSE","TRUE"))) printf("done.\n");
+}
+
+void coarseSolver::syncToDevice() {}
+
+void coarseSolver::solve(dfloat *rhs, dfloat *x) {
+  // Host coarse solve: every rank receives the complete right-hand side and
+  // multiplies it by its own rows of the stored inverse.
+  //
+  // Without gatherLevel the input is rhs and the product lands in x.
+  // With gatherLevel, rhs is first reduced through ogs into Gx, the product
+  // lands in xLocal, and xLocal is finally scattered into x.
+  void   *sendBuf = rhs;
+  dfloat *result  = x;
+
+  if (gatherLevel) {
+    ogsGather(Gx, rhs, ogsDfloat, ogsAdd, ogs);
+    sendBuf = Gx;
+    result  = xLocal;
+  }
+
+  //gather the full vector
+  MPI_Allgatherv(sendBuf,              N,                MPI_DFLOAT,
+                 rhsCoarse, coarseCounts, coarseOffsets, MPI_DFLOAT, comm);
+
+  //multiply by local part of the exact matrix inverse
+  // #pragma omp parallel for
+  for (int row=0;row<N;row++) {
+    const dfloat *invRow = invCoarseA + row*coarseTotal;
+
+    result[row] = 0.;
+    for (int col=0;col<coarseTotal;col++) {
+      result[row] += invRow[col]*rhsCoarse[col];
+    }
+  }
+
+  if (gatherLevel) {
+    ogsScatter(x, xLocal, ogsDfloat, ogsAdd, ogs);
+  }
+}
+
+void coarseSolver::solve(occa::memory o_rhs, occa::memory o_x) {
+  // Device coarse solve: stage the right-hand side in rhsLocal on the host,
+  // apply the stored rows of the inverse there, then copy xLocal back.
+  if (!gatherLevel) {
+    o_rhs.copyTo(rhsLocal, N*sizeof(dfloat), 0);
+  } else {
+    ogsGather(o_Gx, o_rhs, ogsDfloat, ogsAdd, ogs);
+    o_Gx.copyTo(rhsLocal, N*sizeof(dfloat), 0);
+  }
+
+  //gather the full vector
+  MPI_Allgatherv(rhsLocal,             N,                MPI_DFLOAT,
+                 rhsCoarse, coarseCounts, coarseOffsets, MPI_DFLOAT, comm);
+
+  //multiply by local part of the exact matrix inverse
+  for (int row=0;row<N;row++) {
+    const dfloat *invRow = invCoarseA + row*coarseTotal;
+    xLocal[row] = 0.;
+    for (int col=0;col<coarseTotal;col++) {
+      xLocal[row] += invRow[col]*rhsCoarse[col];
+    }
+  }
+  if (!gatherLevel) {
+    o_x.copyFrom(xLocal, N*sizeof(dfloat), 0);
+  } else {
+    o_Gx.copyFrom(xLocal, N*sizeof(dfloat), 0);
+    ogsScatter(o_x, o_Gx, ogsDfloat, ogsAdd, ogs);
+  }
+}
+
+#if 0
+//set up exact solver using xxt
+void setupExactSolve(parAlmond_t *parAlmond, agmgLevel *level, bool nullSpace, dfloat nullSpacePenalty) {
+
+  int rank, size;
+  rank = agmg::rank;
+  size = agmg::size;
+
+  int* coarseOffsets = level->globalRowStarts;
+  int coarseTotal = coarseOffsets[size];
+  int coarseOffset = coarseOffsets[rank];
+
+  int *globalNumbering = (int *) calloc(coarseTotal,sizeof(int));
+  for (int n=0;n<coarseTotal;n++)
+    globalNumbering[n] = n;
+
+  csr *A = level->A;
+  int N = level->Nrows;
+
+  int totalNNZ;
+  int *rows;
+  int *cols;
+  dfloat *vals;
+
+  if(!nullSpace) {
+    //if no nullspace, use sparse A
+    totalNNZ = A->diagNNZ+A->offdNNZ;
+    if (totalNNZ) {
+      rows = (int *) calloc(totalNNZ,sizeof(int));
+      cols = (int *) calloc(totalNNZ,sizeof(int));
+      vals = (dfloat *) calloc(totalNNZ,sizeof(dfloat));
+    }
+
+    //populate matrix
+    int cnt = 0;
+    for (int n=0;n<N;n++) {
+      for (int m=A->diagRowStarts[n];m<A->diagRowStarts[n+1];m++) {
+        rows[cnt] = n + coarseOffset;
+        cols[cnt] = A->diagCols[m] + coarseOffset;
+        vals[cnt] = A->diagCoefs[m];
+        cnt++;
+      }
+      for (int m=A->offdRowStarts[n];m<A->offdRowStarts[n+1];m++) {
+        rows[cnt] = n + coarseOffset;
+        cols[cnt] = A->colMap[A->offdCols[m]];
+        vals[cnt] = A->offdCoefs[m];
+        cnt++;
+      }
+    }
+  } else {
+    totalNNZ = A->Nrows*coarseTotal; //A is dense due to nullspace augmentation
+    if (totalNNZ) {
+      rows = (int *) calloc(totalNNZ,sizeof(int));
+      cols = (int *) calloc(totalNNZ,sizeof(int));
+      vals = (dfloat *) calloc(totalNNZ,sizeof(dfloat));
+    }
+
+    //gather null vector
+    dfloat *nullTotal = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
+    int *nullCounts = (int*) calloc(size,sizeof(int));
+    for (int r=0;r<size;r++)
+      nullCounts[r] = coarseOffsets[r+1]-coarseOffsets[r];
+
+    MPI_Allgatherv(A->null, A->Nrows, MPI_DFLOAT, nullTotal, nullCounts, coarseOffsets, MPI_DFLOAT, agmg::comm);
+
+    //populate matrix
+    for (int n=0;n<N;n++) {
+      for (int m=0;m<coarseTotal;m++) {
+        rows[n*coarseTotal+m] = n + coarseOffset;
+        cols[n*coarseTotal+m] = m;
+        vals[n*coarseTotal+m] = nullSpacePenalty*nullTotal[n+coarseOffset]*nullTotal[m];
+      }
+    }
+
+    for (int n=0;n<N;n++) {
+      for (int m=A->diagRowStarts[n];m<A->diagRowStarts[n+1];m++) {
+        int col = A->diagCols[m] + coarseOffset;
+        vals[n*coarseTotal+col] += A->diagCoefs[m];
+      }
+      for (int m=A->offdRowStarts[n];m<A->offdRowStarts[n+1];m++) {
+        int col = A->colMap[A->offdCols[m]];
+        vals[n*coarseTotal+col] += A->offdCoefs[m];
+      }
+    }
+  }
+
+  parAlmond->ExactSolve = xxtSetup(A->Nrows,
+                                globalNumbering,
+                                totalNNZ,
+                                rows,
+                                cols,
+                                vals,
+                                0,
+                                "int",
+                                dfloatString);
+
+  parAlmond->coarseTotal = coarseTotal;
+  parAlmond->coarseOffset = coarseOffset;
+
+  parAlmond->xCoarse   = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
+  parAlmond->rhsCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
+
+  free(globalNumbering);
+  if (totalNNZ) {
+    free(rows);
+    free(cols);
+    free(vals);
+  }
+
+  printf("Done UberCoarse setup\n");
+}
+
+
+void exactCoarseSolve(parAlmond_t *parAlmond, int N, dfloat *rhs, dfloat *x) {
+
+  //use coarse solver
+  for (int n=0;n<parAlmond->coarseTotal;n++)
+    parAlmond->rhsCoarse[n] =0.;
+
+  for (int n=0;n<N;n++)
+    parAlmond->rhsCoarse[n+parAlmond->coarseOffset] = rhs[n];
+
+  xxtSolve(parAlmond->xCoarse, parAlmond->ExactSolve, parAlmond->rhsCoarse);
+
+  for (int n=0;n<N;n++)
+    x[n] = parAlmond->xCoarse[n+parAlmond->coarseOffset];
+
+}
+
+void device_exactCoarseSolve(parAlmond_t *parAlmond, int N, occa::memory o_rhs, occa::memory o_x) {
+
+  //use coarse solver
+  for (int n=0;n<parAlmond->coarseTotal;n++)
+    parAlmond->rhsCoarse[n] =0.;
+
+  o_rhs.copyTo(parAlmond->rhsCoarse+parAlmond->coarseOffset);
+  xxtSolve(parAlmond->xCoarse, parAlmond->ExactSolve, parAlmond->rhsCoarse);
+  o_x.copyFrom(parAlmond->xCoarse+parAlmond->coarseOffset,N*sizeof(dfloat));
+}
+#endif
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/kernels.cpp b/libs/parAlmond/src/kernels.cpp
new file mode 100644
index 000000000..9beff4710
--- /dev/null
+++ b/libs/parAlmond/src/kernels.cpp
@@ -0,0 +1,166 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+int Nrefs = 0; // NOTE(review): presumably counts the users sharing the kernels below - confirm where it is updated
+
+occa::kernel haloExtractKernel;
+// sparse matrix-vector product kernels, built from okl/SpMV*.okl in buildParAlmondKernels
+occa::kernel SpMVcsrKernel1;
+occa::kernel SpMVcsrKernel2;
+occa::kernel SpMVellKernel1;
+occa::kernel SpMVellKernel2;
+occa::kernel SpMVmcsrKernel1;
+occa::kernel SpMVmcsrKernel2;
+// vector, inner-product and k-cycle kernels, built in buildParAlmondKernels
+occa::kernel vectorSetKernel;
+occa::kernel vectorScaleKernel;
+occa::kernel vectorAddScalarKernel;
+occa::kernel vectorAddKernel1;
+occa::kernel vectorAddKernel2;
+occa::kernel vectorDotStarKernel1;
+occa::kernel vectorDotStarKernel2;
+occa::kernel vectorInnerProdKernel;
+occa::kernel kcycleCombinedOp1Kernel;
+occa::kernel kcycleCombinedOp2Kernel;
+occa::kernel kcycleWeightedCombinedOp1Kernel;
+occa::kernel kcycleWeightedCombinedOp2Kernel;
+occa::kernel vectorAddInnerProdKernel;
+occa::kernel vectorAddWeightedInnerProdKernel;
+
+void buildParAlmondKernels(MPI_Comm comm, occa::device device){
+  // Builds every module-level occa::kernel handle of this file on the given device.
+  int rank, size;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+  // seed the drand48 generator with this process's rank
+  double seed = (double) rank;
+  srand48(seed);
+
+  occa::properties kernelInfo;
+  kernelInfo["defines"].asObject();
+  kernelInfo["includes"].asArray();
+  kernelInfo["header"].asArray();
+  kernelInfo["flags"].asObject();
+  // pass the host-side widths of dlong and dfloat to the kernels as defines
+  if(sizeof(dlong)==4){
+    kernelInfo["defines/" "dlong"]="int";
+  }
+  if(sizeof(dlong)==8){
+    kernelInfo["defines/" "dlong"]="long long int";
+  }
+
+  if(sizeof(dfloat) == sizeof(double)){
+    kernelInfo["defines/" "dfloat"]= "double";
+    kernelInfo["defines/" "dfloat4"]= "double4";
+  }
+  else if(sizeof(dfloat) == sizeof(float)){
+    kernelInfo["defines/" "dfloat"]= "float";
+    kernelInfo["defines/" "dfloat4"]= "float4";
+  }
+
+  kernelInfo["defines/" "p_BLOCKSIZE"]= BLOCKSIZE;
+
+  if(device.mode()=="OpenCL"){
+    //kernelInfo["compiler_flags"] += "-cl-opt-disable";
+  }
+  // NOTE(review): the flags below are appended without separating whitespace - confirm how occa joins them
+  if(device.mode()=="CUDA"){ // add backend compiler optimization for CUDA
+    kernelInfo["compiler_flags"] += "--ftz=true";
+    kernelInfo["compiler_flags"] += "--prec-div=false";
+    kernelInfo["compiler_flags"] += "--prec-sqrt=false";
+    kernelInfo["compiler_flags"] += "--use_fast_math";
+    kernelInfo["compiler_flags"] += "--fmad=true"; // compiler option for cuda
+  }
+  // NOTE: only the printf is guarded by rank==0; the fflush runs on every rank
+  if (rank==0) printf("Compiling parALMOND Kernels...");fflush(stdout);
+  // ranks build one at a time: rank r builds on iteration r, then all ranks meet at the barrier
+  for (int r=0;r<size;r++) {
+    if (r==rank) {
+      SpMVcsrKernel1  = device.buildKernel(DPARALMOND"/okl/SpMVcsr.okl",  "SpMVcsr1",  kernelInfo);
+      SpMVcsrKernel2  = device.buildKernel(DPARALMOND"/okl/SpMVcsr.okl",  "SpMVcsr2",  kernelInfo);
+      SpMVellKernel1  = device.buildKernel(DPARALMOND"/okl/SpMVell.okl",  "SpMVell1",  kernelInfo);
+      SpMVellKernel2  = device.buildKernel(DPARALMOND"/okl/SpMVell.okl",  "SpMVell2",  kernelInfo);
+      SpMVmcsrKernel1 = device.buildKernel(DPARALMOND"/okl/SpMVmcsr.okl", "SpMVmcsr1", kernelInfo);
+      SpMVmcsrKernel2 = device.buildKernel(DPARALMOND"/okl/SpMVmcsr.okl", "SpMVmcsr2", kernelInfo);
+
+      vectorSetKernel = device.buildKernel(DPARALMOND"/okl/vectorSet.okl", "vectorSet", kernelInfo);
+      vectorScaleKernel = device.buildKernel(DPARALMOND"/okl/vectorScale.okl", "vectorScale", kernelInfo);
+      vectorAddScalarKernel = device.buildKernel(DPARALMOND"/okl/vectorAddScalar.okl", "vectorAddScalar", kernelInfo);
+      vectorAddKernel1 = device.buildKernel(DPARALMOND"/okl/vectorAdd.okl", "vectorAdd1", kernelInfo);
+      vectorAddKernel2 = device.buildKernel(DPARALMOND"/okl/vectorAdd.okl", "vectorAdd2", kernelInfo);
+      vectorDotStarKernel1 = device.buildKernel(DPARALMOND"/okl/vectorDotStar.okl", "vectorDotStar1", kernelInfo);
+      vectorDotStarKernel2 = device.buildKernel(DPARALMOND"/okl/vectorDotStar.okl", "vectorDotStar2", kernelInfo);
+      vectorInnerProdKernel = device.buildKernel(DPARALMOND"/okl/vectorInnerProd.okl", "vectorInnerProd", kernelInfo);
+
+      vectorAddInnerProdKernel = device.buildKernel(DPARALMOND"/okl/vectorAddInnerProd.okl", "vectorAddInnerProd", kernelInfo);
+      vectorAddWeightedInnerProdKernel = device.buildKernel(DPARALMOND"/okl/vectorAddInnerProd.okl", "vectorAddWeightedInnerProd", kernelInfo);
+
+      kcycleCombinedOp1Kernel = device.buildKernel(DPARALMOND"/okl/kcycleCombinedOp.okl", "kcycleCombinedOp1", kernelInfo);
+      kcycleCombinedOp2Kernel = device.buildKernel(DPARALMOND"/okl/kcycleCombinedOp.okl", "kcycleCombinedOp2", kernelInfo);
+      kcycleWeightedCombinedOp1Kernel = device.buildKernel(DPARALMOND"/okl/kcycleCombinedOp.okl", "kcycleWeightedCombinedOp1", kernelInfo);
+      kcycleWeightedCombinedOp2Kernel = device.buildKernel(DPARALMOND"/okl/kcycleCombinedOp.okl", "kcycleWeightedCombinedOp2", kernelInfo);
+
+      haloExtractKernel = device.buildKernel(DPARALMOND"/okl/haloExtract.okl", "haloExtract", kernelInfo);
+    }
+    MPI_Barrier(comm);
+  }
+  if(rank==0) printf("done.\n");
+}
+
+void freeParAlmondKernels() {
+  // Frees every module-level kernel handle, in the order buildParAlmondKernels builds them.
+  SpMVcsrKernel1.free();
+  SpMVcsrKernel2.free();
+  SpMVellKernel1.free();
+  SpMVellKernel2.free();
+  SpMVmcsrKernel1.free();
+  SpMVmcsrKernel2.free();
+
+  vectorSetKernel.free();
+  vectorScaleKernel.free();
+  vectorAddScalarKernel.free();
+  vectorAddKernel1.free();
+  vectorAddKernel2.free();
+  vectorDotStarKernel1.free();
+  vectorDotStarKernel2.free();
+  vectorInnerProdKernel.free();
+
+  vectorAddInnerProdKernel.free();
+  vectorAddWeightedInnerProdKernel.free();
+
+  kcycleCombinedOp1Kernel.free();
+  kcycleCombinedOp2Kernel.free();
+  kcycleWeightedCombinedOp1Kernel.free();
+  kcycleWeightedCombinedOp2Kernel.free();
+  haloExtractKernel.free();
+}
+
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/level.cpp b/libs/parAlmond/src/level.cpp
new file mode 100644
index 000000000..75146fd26
--- /dev/null
+++ b/libs/parAlmond/src/level.cpp
@@ -0,0 +1,176 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+// Constructs a multigrid level with N rows and M columns.
+// ktype_ selects the Krylov variant used by the k-cycle operations of this
+// level; comm_ is stored as passed (it is not duplicated here).
+multigridLevel::multigridLevel(dlong N, dlong M, KrylovType ktype_, MPI_Comm comm_):
+  Nrows(N), Ncols(M), ktype(ktype_) {
+  comm = comm_;
+}
+
+// Releases the host work vectors of this level and every device buffer that
+// was actually allocated.
+multigridLevel::~multigridLevel() {
+
+  // host vectors: free() accepts a null pointer, so no guards are required
+  free(x);
+  free(rhs);
+  free(res);
+
+  free(ck);
+  free(vk);
+  free(wk);
+
+  free(weight);
+
+  // device buffers: a zero size marks a buffer that was never allocated
+  if (o_x.size())   o_x.free();
+  if (o_rhs.size()) o_rhs.free();
+  if (o_res.size()) o_res.free();
+
+  if (o_ck.size()) o_ck.free();
+  if (o_vk.size()) o_vk.free();
+  if (o_wk.size()) o_wk.free();
+
+  if (o_weight.size()) o_weight.free();
+
+}
+
+// Host version of the first k-cycle update on this level.
+//   ck = x, vk = A*ck
+//   alpha1, rho1 and norm_rhs are taken from the three values returned by
+//   kcycleCombinedOp1 (norm_rhs is the square root of the third one)
+//   rhs = rhs - (alpha1/rho1)*vk, norm_rhstilde = sqrt of the value returned
+//   by vectorAddInnerProd for the updated rhs
+// All four results are written through the output pointers.
+// NOTE(review): rho1 is used as a divisor without a zero check here.
+void multigridLevel::kcycleOp1(dfloat *alpha1, dfloat *rho1,
+                               dfloat *norm_rhs, dfloat *norm_rhstilde) {
+
+  //ck = x
+  memcpy(ck, x, Nrows*sizeof(dfloat));
+
+  // vk = A*ck
+  this->Ax(ck,vk);
+
+  dfloat rho[3];
+
+  if(ktype == PCG)
+    kcycleCombinedOp1(Nrows, rho, ck, rhs, vk, weight, weighted, comm);
+
+  if(ktype == GMRES)
+    kcycleCombinedOp1(Nrows, rho, vk, rhs, vk, weight, weighted, comm);
+
+  *alpha1 = rho[0];
+  *rho1   = rho[1];
+  *norm_rhs = sqrt(rho[2]);
+
+  const dfloat a = -(*alpha1)/(*rho1);
+
+  // rhs = rhs - (alpha1/rho1)*vk
+  // use the host weight vector: vk and rhs are host arrays, so the device
+  // buffer o_weight must not be mixed into this call
+  *norm_rhstilde = sqrt(vectorAddInnerProd(Nrows, a, vk, 1.0, rhs, weight, weighted,comm));
+}
+
+// Host version of the second k-cycle update on this level.
+// Computes wk = A*x, obtains gamma, beta and alpha2 from kcycleCombinedOp2 and
+// then forms x = a*ck + b*x. x is left untouched when rho1 or rho2 is not
+// larger than 1e-20 in magnitude.
+void multigridLevel::kcycleOp2(const dfloat alpha1, const dfloat rho1) {
+
+  // wk = A*x
+  this->Ax(x,wk);
+
+  dfloat dots[3];
+
+  if(ktype == PCG)
+    kcycleCombinedOp2(Nrows,dots, x, vk, wk, rhs, weight, weighted, comm);
+
+  if(ktype == GMRES)
+    kcycleCombinedOp2(Nrows,dots, wk, vk, wk, rhs, weight, weighted, comm);
+
+  const dfloat gamma  = dots[0];
+  const dfloat beta   = dots[1];
+  const dfloat alpha2 = dots[2];
+
+  // skip the update when rho1 is numerically zero
+  if(!(fabs(rho1) > (dfloat) 1e-20)) return;
+
+  const dfloat rho2 = beta - gamma*gamma/rho1;
+
+  // skip the update when rho2 is numerically zero
+  if(!(fabs(rho2) > (dfloat) 1e-20)) return;
+
+  // x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*x
+  const dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2);
+  const dfloat b = alpha2/rho2;
+
+  vectorAdd(Nrows, a, ck, b, x);
+}
+
+// Device version of the first k-cycle update: same sequence as the host
+// kcycleOp1, applied to the o_* buffers of this level. The four results are
+// written through the output pointers.
+void multigridLevel::device_kcycleOp1(dfloat *alpha1, dfloat *rho1,
+                               dfloat *norm_rhs, dfloat *norm_rhstilde) {
+
+  // keep a copy of the current iterate: ck = x
+  o_ck.copyFrom(o_x, Nrows*sizeof(dfloat));
+
+  // vk = A*ck
+  this->Ax(o_ck,o_vk);
+
+  dfloat dots[3];
+
+  if(ktype == PCG)
+    kcycleCombinedOp1(Nrows, dots, o_ck, o_rhs, o_vk, o_weight, weighted, comm);
+
+  if(ktype == GMRES)
+    kcycleCombinedOp1(Nrows, dots, o_vk, o_rhs, o_vk, o_weight, weighted, comm);
+
+  *alpha1   = dots[0];
+  *rho1     = dots[1];
+  *norm_rhs = sqrt(dots[2]);
+
+  // rhs = rhs - (alpha1/rho1)*vk
+  const dfloat scale = -(*alpha1)/(*rho1);
+  *norm_rhstilde = sqrt(vectorAddInnerProd(Nrows, scale, o_vk, 1.0, o_rhs, o_weight, weighted,comm));
+}
+
+// Device version of the second k-cycle update: same sequence as the host
+// kcycleOp2, applied to the o_* buffers of this level. o_x is left untouched
+// when rho1 or rho2 is not larger than 1e-20 in magnitude.
+void multigridLevel::device_kcycleOp2(const dfloat alpha1, const dfloat rho1) {
+
+  // wk = A*x
+  this->Ax(o_x,o_wk);
+
+  dfloat dots[3];
+
+  if(ktype == PCG)
+    kcycleCombinedOp2(Nrows,dots, o_x, o_vk, o_wk, o_rhs, o_weight, weighted, comm);
+
+  if(ktype == GMRES)
+    kcycleCombinedOp2(Nrows,dots, o_wk, o_vk, o_wk, o_rhs, o_weight, weighted, comm);
+
+  const dfloat gamma  = dots[0];
+  const dfloat beta   = dots[1];
+  const dfloat alpha2 = dots[2];
+
+  // skip the update when rho1 is numerically zero
+  if(!(fabs(rho1) > (dfloat) 1e-20)) return;
+
+  const dfloat rho2 = beta - gamma*gamma/rho1;
+
+  // skip the update when rho2 is numerically zero
+  if(!(fabs(rho2) > (dfloat) 1e-20)) return;
+
+  // x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*x
+  const dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2);
+  const dfloat b = alpha2/rho2;
+
+  vectorAdd(Nrows, a, o_ck, b, o_x);
+}
+
+}
\ No newline at end of file
diff --git a/libs/parAlmond/src/matrix.cpp b/libs/parAlmond/src/matrix.cpp
new file mode 100644
index 000000000..0fde8d16c
--- /dev/null
+++ b/libs/parAlmond/src/matrix.cpp
@@ -0,0 +1,739 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+// Base matrix constructor: records the row count N and the column count M.
+matrix_t::matrix_t(dlong N, dlong M): Nrows(N), Ncols(M) {}
+
+//------------------------------------------------------------------------
+//
+//  CSR matrix
+//
+//------------------------------------------------------------------------
+
+// Creates a CSR matrix of size N x M; no storage is allocated here.
+CSR::CSR(dlong N, dlong M): matrix_t(N,M) {}
+
+// Releases the device copies (when allocated) and the host CSR arrays.
+CSR::~CSR() {
+  // device buffers: a zero size marks a buffer that was never allocated
+  if (o_vals.size())      o_vals.free();
+  if (o_cols.size())      o_cols.free();
+  if (o_rowStarts.size()) o_rowStarts.free();
+
+  // host arrays
+  free(vals);
+  free(cols);
+  free(rowStarts);
+}
+
+//------------------------------------------------------------------------
+//
+//  ELL matrix
+//
+//------------------------------------------------------------------------
+
+// Creates an ELL matrix of size N x M; no storage is allocated here.
+ELL::ELL(dlong N, dlong M): matrix_t(N,M) {}
+
+
+// Releases the device copies (when allocated) and the host ELL arrays.
+ELL::~ELL() {
+  // device buffers: a zero size marks a buffer that was never allocated
+  if (o_vals.size()) o_vals.free();
+  if (o_cols.size()) o_cols.free();
+
+  // host arrays
+  free(vals);
+  free(cols);
+}
+
+// Uploads the ELL matrix to the device.
+// The host arrays store entry c of row r at [r*nnzPerRow + c]; the device
+// copies are transposed so that the same entry sits at [r + c*Nrows].
+void ELL::syncToDevice(occa::device device) {
+
+  // nothing to upload for an empty matrix
+  if(!(nnzPerRow && Nrows)) return;
+
+  dlong  *colsT = (dlong *)  malloc(Nrows*nnzPerRow*sizeof(dlong));
+  dfloat *valsT = (dfloat *) malloc(Nrows*nnzPerRow*sizeof(dfloat));
+
+  // transpose into the temporary buffers
+  for (int c=0;c<nnzPerRow;c++) {
+    for (dlong r=0;r<Nrows;r++) {
+      const dlong src = r*nnzPerRow+c;
+      const dlong dst = r+c*Nrows;
+      colsT[dst] = cols[src];
+      valsT[dst] = vals[src];
+    }
+  }
+
+  o_cols = device.malloc(Nrows*nnzPerRow*sizeof(dlong),  colsT);
+  o_vals = device.malloc(Nrows*nnzPerRow*sizeof(dfloat), valsT);
+
+  free(colsT);
+  free(valsT);
+}
+
+//------------------------------------------------------------------------
+//
+//  MCSR matrix
+//
+//------------------------------------------------------------------------
+// Creates an MCSR matrix of size N x M; no storage is allocated here.
+MCSR::MCSR(dlong N, dlong M): matrix_t(N,M) {}
+
+// Releases the device copies (when allocated) and the host MCSR arrays.
+MCSR::~MCSR() {
+  // device buffers: a zero size marks a buffer that was never allocated
+  if (o_vals.size())      o_vals.free();
+  if (o_cols.size())      o_cols.free();
+  if (o_rows.size())      o_rows.free();
+  if (o_rowStarts.size()) o_rowStarts.free();
+
+  // host arrays
+  free(vals);
+  free(cols);
+  free(rows);
+  free(rowStarts);
+}
+
+// Uploads the MCSR arrays to the device; empty parts are skipped.
+void MCSR::syncToDevice(occa::device device) {
+
+  // row bookkeeping: one start offset per stored row plus the end marker
+  if (actualRows) {
+    const size_t startBytes = (actualRows+1)*sizeof(dlong);
+    const size_t rowBytes   = actualRows*sizeof(dlong);
+    o_rowStarts = device.malloc(startBytes, rowStarts);
+    o_rows      = device.malloc(rowBytes, rows);
+  }
+
+  // nonzero entries
+  if (nnz) {
+    const size_t colBytes = nnz*sizeof(dlong);
+    const size_t valBytes = nnz*sizeof(dfloat);
+    o_cols = device.malloc(colBytes, cols);
+    o_vals = device.malloc(valBytes, vals);
+  }
+}
+
+//------------------------------------------------------------------------
+//
+//  parCSR matrix
+//
+//------------------------------------------------------------------------
+// Creates an empty N x M parCSR with freshly allocated diag and offd CSR
+// blocks and the null-space flag cleared.
+// NOTE(review): comm and device are not set by this overload.
+parCSR::parCSR(dlong N, dlong M): matrix_t(N,M) {
+  diag = new CSR(N,M);
+  offd = new CSR(N,M);
+
+  nullSpace=false;
+}
+
+// Creates an empty N x M parCSR bound to a device and to a duplicate of the
+// given communicator, with freshly allocated diag and offd CSR blocks and the
+// null-space flag cleared.
+parCSR::parCSR(dlong N, dlong M,
+               MPI_Comm comm_,
+               occa::device device_): matrix_t(N,M) {
+  MPI_Comm_dup(comm_, &comm); // the matrix keeps its own duplicate of comm_
+  device = device_;
+
+  diag = new CSR(N,M);
+  offd = new CSR(N,M);
+
+  nullSpace=false;
+}
+
+//build a parCSR matrix from a distributed COO matrix (assumes square)
+// Entries whose column lies in this rank's range [globalOffset, globalOffset+N)
+// go to the diag block, all others to the offd block. The fill loops append
+// entries in input order, so the input is expected to be sorted by row.
+// The starts array is stored (not copied) as both the row and the column
+// partitioning.
+parCSR::parCSR(dlong N,         // number of rows on this rank
+               hlong* starts,   // global partitioning
+               dlong nnz,       // number of nonzeros on this rank
+               hlong *Ai,       // global row ids
+               hlong *Aj,       // global column ids
+               dfloat *Avals,    // values
+               bool NullSpace,          //switch for nullspace
+               dfloat *Null,            //null vector (or low energy mode)
+               dfloat NullSpacePenalty, //penalty parameter for rank boost
+               MPI_Comm comm_,
+               occa::device device_) {
+
+  Nrows = N;
+  Ncols = N;
+  globalRowStarts = starts;
+  globalColStarts = starts;
+
+  device = device_;
+  MPI_Comm_dup(comm_, &comm);
+
+  int rank, size;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  hlong globalOffset = globalRowStarts[rank];
+
+  // local copy of the null vector; stays zero-filled when none is supplied
+  null = (dfloat *) calloc(Nrows, sizeof(dfloat));
+  if (Null) memcpy(null, Null, Nrows*sizeof(dfloat));
+
+  nullSpace = NullSpace;
+  nullSpacePenalty = NullSpacePenalty;
+
+  diag = new CSR(Nrows,Nrows);
+  offd = new CSR(Nrows,Nrows);
+
+  diag->rowStarts = (dlong *) calloc(Nrows+1, sizeof(dlong));
+  offd->rowStarts = (dlong *) calloc(Nrows+1, sizeof(dlong));
+
+  //count the entries in each row
+  for (dlong n=0;n<nnz;n++) {
+    dlong row = (dlong) (Ai[n] - globalOffset);
+    if ((Aj[n] < globalOffset) || (Aj[n]>globalOffset+Nrows-1))
+      offd->rowStarts[row+1]++;
+    else
+      diag->rowStarts[row+1]++;
+  }
+
+  // cumulative sum
+  for(dlong i=0; i<Nrows; i++) {
+    diag->rowStarts[i+1] += diag->rowStarts[i];
+    offd->rowStarts[i+1] += offd->rowStarts[i];
+  }
+  diag->nnz = diag->rowStarts[Nrows];
+  offd->nnz = offd->rowStarts[Nrows];
+
+  // Halo setup
+  // collect the global column id of every off-diagonal entry, in input order
+  hlong *colIds = (hlong *) malloc(offd->nnz*sizeof(hlong));
+  dlong cnt=0;
+  for (dlong n=0;n<nnz;n++) {
+    if ((Aj[n] < globalOffset) || (Aj[n]>globalOffset+N-1))
+      colIds[cnt++] = Aj[n];
+  }
+  // haloSetup rewrites colIds in place with local column indices
+  this->haloSetup(colIds);
+
+  //fill the CSR matrices
+  diagA   = (dfloat *) calloc(Ncols, sizeof(dfloat));
+  diagInv = (dfloat *) calloc(Ncols, sizeof(dfloat));
+  diag->cols = (dlong *)  calloc(diag->nnz, sizeof(dlong));
+  offd->cols = (dlong *)  calloc(offd->nnz, sizeof(dlong));
+  diag->vals = (dfloat *) calloc(diag->nnz, sizeof(dfloat));
+  offd->vals = (dfloat *) calloc(offd->nnz, sizeof(dfloat));
+  dlong diagCnt = 0;
+  dlong offdCnt = 0;
+  for (dlong n=0;n<nnz;n++) {
+    if ((Aj[n] < globalOffset) || (Aj[n]>globalOffset+Nrows-1)) {
+      offd->cols[offdCnt] = colIds[offdCnt];
+      offd->vals[offdCnt] = Avals[n];
+      offdCnt++;
+    } else {
+      diag->cols[diagCnt] = (dlong) (Aj[n] - globalOffset);
+      diag->vals[diagCnt] = Avals[n];
+
+      //record the diagonal
+      dlong row = (dlong) (Ai[n] - globalOffset);
+      if (row==diag->cols[diagCnt])
+        diagA[row] = diag->vals[diagCnt];
+
+      diagCnt++;
+    }
+  }
+
+  // the renumbered column ids have been copied into offd->cols; release the
+  // temporary array (it was previously leaked)
+  free(colIds);
+
+  //fill the halo region
+  ogsGatherScatter(diagA, ogsDfloat, ogsAdd, ogs);
+
+  //compute the inverse diagonal
+  // NOTE(review): a zero diagonal entry yields inf here
+  for (dlong n=0;n<Nrows;n++) diagInv[n] = 1.0/diagA[n];
+}
+
+// Builds the halo (nonlocal column) data of this matrix.
+// On entry colIds holds the global column id of each off-diagonal nonzero
+// (offd->nnz entries); on exit each entry is replaced by a local column
+// index of the form NlocalCols + newId, where newId numbers the unique
+// nonlocal columns.
+// Sets NlocalCols, Ncols, colMap, ogs, Nshared, Nhalo, haloIds and ogsHalo,
+// and requests pinned scratch space for Nhalo values.
+// Ncols must already hold the local column count when this is called.
+void parCSR::haloSetup(hlong *colIds) {
+
+  int rank, size;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  hlong globalOffset = globalColStarts[rank];
+
+  //collect the unique nonlocal column ids
+  parallelId_t*  parIds = (parallelId_t*) malloc(offd->nnz*sizeof(parallelId_t));
+
+  for (dlong n=0;n<offd->nnz;n++) {
+    parIds[n].localId  = n;
+    parIds[n].globalId = colIds[n];
+  }
+
+  //sort by global index
+  qsort(parIds, offd->nnz, sizeof(parallelId_t), CompareGlobalId);
+
+  //count unique nonlocal column ids
+  // newId is the running index of the distinct global id of each entry
+  dlong Noffdcols = 0; //number of unique columns
+  if(offd->nnz) parIds[0].newId = Noffdcols;
+  for (dlong n=1;n<offd->nnz;n++) {
+    if (parIds[n].globalId != parIds[n-1].globalId)
+      Noffdcols++;
+
+    parIds[n].newId = Noffdcols;
+  }
+  if(offd->nnz) Noffdcols++; // turn the last index into a count
+
+  //record the global ids of the unique columns
+  hlong *offdcols = (hlong *) malloc(Noffdcols*sizeof(hlong));
+  Noffdcols = 0;
+  if(offd->nnz) offdcols[Noffdcols++] = parIds[0].globalId;
+  for (dlong n=1;n<offd->nnz;n++)
+    if (parIds[n].globalId != parIds[n-1].globalId)
+      offdcols[Noffdcols++] = parIds[n].globalId;
+
+  //sort back to local order
+  qsort(parIds, offd->nnz, sizeof(parallelId_t), CompareLocalId);
+
+  // be careful to make sure Ncols is set at this point
+  NlocalCols = Ncols;
+  Ncols += Noffdcols;
+
+  //make an array of all the column ids required on this rank (local first)
+  // ids are shifted by +1 before being handed to ogsSetup and shifted back below
+  colMap = (hlong*) malloc(Ncols*sizeof(hlong));
+  for (dlong n=0; n<NlocalCols; n++)      colMap[n] = n+globalOffset+1; //local rows
+  for (dlong n=NlocalCols; n<Ncols; n++)  colMap[n] = offdcols[n-NlocalCols]+1;    //nonlocal rows
+
+  //make a gatherScatter to determine local ids which are shared
+  int verbose = 0;
+  ogs = ogsSetup(Ncols, colMap, comm, verbose, device);
+
+  //shift back to 0-indexed
+  for (dlong n=0; n<Ncols; n++) colMap[n]--;
+
+  int *minRank = (int *) calloc(Ncols,sizeof(int));
+  int *maxRank = (int *) calloc(Ncols,sizeof(int));
+  for (dlong i=0;i<Ncols;i++) {
+    minRank[i] = rank;
+    maxRank[i] = rank;
+  }
+
+  ogsGatherScatter(minRank, ogsInt, ogsMin, ogs); //minRank[n] contains the smallest rank taking part in the gather of node n
+  ogsGatherScatter(maxRank, ogsInt, ogsMax, ogs); //maxRank[n] contains the largest rank taking part in the gather of node n
+
+  //count the local nodes that must be shared
+  // a local column is shared when some other rank also takes part in its gather
+  Nshared = 0;
+  for (dlong i=0;i<NlocalCols;i++)
+    if ((minRank[i]!=rank)||(maxRank[i]!=rank))
+      Nshared++;
+
+  //total nodes involved in communication is the local nodes which must be shared
+  // + the number of nodes which need to be received.
+  Nhalo = Nshared + Noffdcols;
+
+  //build of list of ids to share for the comm
+  // both counters are reset and rebuilt while the lists are filled
+  haloIds = (dlong *) malloc(Nshared*sizeof(dlong));
+  hlong *ghaloIds = (hlong*) malloc(Nhalo*sizeof(hlong));
+  Nshared = 0;
+  Nhalo=0;
+  for (dlong i=0;i<NlocalCols;i++) {
+    if ((minRank[i]!=rank)||(maxRank[i]!=rank)) {
+      haloIds[Nshared++] = i;
+      ghaloIds[Nhalo++] = i+globalOffset+1;
+    }
+  }
+  for (dlong n=0; n<Noffdcols; n++) {
+    ghaloIds[Nhalo++] = -(offdcols[n]+1); //negative -> does not participate in sum
+  }
+
+  //construct the parCSR ogs object for comms
+  ogsHalo = ogsSetup(Nhalo, ghaloIds, comm, verbose, device);
+
+  MPI_Barrier(comm);
+  free(ghaloIds);
+  free(offdcols);
+  free(minRank);
+  free(maxRank);
+
+  //update column numbering
+  // parIds is back in local order, so entry n describes colIds[n]
+  for (dlong n=0;n<offd->nnz;n++)
+    colIds[n] = NlocalCols + parIds[n].newId;
+
+  // scratch buffer used by the halo exchange: one value per halo entry
+  size_t requiredBytes = Nhalo*sizeof(dfloat);
+  allocatePinnedScratchSpace(requiredBytes, device);
+
+  free(parIds);
+}
+
+// Host halo exchange, step 1: packs the shared local entries of x (those
+// listed in haloIds) into the front of the pinned scratch buffer.
+void parCSR::haloExchangeStart(dfloat *x) {
+  dfloat *sendBuf = (dfloat*) pinnedScratch;
+
+  for(int n=0;n<Nshared;++n)
+    sendBuf[n] = x[haloIds[n]];
+}
+
+// Host halo exchange, step 2: runs the gather-scatter on the scratch buffer
+// and copies the Nhalo-Nshared entries that follow the shared ones into x,
+// starting at index NlocalCols.
+void parCSR::haloExchangeFinish(dfloat *x) {
+  ogsGatherScatter(pinnedScratch, ogsDfloat, ogsAdd, ogsHalo);
+
+  dfloat *recvBuf = ((dfloat*)pinnedScratch)+Nshared;
+  memcpy(x+NlocalCols, recvBuf, (Nhalo-Nshared)*sizeof(dfloat));
+}
+
+// Device halo exchange, step 1: extracts the shared entries of o_x into the
+// device scratch buffer and copies them to the host scratch buffer.
+void parCSR::haloExchangeStart(occa::memory o_x) {
+  // nothing to send
+  if (!Nshared) return;
+
+  // copy data from outgoing elements into temporary send buffer
+  haloExtractKernel(Nshared, o_haloIds, o_x, o_pinnedScratch);
+  o_pinnedScratch.copyTo(pinnedScratch, Nshared*sizeof(dfloat), 0);
+}
+
+// Device halo exchange, step 2: runs the gather-scatter on the host scratch
+// buffer and uploads the Nhalo-Nshared entries that follow the shared ones
+// into o_x at byte offset NlocalCols*sizeof(dfloat).
+void parCSR::haloExchangeFinish(occa::memory o_x) {
+  ogsGatherScatter(pinnedScratch, ogsDfloat, ogsAdd, ogsHalo);
+
+  // nothing to upload
+  if (!(Nhalo-Nshared)) return;
+
+  dfloat *recvBuf = ((dfloat*)pinnedScratch)+Nshared;
+  o_x.copyFrom(recvBuf,
+               (Nhalo-Nshared)*sizeof(dfloat),
+                NlocalCols*sizeof(dfloat));
+}
+
+// Releases the CSR blocks, the diagonal and null vectors, the partitioning
+// arrays, the halo bookkeeping and the gather-scatter handles.
+// NOTE(review): the communicator duplicated in the constructors is not
+// released here — confirm who owns it.
+parCSR::~parCSR() {
+  delete diag;
+  delete offd;
+
+  free(diagA);
+  free(diagInv);
+
+  if (o_diagA.size()) o_diagA.free();
+  if (o_diagInv.size()) o_diagInv.free();
+
+  free(null);
+  if (o_null.size()) o_null.free();
+
+  // the COO constructor stores one array in both members; free it only once
+  if (globalColStarts != globalRowStarts) free(globalColStarts);
+  free(globalRowStarts);
+
+  free(colMap);
+  free(haloIds);
+
+  if (ogs)       ogsFree(ogs);
+  if (ogsHalo)   ogsFree(ogsHalo);
+}
+
+// Estimates the spectral radius of D^{-1}A.
+// Runs k Arnoldi steps (k = 10, or the global row count if smaller) from a
+// random start vector, passes the k x k Hessenberg matrix to eig() and
+// returns the largest modulus sqrt(WR^2 + WI^2) among the reported eigenvalues.
+// NOTE(review): the vector norms are used as divisors without a zero check.
+dfloat parCSR::rhoDinvA(){
+
+  int rank, size;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  int k = 10;
+
+  hlong Ntotal = globalRowStarts[size];
+  if(k > Ntotal) k = (int) Ntotal;
+
+  // do an arnoldi
+
+  // allocate memory for Hessenberg matrix
+  // entry (i,j) is stored at H[i + j*k]
+  double *H = (double *) calloc(k*k,sizeof(double));
+
+  // allocate memory for basis
+  // Vx has Ncols entries so it can also hold the halo part used by SpMV
+  dfloat **V = (dfloat **) calloc(k+1, sizeof(dfloat *));
+  dfloat *Vx = (dfloat *) calloc(Ncols, sizeof(dfloat));
+
+  for(int i=0; i<=k; i++)
+    V[i] = (dfloat *) calloc(Nrows, sizeof(dfloat));
+
+  // generate a random vector for initial basis vector
+  vectorRandomize(Nrows, Vx);
+
+  // normalize the start vector
+  dfloat norm_vo = vectorNorm(Nrows,Vx, comm);
+  vectorScale(Nrows, 1.0/norm_vo, Vx);
+
+  memcpy(V[0], Vx, Nrows*sizeof(dfloat));
+
+  for(int j=0; j<k; j++){
+
+    memcpy(Vx, V[j], Nrows*sizeof(dfloat));
+
+    // v[j+1] = invD*(A*v[j])
+    this->SpMV(1.0, Vx, 0., V[j+1]);
+    vectorDotStar(Nrows, diagInv, V[j+1]);
+
+    // modified Gram-Schmidt
+    for(int i=0; i<=j; i++){
+      // H(i,j) = v[i]'*A*v[j]
+      dfloat hij = vectorInnerProd(Nrows, V[i], V[j+1],comm);
+
+      // v[j+1] = v[j+1] - hij*v[i]
+      vectorAdd(Nrows,-hij, V[i], 1.0, V[j+1]);
+
+      H[i + j*k] = (double) hij;
+    }
+
+    // the subdiagonal entry only exists for the first k-1 columns
+    if(j+1 < k){
+
+      dfloat norm_vj = vectorNorm(Nrows,V[j+1],comm);
+
+      H[j+1+ j*k] = (double) norm_vj;
+
+      vectorScale(Nrows, 1./H[j+1 + j*k], V[j+1]);
+    }
+  }
+
+  // real and imaginary parts of the eigenvalues of H
+  double *WR = (double *) calloc(k,sizeof(double));
+  double *WI = (double *) calloc(k,sizeof(double));
+
+  eig(k, H, WR, WI);
+
+  double rho = 0.;
+
+  // keep the largest eigenvalue modulus
+  for(int i=0; i<k; i++){
+    double rho_i  = sqrt(WR[i]*WR[i] + WI[i]*WI[i]);
+
+    if(rho < rho_i) {
+      rho = rho_i;
+    }
+  }
+
+  free(H);
+  free(WR);
+  free(WI);
+
+  // free memory
+  for(int i=0; i<=k; i++) free(V[i]);
+  free(Vx);
+  free(V);
+
+  // printf("weight = %g \n", rho);
+
+  return rho;
+}
+
+
+
+
+//------------------------------------------------------------------------
+//
+//  parHYB matrix
+//
+//------------------------------------------------------------------------
+
+//build from parCSR
+// Splits A into an ELL part E and an MCSR part C. nnzPerRow is set to the
+// longest diag row, so every diag entry lands in E (short rows are padded
+// with column -1) and C receives the offd entries; the code that moves
+// excess diag entries to C is kept but cannot trigger with that choice.
+// The vectors, partitioning arrays, halo data and ogs handles are copied
+// from A as pointers/handles, not duplicated.
+parHYB::parHYB(parCSR *A): matrix_t(A->Nrows, A->Ncols) {
+
+  int *rowCounters = (int*) calloc(A->Nrows, sizeof(int));
+
+  int maxNnzPerRow = 0;
+  int minNnzPerRow = 0;
+  if (A->Nrows)
+    minNnzPerRow = (int) A->diag->rowStarts[1] - A->diag->rowStarts[0];
+
+  for(dlong i=0; i<A->Nrows; i++) {
+    int rowNnz = (int) A->diag->rowStarts[i+1] - A->diag->rowStarts[i];
+    rowCounters[i] = rowNnz;
+
+    maxNnzPerRow = (rowNnz > maxNnzPerRow) ? rowNnz : maxNnzPerRow;
+    minNnzPerRow = (rowNnz < minNnzPerRow) ? rowNnz : minNnzPerRow;
+  }
+
+  // This chooses the nnzPerRow by binning. Just pack all the local nonzeros in ELL
+  /*
+  // create bins
+  int numBins = maxNnzPerRow - minNnzPerRow + 1;
+
+  //zero row check
+  if (numBins<0) numBins =0;
+
+  int *bins;
+  if (numBins)
+    bins = (int *) calloc(numBins, sizeof(int));
+
+  for(dlong i=0; i<A->Nrows; i++)
+    bins[rowCounters[i]-minNnzPerRow]++;
+
+  dfloat threshold = 2.0/3.0;
+  dlong totalNNZ = csrA->diagNNZ+csrA->offdNNZ;
+  int nnzPerRow = 0;
+  dlong nnz = 0;
+
+  //increase the nnz per row in E until it holds threshold*totalnnz nonzeros
+  for(int i=0; i<numBins; i++){
+    nnz += bins[i] * (i+minNnzPerRow);
+    if((nnz > threshold*totalNNZ)||(i==numBins-1)){
+      nnzPerRow = i+minNnzPerRow;
+      break;
+    }
+  }
+  */
+  // release unconditionally: a zero-length calloc may still return a pointer
+  // that has to be freed, and free() accepts NULL
+  free(rowCounters);
+  // free(bins);
+
+  int nnzPerRow = maxNnzPerRow;
+
+  //build the ELL matrix from the local CSR
+  E = new ELL(Nrows, Ncols);
+  C = new MCSR(Nrows, Ncols);
+
+  E->nnzPerRow = nnzPerRow;
+
+  E->cols  = (dlong *) calloc(Nrows*E->nnzPerRow, sizeof(dlong));
+  E->vals = (dfloat *) calloc(Nrows*E->nnzPerRow, sizeof(dfloat));
+
+  C->nnz = 0;
+  C->actualRows = 0;
+
+  // first pass: fill E and count what has to go into C
+  for(dlong i=0; i<Nrows; i++){
+    dlong Jstart = A->diag->rowStarts[i];
+    dlong Jend   = A->diag->rowStarts[i+1];
+    int rowNnz = (int)  (Jend - Jstart);
+
+    // store only min of nnzPerRow and rowNnz
+    int maxNnz = (nnzPerRow >= rowNnz) ? rowNnz : nnzPerRow;
+
+    for(int c=0; c<maxNnz; c++){
+      E->cols[i*nnzPerRow+c] = A->diag->cols[Jstart+c];
+      E->vals[i*nnzPerRow+c] = A->diag->vals[Jstart+c];
+    }
+
+    for(int c=maxNnz; c<nnzPerRow; c++){
+      E->cols[i*nnzPerRow+c] = -1; //ignore this column
+    }
+
+    // count the number of nonzeros to be stored in MCSR format
+
+    //all of offd
+    int cnt= (int) (A->offd->rowStarts[i+1]-A->offd->rowStarts[i]);
+    if (rowNnz>nnzPerRow)
+      cnt += rowNnz-nnzPerRow; //excess of diag
+
+    if (cnt) {
+      C->nnz += cnt;
+      C->actualRows++;
+    }
+  }
+
+  C->rowStarts = (dlong *) calloc(C->actualRows+1, sizeof(dlong));
+  C->rows = (dlong  *) calloc(C->actualRows, sizeof(dlong));
+  C->cols = (dlong  *) calloc(C->nnz, sizeof(dlong));
+  C->vals = (dfloat *) calloc(C->nnz, sizeof(dfloat));
+
+  // second pass: fill C, storing only the rows that have entries
+  dlong row = 0;
+  dlong cnt = 0;
+  for(dlong i=0; i<Nrows; i++){
+    dlong Jstart = A->diag->rowStarts[i];
+    dlong Jend   = A->diag->rowStarts[i+1];
+    int rowNnz = (int)  (Jend - Jstart);
+    int rowCnt =0;
+
+    // store the remaining row in MCSR format
+    if(rowNnz > nnzPerRow){
+      rowCnt += rowNnz-nnzPerRow;
+      for(int c=nnzPerRow; c<rowNnz; c++){
+        C->cols[cnt] = A->diag->cols[Jstart+c];
+        C->vals[cnt] = A->diag->vals[Jstart+c];
+        cnt++;
+      }
+    }
+
+    //add the offd non-zeros
+    Jstart = A->offd->rowStarts[i];
+    Jend   = A->offd->rowStarts[i+1];
+    rowCnt += (int) (Jend-Jstart);
+    for (dlong j=Jstart;j<Jend;j++) {
+      C->cols[cnt] = A->offd->cols[j];
+      C->vals[cnt] = A->offd->vals[j];
+      cnt++;
+    }
+
+    if (rowCnt) {
+      C->rows[row++] = i;
+      C->rowStarts[row] = cnt;
+    }
+  }
+
+  // everything below is shared with A (pointer/handle copies)
+  nullSpace = A->nullSpace;
+  nullSpacePenalty = A->nullSpacePenalty;
+
+  null = A->null;
+  o_null = A->o_null;
+
+  diagA = A->diagA;
+  o_diagA = A->o_diagA;
+
+  diagInv = A->diagInv;
+  o_diagInv = A->o_diagInv;
+
+  comm = A->comm;
+  globalRowStarts = A->globalRowStarts;
+  globalColStarts = A->globalColStarts;
+  colMap = A->colMap;
+
+  ogs = A->ogs;
+  ogsHalo = A->ogsHalo;
+
+  Nhalo = A->Nhalo;
+  Nshared = A->Nshared;
+  NlocalCols = A->NlocalCols;
+
+  haloIds = A->haloIds;
+  o_haloIds = A->o_haloIds;
+
+  device = A->device;
+}
+
+// Releases the ELL and MCSR parts together with the vectors, partitioning
+// arrays, halo bookkeeping and gather-scatter handles held by this matrix.
+// NOTE(review): these members are pointer copies taken from the parCSR the
+// matrix was built from; destroying that parCSR as well would release them a
+// second time — confirm the intended ownership.
+parHYB::~parHYB() {
+  delete E;
+  delete C;
+
+  free(diagA);
+  free(diagInv);
+
+  if (o_diagA.size()) o_diagA.free();
+  if (o_diagInv.size()) o_diagInv.free();
+
+  free(null);
+  if (o_null.size()) o_null.free();
+
+  // both members may refer to the same array; free it only once
+  if (globalColStarts != globalRowStarts) free(globalColStarts);
+  free(globalRowStarts);
+
+  free(colMap);
+  free(haloIds);
+
+  if (ogs)       ogsFree(ogs);
+  if (ogsHalo)   ogsFree(ogsHalo);
+}
+
+// Uploads the ELL and MCSR parts, the diagonal vectors, the null vector
+// (only when nullSpace is set) and the shared-id list to the device.
+void parHYB::syncToDevice() {
+
+  E->syncToDevice(device);
+  C->syncToDevice(device);
+
+  if (Nrows) {
+    const size_t vecBytes = Nrows*sizeof(dfloat);
+
+    o_diagA   = device.malloc(vecBytes, diagA);
+    o_diagInv = device.malloc(vecBytes, diagInv);
+
+    if(nullSpace)
+      o_null = device.malloc(vecBytes, null);
+  }
+
+  if (Nshared) {
+    const size_t idBytes = Nshared*sizeof(dlong);
+    o_haloIds = device.malloc(idBytes, haloIds);
+  }
+}
+
+// Host halo exchange, step 1: packs the shared local entries of x (those
+// listed in haloIds) into the front of the pinned scratch buffer.
+void parHYB::haloExchangeStart(dfloat *x) {
+  dfloat *sendBuf = (dfloat*) pinnedScratch;
+
+  for(int n=0;n<Nshared;++n)
+    sendBuf[n] = x[haloIds[n]];
+}
+
+// Host halo exchange, step 2: runs the gather-scatter on the scratch buffer
+// and copies the Nhalo-Nshared entries that follow the shared ones into x,
+// starting at index NlocalCols.
+void parHYB::haloExchangeFinish(dfloat *x) {
+  ogsGatherScatter(pinnedScratch, ogsDfloat, ogsAdd, ogsHalo);
+
+  dfloat *recvBuf = ((dfloat*)pinnedScratch)+Nshared;
+  memcpy(x+NlocalCols, recvBuf, (Nhalo-Nshared)*sizeof(dfloat));
+}
+
+// Device halo exchange, step 1: extracts the shared entries of o_x into the
+// device scratch buffer and copies them to the host scratch buffer.
+void parHYB::haloExchangeStart(occa::memory o_x) {
+  // nothing to send
+  if (!Nshared) return;
+
+  // copy data from outgoing elements into temporary send buffer
+  haloExtractKernel(Nshared, o_haloIds, o_x, o_pinnedScratch);
+  o_pinnedScratch.copyTo(pinnedScratch, Nshared*sizeof(dfloat), 0);
+}
+
+// Device halo exchange, step 2: runs the gather-scatter on the host scratch
+// buffer and uploads the Nhalo-Nshared entries that follow the shared ones
+// into o_x at byte offset NlocalCols*sizeof(dfloat).
+void parHYB::haloExchangeFinish(occa::memory o_x) {
+  ogsGatherScatter(pinnedScratch, ogsDfloat, ogsAdd, ogsHalo);
+
+  // nothing to upload
+  if (!(Nhalo-Nshared)) return;
+
+  dfloat *recvBuf = ((dfloat*)pinnedScratch)+Nshared;
+  o_x.copyFrom(recvBuf,
+               (Nhalo-Nshared)*sizeof(dfloat),
+                NlocalCols*sizeof(dfloat));
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/multigrid.cpp b/libs/parAlmond/src/multigrid.cpp
new file mode 100644
index 000000000..63ad65836
--- /dev/null
+++ b/libs/parAlmond/src/multigrid.cpp
@@ -0,0 +1,245 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+// Host K-cycle starting at level k.
+// At the base level the coarse solver is called directly. Otherwise the level
+// is smoothed, its residual is coarsened onto level k+1, and the coarse
+// correction is computed either by a V-cycle (when k+1 > NUMKCYCLES) or by
+// one or two recursive K-cycles combined through kcycleOp1/kcycleOp2. The
+// correction is then prolongated and the level is smoothed again.
+void solver_t::kcycle(int k){
+
+  multigridLevel *level = levels[k];
+
+  dlong m = level->Nrows;
+
+  dfloat* rhs = level->rhs;
+  dfloat*   x = level->x;
+  dfloat* res = level->res;
+
+  //check for base level
+  if(k==baseLevel) {
+    coarseLevel->solve(rhs, x);
+    return;
+  }
+
+  multigridLevel *levelC = levels[k+1];
+  dlong mCoarse = levelC->Nrows;
+  dfloat* rhsC   = levelC->rhs;
+  dfloat*   xC   = levelC->x;
+
+  //apply smoother to x and then return res = rhs-Ax
+  level->smooth(rhs, x, true);
+  level->residual(rhs, x, res);
+
+  // rhsC = P^T res
+  levelC->coarsen(res, rhsC);
+
+  if(k+1>NUMKCYCLES) {
+    // beyond the K-cycle depth: continue with a plain V-cycle
+    this->vcycle(k+1);
+  } else{
+    // first inner krylov iteration
+    this->kcycle(k+1);
+
+    // ck = x
+    // alpha1=ck*rhsC, rho1=ck*Ack, norm_rhs=sqrt(rhsC*rhsC)
+    // rhsC = rhsC - (alpha1/rho1)*vkp1
+    // norm_rtilde = sqrt(rhsC*rhsC)
+    dfloat rho1, alpha1, norm_rhs, norm_rhstilde;
+    levelC->kcycleOp1(&alpha1, &rho1, &norm_rhs, &norm_rhstilde);
+
+    // the first iteration reduced the residual enough: only rescale xC
+    if(norm_rhstilde < KCYCLETOL*norm_rhs){
+      // xC = (alpha1/rho1)*xC
+      vectorScale(mCoarse, alpha1/rho1, xC);
+    } else{
+
+      // second inner krylov iteration
+      this->kcycle(k+1);
+
+      // gamma=xC*Ack, beta=xC*AxC, alpha2=xC*rhsC
+      // rho2=beta - gamma*gamma/rho1
+      // xC = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*xC
+      levelC->kcycleOp2(alpha1, rho1);
+    }
+  }
+
+  // x = x + P xC
+  levelC->prolongate(xC, x);
+
+  level->smooth(rhs, x, false);
+}
+
+
+// Device K-cycle starting at level k.
+// Levels with fewer than GPU_CPU_SWITCH_SIZE rows are handed to the host
+// kcycle: rhs is copied to the host, the host cycle runs, and x is copied
+// back. Otherwise the steps mirror the host kcycle on the o_* buffers.
+void solver_t::device_kcycle(int k){
+
+  multigridLevel *level = levels[k];
+
+  dlong m = level->Nrows;
+
+  occa::memory o_rhs = level->o_rhs;
+  occa::memory o_x   = level->o_x;
+  occa::memory o_res = level->o_res;
+
+  //check for device<->host handoff
+  if(m < GPU_CPU_SWITCH_SIZE){
+    o_rhs.copyTo(level->rhs, m*sizeof(dfloat));
+    this->kcycle(k);
+    o_x.copyFrom(level->x, m*sizeof(dfloat));
+    return;
+  }
+
+  //check for base level
+  if(k==baseLevel) {
+    coarseLevel->solve(o_rhs, o_x);
+    return;
+  }
+
+  multigridLevel *levelC = levels[k+1];
+  dlong mCoarse = levelC->Nrows;
+  occa::memory o_rhsC = levelC->o_rhs;
+  occa::memory o_xC   = levelC->o_x;
+
+  //apply smoother to x and then compute res = rhs-Ax
+  level->smooth(o_rhs, o_x, true);
+  level->residual(o_rhs, o_x, o_res);
+
+  // rhsC = P^T res
+  levelC->coarsen(o_res, o_rhsC);
+
+  if(k+1>NUMKCYCLES) {
+    // beyond the K-cycle depth: continue with a plain V-cycle
+    this->device_vcycle(k+1);
+  } else{
+    // first inner krylov iteration
+    this->device_kcycle(k+1);
+
+    // alpha1=ck*rhsC, rho1=ck*Ack, norm_rhs=sqrt(rhsC*rhsC)
+    // rhsC = rhsC - (alpha1/rho1)*vkp1
+    // norm_rtilde = sqrt(rhsC*rhsC)
+    dfloat rho1, alpha1, norm_rhs, norm_rhstilde;
+    levelC->device_kcycleOp1(&alpha1, &rho1, &norm_rhs, &norm_rhstilde);
+
+    // the first iteration reduced the residual enough: only rescale xC
+    if(norm_rhstilde < KCYCLETOL*norm_rhs){
+      // xC = (alpha1/rho1)*xC
+      vectorScale(mCoarse, alpha1/rho1, o_xC);
+    } else{
+
+      // second inner krylov iteration
+      this->device_kcycle(k+1);
+
+      // gamma=xC*Ack, beta=xC*AxC, alpha2=xC*rhsC
+      // rho2=beta - gamma*gamma/rho1
+      // xC = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ck + (alpha2/rho2)*xC
+      levelC->device_kcycleOp2(alpha1, rho1);
+    }
+  }
+
+  // x = x + P xC
+  levelC->prolongate(o_xC, o_x);
+  level->smooth(o_rhs, o_x, false);
+}
+
+
+
+// Host V-cycle starting at level k: smooth, coarsen the residual onto level
+// k+1, recurse, prolongate the coarse correction and smooth again. The base
+// level is handed to the coarse solver.
+void solver_t::vcycle(int k) {
+
+  multigridLevel *fine = levels[k];
+
+  dfloat *rhsF = fine->rhs;
+  dfloat *xF   = fine->x;
+  dfloat *resF = fine->res;
+
+  //check for base level
+  if(k==baseLevel) {
+    coarseLevel->solve(rhsF, xF);
+    return;
+  }
+
+  multigridLevel *coarse = levels[k+1];
+  dfloat *rhsCoarse = coarse->rhs;
+  dfloat *xCoarse   = coarse->x;
+
+  //apply smoother to x and then compute res = rhs-Ax
+  fine->smooth(rhsF, xF, true);
+  fine->residual(rhsF, xF, resF);
+
+  // rhsC = P^T res
+  coarse->coarsen(resF, rhsCoarse);
+
+  this->vcycle(k+1);
+
+  // x = x + P xC
+  coarse->prolongate(xCoarse, xF);
+
+  fine->smooth(rhsF, xF, false);
+}
+
+
+// Device V-cycle starting at level k. Levels with fewer than
+// GPU_CPU_SWITCH_SIZE rows are handed to the host vcycle (rhs is downloaded,
+// x is uploaded afterwards); otherwise the steps mirror the host vcycle on
+// the o_* buffers.
+void solver_t::device_vcycle(int k){
+
+  multigridLevel *fine = levels[k];
+
+  dlong Nfine = fine->Nrows;
+
+  occa::memory o_rhsF = fine->o_rhs;
+  occa::memory o_xF   = fine->o_x;
+  occa::memory o_resF = fine->o_res;
+
+  //check for device<->host handoff
+  if(Nfine < GPU_CPU_SWITCH_SIZE){
+    o_rhsF.copyTo(fine->rhs, Nfine*sizeof(dfloat));
+    vcycle(k);
+    o_xF.copyFrom(fine->x, Nfine*sizeof(dfloat));
+    return;
+  }
+
+  //check for base level
+  if(k==baseLevel) {
+    coarseLevel->solve(o_rhsF, o_xF);
+    return;
+  }
+
+  multigridLevel *coarse = levels[k+1];
+  occa::memory o_rhsCoarse = coarse->o_rhs;
+  occa::memory o_xCoarse   = coarse->o_x;
+
+  //apply smoother to x and then compute res = rhs-Ax
+  fine->smooth(o_rhsF, o_xF, true);
+  fine->residual(o_rhsF, o_xF, o_resF);
+
+  // rhsC = P^T res
+  coarse->coarsen(o_resF, o_rhsCoarse);
+
+  this->device_vcycle(k+1);
+
+  // x = x + P xC
+  coarse->prolongate(o_xCoarse, o_xF);
+
+  fine->smooth(o_rhsF, o_xF, false);
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/parAlmond.cpp b/libs/parAlmond/src/parAlmond.cpp
new file mode 100644
index 000000000..7259d0792
--- /dev/null
+++ b/libs/parAlmond/src/parAlmond.cpp
@@ -0,0 +1,107 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+
+solver_t *Init(occa::device device, MPI_Comm comm, setupAide options) {
+  // New solver handle; the kernels are built only while Nrefs is still zero.
+  solver_t *solver = new solver_t(device, comm, options);
+  const bool firstHandle = (Nrefs==0);
+  if (firstHandle) buildParAlmondKernels(comm, device);
+  Nrefs++;
+  return solver;
+}
+
+void AMGSetup(solver_t *MM,
+               hlong* globalRowStarts,       //global partition
+               dlong nnz,                    //--
+               hlong* Ai,                    //-- Local A matrix data (globally indexed, COO storage, row sorted)
+               hlong* Aj,                    //--
+               dfloat* Avals,                //--
+               bool nullSpace,
+               dfloat nullSpacePenalty){
+  // Wraps the caller's COO triplets in a parCSR and passes it to
+  // solver_t::AMGSetup. Rank 0 prints progress messages around the setup.
+  solver_t *M = MM;
+  const int rank = M->rank; // set by the solver_t constructor from M->comm
+
+  // entry [M->size] is read as the global row count
+  hlong TotalRows = globalRowStarts[M->size];
+  dlong numLocalRows = (dlong) (globalRowStarts[M->rank+1]-globalRowStarts[M->rank]);
+
+  if (rank==0) printf("Setting up AMG...");
+  fflush(stdout); // flushed on every rank, not only rank 0
+
+  //populate null space vector (constant entries 1/sqrt(TotalRows))
+  dfloat *null = (dfloat *) calloc(numLocalRows, sizeof(dfloat));
+  for (dlong i=0;i<numLocalRows;i++) null[i] = 1/sqrt(TotalRows);
+
+  parCSR *A = new parCSR(numLocalRows,globalRowStarts,
+                          nnz, Ai, Aj, Avals,
+                          nullSpace, null, nullSpacePenalty,
+                          M->comm, M->device);
+  free(null); // NOTE(review): assumes parCSR keeps its own copy -- confirm
+
+  M->AMGSetup(A);
+
+  if (rank==0) printf("done.\n");
+}
+
+void Precon(solver_t *M, occa::memory o_x, occa::memory o_rhs) {
+  // Point the finest level at the caller's device vectors, then run either
+  // the "exact" Krylov solve (PCG/GMRES) or a single multigrid cycle.
+  multigridLevel *fine = M->levels[0];
+  fine->o_x   = o_x;
+  fine->o_rhs = o_rhs;
+
+  if (M->exact) {
+    if (M->ktype==PCG)   { M->device_pcg(1000,1e-8);    return; }
+    if (M->ktype==GMRES) { M->device_pgmres(1000,1e-8); return; }
+  }
+
+  if      (M->ctype==KCYCLE) M->device_kcycle(0);
+  else if (M->ctype==VCYCLE) M->device_vcycle(0);
+}
+
+void Report(solver_t *M) { // free-function entry point for the level report
+  M->Report(); // forwards to solver_t::Report()
+}
+
+void Free(solver_t* M) {
+  // Drop one reference; when the count reaches zero the shared kernels and
+  // both scratch areas are released before the solver itself is deleted.
+  if (--Nrefs == 0) {
+    freeParAlmondKernels();
+    freeScratchSpace();
+    freePinnedScratchSpace();
+  }
+  delete M;
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/pcg.cpp b/libs/parAlmond/src/pcg.cpp
new file mode 100644
index 000000000..0bf5e7492
--- /dev/null
+++ b/libs/parAlmond/src/pcg.cpp
@@ -0,0 +1,240 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+void solver_t::pcg(const int maxIt, const dfloat tol){
+  // Host preconditioned conjugate gradient on the finest level (levels[0]).
+  // levels[0]->rhs serves as the residual and is overwritten; the solution
+  // is accumulated in a temporary and copied into levels[0]->x on exit.
+  // tol is compared against the residual as r.r versus tol*tol.
+  const dlong m = levels[0]->Nrows;
+  const dlong n = levels[0]->Ncols;
+
+  ktype = PCG;
+
+  // use parAlmond's buffers
+  dfloat *r = levels[0]->rhs;
+  dfloat *z = levels[0]->x;
+
+  // initial residual
+  dfloat rdotr0 = vectorInnerProd(m, r, r, levels[0]->comm);
+
+  // zero-filled work vectors; x is the zero initial guess
+  dfloat *x  = (dfloat *) calloc(n,sizeof(dfloat));
+  dfloat *Ap = (dfloat *) calloc(n,sizeof(dfloat));
+  dfloat *p  = (dfloat *) calloc(n,sizeof(dfloat));
+
+  //sanity check
+  if (rdotr0<=(tol*tol)) {
+    memcpy(levels[0]->x, x, m*sizeof(dfloat));
+    free(x); free(p); free(Ap);
+    return;
+  }
+
+  // Precondition, z = M^{-1}*r
+  if(ctype==KCYCLE) {
+    this->kcycle(0);
+  } else if(ctype==VCYCLE) {
+    this->vcycle(0);
+  }
+  memcpy(p, z, m*sizeof(dfloat));
+
+  dfloat rdotz0 = vectorInnerProd(m, r, z, levels[0]->comm);
+
+  // per-iteration scalars: declared once here and assigned inside the loop
+  dfloat rdotr1 = 0;
+  dfloat rdotz1 = 0;
+  dfloat alpha, beta, pAp;
+
+  int Niter = 0;
+  while(rdotr0>(tol*tol)){
+    //   Ap = A*p;
+    levels[0]->Ax(p, Ap);
+    pAp = vectorInnerProd(m, p, Ap, levels[0]->comm);
+    alpha = rdotz0/pAp;
+
+    // update solution
+    //    x = x + alpha * p;
+    vectorAdd(m, alpha, p, 1.0, x);
+
+    // update residual
+    // r = r - alpha * Ap;
+    vectorAdd(m, -alpha, Ap, 1.0, r);
+
+    rdotr1 = vectorInnerProd(m, r, r, levels[0]->comm);
+
+    if(rdotr1 < tol*tol) {
+      rdotr0 = rdotr1;
+      break;
+    }
+
+    // Precondition, z = M^{-1}*r
+    if(ctype==KCYCLE) {
+      this->kcycle(0);
+    } else if(ctype==VCYCLE) {
+      this->vcycle(0);
+    }
+
+    rdotz1 = vectorInnerProd(m, r, z, levels[0]->comm);
+
+    if(ctype==KCYCLE) {
+      // flexible pcg beta = (z.(-alpha*Ap))/zdotz0
+      dfloat zdotAp = vectorInnerProd(m, z, Ap, levels[0]->comm);
+      beta = -alpha*zdotAp/rdotz0;
+    } else {
+      beta = rdotz1/rdotz0;
+    }
+
+    // p = z + beta*p
+    vectorAdd(m, 1.0, z, beta, p);
+
+    // switch rdotz0,rdotr0 <= rdotz1,rdotr1
+    rdotz0 = rdotz1;
+    rdotr0 = rdotr1;
+    Niter++;
+
+    // NOTE(review): printed by every rank on every iteration -- confirm intended
+    printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0));
+    if(Niter==maxIt) break;
+  }
+
+  //copy result back to parAlmond's x storage
+  memcpy(levels[0]->x, x, m*sizeof(dfloat));
+  free(x); free(p); free(Ap);
+}
+
+void solver_t::device_pcg(const int maxIt, const dfloat tol){
+  // Device preconditioned conjugate gradient on the finest level (levels[0]).
+  // levels[0]->o_rhs serves as the residual and is overwritten; the solution
+  // is accumulated in a temporary and copied into levels[0]->o_x on exit.
+  // tol is compared against the residual as r.r versus tol*tol.
+  const dlong m = levels[0]->Nrows;
+  const dlong n = levels[0]->Ncols;
+
+  ktype = PCG;
+
+  // use parAlmond's buffers
+  occa::memory &o_r = levels[0]->o_rhs;
+  occa::memory &o_z = levels[0]->o_x;
+
+  // initial residual
+  dfloat rdotr0 = vectorInnerProd(m, o_r, o_r, levels[0]->comm);
+
+  // work vectors; levels[0]->x is passed as the host source pointer
+  occa::memory o_x  = device.malloc(n*sizeof(dfloat),levels[0]->x);
+  occa::memory o_Ap = device.malloc(n*sizeof(dfloat),levels[0]->x);
+  occa::memory o_p  = device.malloc(n*sizeof(dfloat),levels[0]->x);
+
+  //    x = 0;
+  vectorSet(m, 0.0, o_x);
+
+  //sanity check
+  if (rdotr0<=(tol*tol)) {
+    levels[0]->o_x.copyFrom(o_x);
+    printf("Almond PCG iter %d, res = %g\n", 0, sqrt(rdotr0));
+    o_x.free(); o_p.free(); o_Ap.free();
+    return;
+  }
+
+  // Precondition, z = M^{-1}*r
+  if(ctype==KCYCLE) {
+    this->device_kcycle(0);
+  } else if(ctype==VCYCLE) {
+    this->device_vcycle(0);
+  }
+  o_p.copyFrom(o_z);
+
+  dfloat rdotz0 = vectorInnerProd(m, o_r, o_z, levels[0]->comm);
+
+  // per-iteration scalars: declared once here and assigned inside the loop
+  dfloat rdotr1 = 0;
+  dfloat rdotz1 = 0;
+  dfloat alpha, beta, pAp;
+
+  int Niter = 0;
+  while(rdotr0>(tol*tol)){
+    //   Ap = A*p;
+    levels[0]->Ax(o_p, o_Ap);
+
+    pAp = vectorInnerProd(m, o_p, o_Ap, levels[0]->comm);
+    alpha = rdotz0/pAp;
+
+    // update solution
+    //    x = x + alpha * p;
+    vectorAdd(m, alpha, o_p, 1.0, o_x);
+
+    // update residual
+    // r = r - alpha * Ap;
+    vectorAdd(m, -alpha, o_Ap, 1.0, o_r);
+
+    rdotr1 = vectorInnerProd(m, o_r, o_r, levels[0]->comm);
+
+    if(rdotr1 < tol*tol) {
+      rdotr0 = rdotr1;
+      break;
+    }
+
+    // Precondition, z = M^{-1}*r
+    if(ctype==KCYCLE) {
+      this->device_kcycle(0);
+    } else if(ctype==VCYCLE) {
+      this->device_vcycle(0);
+    }
+
+    rdotz1 = vectorInnerProd(m, o_r, o_z, levels[0]->comm);
+
+    if(ctype==KCYCLE) {
+      // flexible pcg beta = (z.(-alpha*Ap))/zdotz0
+      dfloat zdotAp = vectorInnerProd(m, o_z, o_Ap, levels[0]->comm);
+      beta = -alpha*zdotAp/rdotz0;
+    } else { // plain else, as in pcg(): beta is assigned on every path
+      beta = rdotz1/rdotz0;
+    }
+
+    // p = z + beta*p
+    vectorAdd(m, 1.0, o_z, beta, o_p);
+
+    // switch rdotz0,rdotr0 <= rdotz1,rdotr1
+    rdotz0 = rdotz1;
+    rdotr0 = rdotr1;
+
+    Niter++;
+
+    if(Niter==maxIt) break;
+  }
+
+  //copy result back to parAlmond's x storage
+  levels[0]->o_x.copyFrom(o_x);
+
+  printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0));
+
+  o_x.free(); o_p.free(); o_Ap.free();
+}
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/solvers/parALMOND/src/gmres.c b/libs/parAlmond/src/pgmres.cpp
similarity index 58%
rename from solvers/parALMOND/src/gmres.c
rename to libs/parAlmond/src/pgmres.cpp
index 7d91efc48..35671912a 100644
--- a/solvers/parALMOND/src/gmres.c
+++ b/libs/parAlmond/src/pgmres.cpp
@@ -24,7 +24,9 @@ SOFTWARE.
 
 */
 
-#include "agmg.h"
+#include "parAlmond.hpp"
+
+namespace parAlmond {
 
 void gmresUpdate(dlong Nrows,
                  dfloat *x,
@@ -46,14 +48,13 @@ void gmresUpdate(dlong Nrows,
   }
 
   for(int j=0; j<Niter; ++j){
-    for(dlong n=0; n<Nrows; ++n)
-      x[n] += y[j]*V[j][n];
+    vectorAdd(Nrows, y[j], V[j], 1.0, x);
   }
 
   free(y);
 }
 
-void gmresUpdate(parAlmond_t *parAlmond, dlong Nrows,
+void gmresUpdate(dlong Nrows,
                  occa::memory o_x,
                  occa::memory *o_V,
                  dfloat *H,
@@ -61,7 +62,7 @@ void gmresUpdate(parAlmond_t *parAlmond, dlong Nrows,
                  int Niter,
                  int maxIt){
 
-  dfloat *y = (dfloat *) calloc(Niter+1, sizeof(dfloat));
+  dfloat *y = (dfloat *) calloc(Niter, sizeof(dfloat));
 
   for(int k=Niter-1; k>=0; --k){
     y[k] = s[k];
@@ -73,60 +74,47 @@ void gmresUpdate(parAlmond_t *parAlmond, dlong Nrows,
   }
 
   for(int j=0; j<Niter; ++j){
-    vectorAdd(parAlmond, Nrows, y[j], o_V[j], 1.0, o_x);
+    vectorAdd(Nrows, y[j], o_V[j], 1.0, o_x);
   }
 
   free(y);
 }
 
-void pgmres(parAlmond_t *parAlmond,
-           int maxIt,
-           dfloat tol){
+void solver_t::pgmres(const int maxIt,
+                      const dfloat tol){
 
-  csr *A = parAlmond->levels[0]->A;
+  const dlong m = levels[0]->Nrows;
+  const dlong n = levels[0]->Ncols;
 
-  const dlong m = A->Nrows;
-  // const dlong n = A->Ncols;
-
-  parAlmond->ktype = GMRES;
+  ktype = GMRES;
 
   // use parAlmond's buffers
-  dfloat *r = parAlmond->levels[0]->rhs;
-  dfloat *z = parAlmond->levels[0]->x;
+  dfloat *r = levels[0]->rhs;
+  dfloat *z = levels[0]->x;
 
   // initial residual
-  dfloat nbLocal = innerProd(m, r, r);
-  dfloat nb = 0;
-  MPI_Allreduce(&nbLocal,&nb,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-  nb = sqrt(nb);
+  dfloat nb = sqrt(vectorInnerProd(m, r, r, levels[0]->comm));
 
   //    x = 0;
-  dfloat *x  = (dfloat *) calloc(m,sizeof(dfloat));
-  setVector(m, x, 0.0);
+  dfloat *x  = (dfloat *) calloc(n,sizeof(dfloat));
+  vectorSet(m, 0.0, x);
 
   //sanity check
   if (nb<=tol) {
-    for (dlong i=0;i<m;i++)
-      parAlmond->levels[0]->x[i] = x[i];
-
-    free(x); 
+    memcpy(levels[0]->x, x, m*sizeof(dfloat));
+    free(x);
     return;
   }
 
   // M r = b - A*x0
-  if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-    kcycle(parAlmond, 0);
-  } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-    vcycle(parAlmond, 0);
-  } else {
-    for (dlong k=0;k<m;k++)
-      z[k] = r[k];  
+  if(ctype==KCYCLE) {
+    this->kcycle(0);
+  } else if(ctype==VCYCLE) {
+    this->vcycle(0);
   }
-  for (dlong k=0;k<m;k++)
-    r[k] = z[k];
+  memcpy(r, z, m*sizeof(dfloat));
 
-  dfloat nr = innerProd(m, r, r);
-  nr = sqrt(nr);
+  dfloat nr = sqrt(vectorInnerProd(m, r, r, levels[0]->comm));
 
   dfloat *s = (dfloat *) calloc(maxIt+1, sizeof(dfloat));
   s[0] = nr;
@@ -152,26 +140,20 @@ void pgmres(parAlmond_t *parAlmond,
 
     Niter = i+1;
     // Av = A*V(:.i)
-    axpy(A, 1.0, V[i], 0.0, Av,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
+    levels[0]->Ax(V[i], Av);
 
     // M w = A vi
-    for (dlong k=0;k<m;k++)
-      r[k] = Av[k];
-    if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-      kcycle(parAlmond, 0);
-    } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-      vcycle(parAlmond, 0);
-    } else {
-      for (dlong k=0;k<m;k++)
-        z[k] = r[k];  
+    memcpy(r, Av, m*sizeof(dfloat));
+
+    if(ctype==KCYCLE) {
+      this->kcycle(0);
+    } else if(ctype==VCYCLE) {
+      this->vcycle(0);
     }
-    for (dlong k=0;k<m;k++)
-      w[k] = z[k];
+    memcpy(w, z, m*sizeof(dfloat));
 
     for(int k=0; k<=i; ++k){
-      dfloat hkiLocal = innerProd(m, w, V[k]);
-      dfloat hki = 0.;
-      MPI_Allreduce(&hkiLocal,&hki,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
+      dfloat hki = vectorInnerProd(m, w, V[k], levels[0]->comm);
 
       // w = w - hki*V[k]
       vectorAdd(m, -hki, V[k], 1.0, w);
@@ -180,9 +162,7 @@ void pgmres(parAlmond_t *parAlmond,
       H[k + i*(maxIt+1)] = hki;
     }
 
-    dfloat wdotwLocal = innerProd(m, w, w);
-    dfloat wdotw = 0.;
-    MPI_Allreduce(&wdotwLocal,&wdotw,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
+    dfloat wdotw = vectorInnerProd(m, w, w, levels[0]->comm);
 
     H[i+1 + i*(maxIt+1)] = sqrt(wdotw);
 
@@ -215,10 +195,7 @@ void pgmres(parAlmond_t *parAlmond,
     if(fabs(s[i+1]) < tol) break;
 
     if(i < maxIt-1){
-      dfloat wdotwLocal = innerProd(m, w, w);
-      dfloat wdotw = 0.;
-      MPI_Allreduce(&wdotwLocal,&wdotw,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
+      dfloat wdotw = vectorInnerProd(m, w, w, levels[0]->comm);
       dfloat nw = sqrt(wdotw);
 
       // V(:,i+1) = w/nw
@@ -229,10 +206,9 @@ void pgmres(parAlmond_t *parAlmond,
   gmresUpdate(m, x, V, H, s, Niter, maxIt);
 
   //copy result back to parAlmond's x storage
-  for (dlong i=0;i<m;i++)
-    parAlmond->levels[0]->x[i] = x[i];
+  memcpy(levels[0]->x, x, m*sizeof(dfloat));
 
-  free(x); 
+  free(x);
   free(s); free(V);
   free(H); free(J);
   free(Av); free(w);
@@ -241,65 +217,52 @@ void pgmres(parAlmond_t *parAlmond,
     printf("gmres did not converge in given number of iterations\n");
 }
 
-//TODO need to link this with MPI
-void device_pgmres(parAlmond_t *parAlmond,
-           int maxIt,
-           dfloat tol){
-
-  hyb* A = parAlmond->levels[0]->deviceA;
+void solver_t::device_pgmres(const int maxIt,
+                             const dfloat tol){
 
-  const dlong m = A->Nrows;
-  // const dlong n = A->Ncols;
+  const dlong m = levels[0]->Nrows;
+  const dlong n = levels[0]->Ncols;
 
   // use parAlmond's buffers
-  occa::memory &o_r = parAlmond->levels[0]->o_rhs;
-  occa::memory &o_z = parAlmond->levels[0]->o_x;
+  occa::memory &o_r = levels[0]->o_rhs;
+  occa::memory &o_z = levels[0]->o_x;
 
   // initial residual
-  dfloat nbLocal = innerProd(parAlmond, m, o_r, o_r);
-  dfloat nb = 0;
-  MPI_Allreduce(&nbLocal,&nb,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-  nb = sqrt(nb);
+  dfloat nb = sqrt(vectorInnerProd(m, o_r, o_r, levels[0]->comm));
 
-  dfloat *dummy = (dfloat*) calloc(m, sizeof(dfloat));
-  occa::memory  o_x = parAlmond->device.malloc(m*sizeof(dfloat), dummy);
-  occa::memory  o_Av= parAlmond->device.malloc(m*sizeof(dfloat), dummy);
-  occa::memory  o_w = parAlmond->device.malloc(m*sizeof(dfloat), dummy);
+  occa::memory  o_x = device.malloc(n*sizeof(dfloat), levels[0]->x);
+  occa::memory  o_Av= device.malloc(n*sizeof(dfloat), levels[0]->x);
+  occa::memory  o_w = device.malloc(n*sizeof(dfloat), levels[0]->x);
 
   //sanity check
   if (nb<=tol) {
-    parAlmond->levels[0]->o_x.copyFrom(o_x);
+    levels[0]->o_x.copyFrom(o_x);
     printf("Almond PGMRES iter %d, res = %g\n", 0, nb);
     o_x.free(); o_Av.free(); o_w.free();
     return;
   }
 
   // M r = b - A*x0
-  if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-    device_kcycle(parAlmond, 0);
-  } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-    device_vcycle(parAlmond, 0);
-  } else {
-    o_z.copyFrom(o_r);
+  if(ctype==KCYCLE) {
+    this->device_kcycle(0);
+  } else if(ctype==VCYCLE) {
+    this->device_vcycle(0);
   }
   o_r.copyFrom(o_z);
 
-  dfloat nrLocal = innerProd(parAlmond, m, o_r, o_r);
-  dfloat nr = 0;
-  MPI_Allreduce(&nrLocal,&nr,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-  nr = sqrt(nr);
+
+  dfloat nr = sqrt(vectorInnerProd(m, o_r, o_r, levels[0]->comm));
 
   dfloat *s = (dfloat *) calloc(maxIt+1, sizeof(dfloat));
   s[0] = nr;
 
   occa::memory *o_V = (occa::memory *) calloc(maxIt, sizeof(occa::memory));
   for(int i=0; i<maxIt; ++i){
-    o_V[i] = parAlmond->device.malloc(m*sizeof(dfloat), dummy);
+    o_V[i] = device.malloc(n*sizeof(dfloat), levels[0]->x);
   }
-  free(dummy);
 
   // V(:,0) = r/nr
-  vectorAdd(parAlmond, m, (1./nr), o_r, 0., o_V[0]);
+  vectorAdd(m, (1./nr), o_r, 0., o_V[0]);
 
   dfloat *H = (dfloat *) calloc((maxIt+1)*(maxIt+1), sizeof(dfloat));
   dfloat *J = (dfloat *) calloc(4*maxIt, sizeof(dfloat));
@@ -312,33 +275,26 @@ void device_pgmres(parAlmond_t *parAlmond,
     Niter = i+1;
 
     // r = A*V(:.i)
-    axpy(parAlmond, A, 1.0, o_V[i], 0.0, o_r,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
+    levels[0]->Ax(o_V[i], o_r);
 
     // M w = A vi
-    if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-      device_kcycle(parAlmond, 0);
-    } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-      device_vcycle(parAlmond, 0);
-    } else {
-      o_z.copyFrom(o_r);
+    if(ctype==KCYCLE) {
+      this->device_kcycle(0);
+    } else if(ctype==VCYCLE) {
+      this->device_vcycle(0);
     }
 
     for(int k=0; k<=i; ++k){
-      dfloat hkiLocal = innerProd(parAlmond, m, o_z, o_V[k]);
-      dfloat hki = 0.;
-      MPI_Allreduce(&hkiLocal,&hki,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
+      dfloat hki = vectorInnerProd(m, o_z, o_V[k], levels[0]->comm);
 
       // w = w - hki*V[k]
-      vectorAdd(parAlmond, m, -hki, o_V[k], 1.0, o_z);
+      vectorAdd(m, -hki, o_V[k], 1.0, o_z);
 
       // H(k,i) = hki
       H[k + i*(maxIt+1)] = hki;
     }
 
-    dfloat nwLocal = innerProd(parAlmond, m, o_z, o_z);
-    dfloat nw = 0.;
-    MPI_Allreduce(&nwLocal,&nw,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-    nw = sqrt(nw);
+    dfloat nw = sqrt(vectorInnerProd(m, o_z, o_z, levels[0]->comm));
     H[i+1 + i*(maxIt+1)] = nw;
 
     for(int k=0; k<i; ++k){
@@ -371,14 +327,14 @@ void device_pgmres(parAlmond_t *parAlmond,
 
     if(i < maxIt-1){
       // V(:,i+1) = w/nw
-      vectorAdd(parAlmond, m, 1./nw, o_z, 0.0, o_V[i+1]);
+      vectorAdd(m, 1./nw, o_z, 0.0, o_V[i+1]);
     }
   }
 
-  gmresUpdate(parAlmond, m, o_x, o_V, H, s, Niter, maxIt);
+  gmresUpdate(m, o_x, o_V, H, s, Niter, maxIt);
 
   //copy result back to parAlmond's x storage
-  parAlmond->levels[0]->o_x.copyFrom(o_x);
+  levels[0]->o_x.copyFrom(o_x);
 
   printf("Almond PGMRES iter %d, res = %g\n", Niter, fabs(s[i+1]));
 
@@ -389,10 +345,12 @@ void device_pgmres(parAlmond_t *parAlmond,
     o_V[i].free();
   free((void*)o_V);
 
-  free(s); 
+  free(s);
   free(H); free(J);
 
   o_Av.free();
   o_w.free();
   o_x.free();
 }
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/libs/parAlmond/src/solver.cpp b/libs/parAlmond/src/solver.cpp
new file mode 100644
index 000000000..b80705dce
--- /dev/null
+++ b/libs/parAlmond/src/solver.cpp
@@ -0,0 +1,97 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+solver_t::solver_t(occa::device device_, MPI_Comm comm_,
+                   setupAide options_) {
+  // Stores the device/communicator/options, allocates an empty table of
+  // MAX_LEVELS level pointers and decodes the "PARALMOND ..." option strings.
+  device = device_;
+  comm = comm_;
+  MPI_Comm_rank(comm, &rank);
+  MPI_Comm_size(comm, &size);
+
+  levels = (multigridLevel **) calloc(MAX_LEVELS,sizeof(multigridLevel *));
+  numLevels = 0;
+  options = options_;
+
+  // Krylov method: GMRES when NONSYM is requested, PCG otherwise
+  if (options.compareArgs("PARALMOND CYCLE", "NONSYM")) {
+    ktype = GMRES;
+  } else {
+    ktype = PCG;
+  }
+
+  exact = options.compareArgs("PARALMOND CYCLE", "EXACT") ? true : false;
+
+  // cycle type: KCYCLE unless VCYCLE is requested
+  if(options.compareArgs("PARALMOND CYCLE", "VCYCLE"))
+    ctype = VCYCLE;
+  else
+    ctype = KCYCLE;
+
+  if (options.compareArgs("PARALMOND SMOOTHER", "CHEBYSHEV")) {
+    stype = CHEBYSHEV;
+    // defined start value: getArgs may leave it untouched if the key is absent
+    ChebyshevIterations = 0;
+    options.getArgs("PARALMOND CHEBYSHEV DEGREE", ChebyshevIterations);
+    if (!ChebyshevIterations) ChebyshevIterations=2; //default to 2
+  } else { //default to DAMPED_JACOBI
+    stype = DAMPED_JACOBI;
+  }
+}
+
+solver_t::~solver_t() {
+  // Delete the first numLevels level objects, then the pointer table itself.
+  int lev = 0;
+  while (lev < numLevels) delete levels[lev++];
+
+  free(levels);
+}
+
+void solver_t::Report() {
+  // Rank 0 prints the table frame and each level index; every rank then
+  // calls the level's own Report() for the remainder of the row.
+  const bool isRoot = (rank==0);
+  if (isRoot)
+    printf("------------------Multigrid Report----------------------------------------\n"
+           "--------------------------------------------------------------------------\n"
+           "level|    Type    |    dimension   |   nnz per row   |   Smoother        |\n"
+           "     |            |  (min,max,avg) |  (min,max,avg)  |                   |\n"
+           "--------------------------------------------------------------------------\n");
+
+  for (int lev=0; lev<numLevels; ++lev) {
+    if (isRoot) { printf(" %3d ", lev); fflush(stdout); }
+    levels[lev]->Report();
+  }
+
+  if (isRoot) printf("--------------------------------------------------------------------------\n");
+}
+
+}
\ No newline at end of file
diff --git a/solvers/parALMOND/src/timer.c b/libs/parAlmond/src/timer.cpp
similarity index 98%
rename from solvers/parALMOND/src/timer.c
rename to libs/parAlmond/src/timer.cpp
index e98545ebd..45ea1a1c2 100644
--- a/solvers/parALMOND/src/timer.c
+++ b/libs/parAlmond/src/timer.cpp
@@ -24,7 +24,7 @@ SOFTWARE.
 
 */
 
-#include "agmg.h"
+#include "parAlmond.hpp"
 
 void occaTimerTic(occa::device device,std::string name) {
   std::string profilerOn = occa::env::var("OCCA_PROFILE");
diff --git a/libs/parAlmond/src/utils.cpp b/libs/parAlmond/src/utils.cpp
new file mode 100644
index 000000000..1d1196a8f
--- /dev/null
+++ b/libs/parAlmond/src/utils.cpp
@@ -0,0 +1,244 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+//scratch space: malloc'd host buffer plus a device buffer created from it
+size_t scratchSpaceBytes=0;
+void *scratch=NULL;
+occa::memory o_scratch;
+//pinned scratch: mappedAlloc'd memory and the mapped pointer obtained from it
+size_t pinnedScratchSpaceBytes=0;
+void *pinnedScratch=NULL;
+occa::memory o_pinnedScratch;
+//reduction scratch: mappedAlloc'd once, 3*NBLOCKS*sizeof(dfloat) bytes
+size_t reductionScratchBytes=0;
+void *reductionScratch=NULL;
+occa::memory o_reductionScratch;
+
+void allocateScratchSpace(size_t requiredBytes, occa::device device) {
+  // Grow-only: the host/device scratch pair is replaced when more bytes are
+  // requested than currently held; the reduction buffer is created once.
+  if (scratchSpaceBytes<requiredBytes) {
+    if (scratchSpaceBytes!=0) {
+      free(scratch);
+      o_scratch.free();
+    }
+    scratch   = calloc(1, requiredBytes); // zero-filled host buffer
+    o_scratch = device.malloc(requiredBytes, scratch);
+    scratchSpaceBytes = requiredBytes;
+  }
+  if (reductionScratchBytes==0) {
+    reductionScratchBytes = 3*NBLOCKS*sizeof(dfloat);
+    o_reductionScratch = device.mappedAlloc(reductionScratchBytes);
+    reductionScratch = o_reductionScratch.getMappedPointer();
+  }
+}
+
+void freeScratchSpace() {
+  // Release both scratch areas and clear the host pointers so that no stale
+  // address is left behind in the globals.
+  if (scratchSpaceBytes!=0) {
+    free(scratch);
+    o_scratch.free();
+  }
+  scratch = NULL; scratchSpaceBytes = 0;
+  // reductionScratch is the mapped pointer of o_reductionScratch: no free()
+  if (reductionScratchBytes!=0) o_reductionScratch.free();
+  reductionScratch = NULL; reductionScratchBytes = 0;
+}
+
+void allocatePinnedScratchSpace(size_t requiredBytes, occa::device device) {
+  // Grow-only: nothing to do while the current buffer is large enough.
+  if (requiredBytes<=pinnedScratchSpaceBytes) return;
+
+  // drop the previous, smaller buffer before creating the new one
+  if (pinnedScratchSpaceBytes!=0) o_pinnedScratch.free();
+
+  o_pinnedScratch = device.mappedAlloc(requiredBytes);
+  pinnedScratch = o_pinnedScratch.getMappedPointer();
+  pinnedScratchSpaceBytes = requiredBytes;
+}
+
+void freePinnedScratchSpace() {
+  // pinnedScratch comes from o_pinnedScratch.getMappedPointer(), not malloc,
+  // so it must not go through free(); releasing the occa::memory is enough.
+  if (pinnedScratchSpaceBytes!=0) o_pinnedScratch.free();
+  pinnedScratch = NULL;
+  pinnedScratchSpaceBytes=0;
+}
+
+// compare on global indices
+int CompareGlobalId(const void *a, const void *b){
+  // Three-way comparison of two parallelId_t records: globalId decides,
+  // localId breaks ties. Returns -1, 0 or +1.
+  parallelId_t *lhs = (parallelId_t*) a;
+  parallelId_t *rhs = (parallelId_t*) b;
+
+  // (x > y) - (x < y) evaluates to +1, -1 or 0
+  const int byGlobal = (lhs->globalId > rhs->globalId)
+                     - (lhs->globalId < rhs->globalId);
+  if(byGlobal != 0) return byGlobal;
+
+  return (lhs->localId > rhs->localId) - (lhs->localId < rhs->localId);
+}
+
+// compare on local indices
+int CompareLocalId(const void *a, const void *b){
+  // Three-way comparison of two parallelId_t records: localId decides,
+  // globalId breaks ties. Returns -1, 0 or +1.
+  parallelId_t *lhs = (parallelId_t*) a;
+  parallelId_t *rhs = (parallelId_t*) b;
+
+  const int byLocal = (lhs->localId < rhs->localId) ? -1
+                    : (lhs->localId > rhs->localId) ? +1 : 0;
+  if(byLocal != 0) return byLocal;
+
+  return (lhs->globalId < rhs->globalId) ? -1
+       : (lhs->globalId > rhs->globalId) ? +1 : 0;
+}
+
+bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong i){
+  // Lexicographic test on (s, r, i) against (smax, rmax, imax): true only
+  // when the first triple is strictly greater than the "max" triple.
+  // primary key: s
+  if(s != smax) return s > smax;
+
+  // secondary key: r (two one-sided tests, so unordered values fall through)
+  if(r > rmax) return true;
+  if(rmax > r) return false;
+
+  // final key: i; fully equal triples yield false
+  return i > imax;
+}
+
+int compareOwner(const void *a, const void *b){
+  // three-way comparison of two parallelAggregate_t records by ownerRank
+  parallelAggregate_t *lhs = (parallelAggregate_t *) a;
+  parallelAggregate_t *rhs = (parallelAggregate_t *) b;
+
+  const bool below = lhs->ownerRank < rhs->ownerRank;
+  const bool above = lhs->ownerRank > rhs->ownerRank;
+  return below ? -1 : (above ? +1 : 0);
+}
+
+int compareAgg(const void *a, const void *b){
+  // order by coarseId first, then by originRank among equal coarseId
+  parallelAggregate_t *lhs = (parallelAggregate_t *) a;
+  parallelAggregate_t *rhs = (parallelAggregate_t *) b;
+
+  const int byCoarse = (lhs->coarseId > rhs->coarseId)
+                     - (lhs->coarseId < rhs->coarseId);
+  if (byCoarse != 0) return byCoarse;
+
+  return (lhs->originRank > rhs->originRank)
+       - (lhs->originRank < rhs->originRank);
+}
+
+int compareOrigin(const void *a, const void *b){
+  // three-way comparison of two parallelAggregate_t records by originRank
+  parallelAggregate_t *lhs = (parallelAggregate_t *) a;
+  parallelAggregate_t *rhs = (parallelAggregate_t *) b;
+
+  if (lhs->originRank < rhs->originRank) return -1;
+
+  return (lhs->originRank > rhs->originRank) ? +1 : 0;
+}
+
+int compareNonZeroByRow(const void *a, const void *b){
+  // Three-way comparison of two nonzero_t entries: row first, then col.
+  const nonzero_t *lhs = (const nonzero_t *) a;
+  const nonzero_t *rhs = (const nonzero_t *) b;
+
+  if (lhs->row < rhs->row) return -1;
+  if (lhs->row > rhs->row) return +1;
+
+  if (lhs->col < rhs->col) return -1;
+  if (lhs->col > rhs->col) return +1;
+  return 0;
+}
+
+
+void matrixInverse(int N, dfloat *A){
+  // In-place inverse of the dense N x N matrix A via LAPACK dgetrf followed
+  // by dgetri, carried out on a double-precision copy of A.
+  if (N<=0) return; // nothing to invert; dgetri rejects lwork < max(1,N)
+  int lwork = N*N;
+  int info = 0;
+  double *tmpInvA = (double*) calloc(N*N, sizeof(double));
+  int *ipiv = (int*) calloc(N, sizeof(int));
+  double *work = (double*) calloc(lwork, sizeof(double));
+
+  for(int n=0;n<N*N;++n)
+    tmpInvA[n] = A[n];
+
+  // dgetri runs only after a successful factorisation, so a dgetrf failure
+  // code is reported below instead of being overwritten
+  dgetrf_ (&N, &N, tmpInvA, &N, ipiv, &info);
+  if(info==0) dgetri_ (&N, tmpInvA, &N, ipiv, work, &lwork, &info);
+  if(info)
+    printf("inv: dgetrf/dgetri reports info = %d when inverting matrix\n", info);
+
+  for(int n=0;n<N*N;++n)
+    A[n] = tmpInvA[n];
+
+  free(work);
+  free(ipiv);
+  free(tmpInvA);
+}
+
+void eig(const int Nrows, double *A, double *WR, double *WI){
+  // Eigenvalues of the dense Nrows x Nrows matrix A via LAPACK dgeev:
+  // real parts are returned in WR, imaginary parts in WI. A is passed as
+  // dgeev's in/out matrix argument, so its contents are not preserved.
+  if(Nrows){
+    int NB  = 256;
+    char JOBVL  = 'V';
+    char JOBVR  = 'V';
+    int     N = Nrows;
+    int   LDA = Nrows;
+    int  LWORK  = (NB+2)*N;
+    // eigenvectors are requested ('V') but discarded below
+    double *WORK  = new double[LWORK];
+    double *VL  = new double[Nrows*Nrows];
+    double *VR  = new double[Nrows*Nrows];
+
+    int INFO = -999;
+    dgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI,
+      VL, &LDA, VR, &LDA, WORK, &LWORK, &INFO);
+    if(INFO) // previously ignored; report the failure instead of hiding it
+      printf("eig: dgeev reports info = %d when computing eigenvalues\n", INFO);
+
+    delete [] VL;
+    delete [] VR;
+    delete [] WORK;
+  }
+}
+
+
+} //namespace parAlmond
diff --git a/libs/parAlmond/src/vector.cpp b/libs/parAlmond/src/vector.cpp
new file mode 100644
index 000000000..80b0fa91f
--- /dev/null
+++ b/libs/parAlmond/src/vector.cpp
@@ -0,0 +1,365 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "parAlmond.hpp"
+
+namespace parAlmond {
+
+//------------------------------------------------------------------------
+//
+//  Host vector operations
+//
+//------------------------------------------------------------------------
+
+void vectorSet(const dlong m, const dfloat alpha, dfloat *a){
+  // a[i] = alpha for every i in [0,m)
+  // #pragma omp parallel for
+  for(dlong idx=0; idx<m; ++idx) a[idx] = alpha;
+}
+
+void vectorRandomize(const dlong m, dfloat *a){
+  // a[i] = next drand48() sample (cast to dfloat), drawn in order for i in [0,m)
+  // #pragma omp parallel for
+  for(dlong idx=0; idx<m; ++idx) a[idx] = (dfloat) drand48();
+}
+
+void vectorScale(const dlong m, const dfloat alpha, dfloat *a){
+  // a[i] is multiplied by alpha in place, for every i in [0,m)
+  // #pragma omp parallel for
+  for(dlong idx=0; idx<m; ++idx) a[idx] *= alpha;
+}
+
+void vectorAddScalar(const dlong m, const dfloat alpha, dfloat *a){
+  // alpha is added to a[i] in place, for every i in [0,m)
+  // #pragma omp parallel for
+  for(dlong idx=0; idx<m; ++idx) a[idx] += alpha;
+}
+
+// y = beta*y + alpha*x  (when beta == 0 the old contents of y are never read)
+void vectorAdd(const dlong n, const dfloat alpha, const dfloat *x,
+               const dfloat beta, dfloat *y){
+  if (!beta) {
+    // pure overwrite: y = alpha*x
+    // #pragma omp parallel for
+    for(dlong k=0; k<n; ++k) y[k] = alpha*x[k];
+    return;
+  }
+  // general update
+  // #pragma omp parallel for
+  for(dlong k=0; k<n; ++k) y[k] = beta*y[k] + alpha*x[k];
+}
+
+// z = beta*y + alpha*x, entry by entry for i in [0,n)
+void vectorAdd(const dlong n, const dfloat alpha, const dfloat *x,
+               const dfloat beta, const dfloat *y, dfloat *z){
+  // no beta == 0 shortcut in this overload: y is always read
+  // #pragma omp parallel for
+  for(dlong k=0; k<n; ++k) z[k] = beta*y[k] + alpha*x[k];
+}
+
+// b = a*b  (entrywise product, stored back into b)
+void vectorDotStar(const dlong m, const dfloat *a, dfloat *b){
+  // runs over i in [0,m)
+  // #pragma omp parallel for
+  for(dlong k=0; k<m; ++k) b[k] *= a[k];
+}
+
+// c = alpha*a*b + beta*c  (when beta == 0 the old contents of c are never read)
+void vectorDotStar(const dlong m, const dfloat alpha, const dfloat *a,
+                   const dfloat *b, const dfloat beta,  dfloat *c){
+  if (!beta) {
+    // pure overwrite: c = alpha*a*b
+    // #pragma omp parallel for
+    for(dlong k=0; k<m; ++k) c[k] = alpha*a[k]*b[k];
+    return;
+  }
+  // general update
+  // #pragma omp parallel for
+  for(dlong k=0; k<m; ++k) c[k] = beta*c[k]+ alpha*a[k]*b[k];
+}
+
+dfloat vectorNorm(const dlong n, const dfloat *a, MPI_Comm comm){
+  // Global 2-norm: local sum of squares, MPI_SUM over comm, then sqrt.
+  dfloat localSum = 0.;
+  // #pragma omp parallel for reduction(+:localSum)
+  for(dlong k=0; k<n; ++k) localSum += a[k]*a[k];
+  dfloat globalSum = 0.;
+  MPI_Allreduce(&localSum, &globalSum, 1, MPI_DFLOAT, MPI_SUM, comm);
+  return sqrt(globalSum);
+}
+
+dfloat vectorInnerProd(const dlong n, const dfloat *a, const dfloat *b,
+                       MPI_Comm comm){
+  // Global dot product of a and b: local partial sum, then MPI_SUM over comm.
+  dfloat localDot = 0.;
+  // #pragma omp parallel for reduction(+:localDot)
+  for(dlong k=0; k<n; ++k) localDot += a[k]*b[k];
+  dfloat globalDot = 0.;
+  MPI_Allreduce(&localDot, &globalDot, 1, MPI_DFLOAT, MPI_SUM, comm);
+  return globalDot;
+}
+
+dfloat vectorMaxAbs(const dlong n, const dfloat *a, MPI_Comm comm){
+  // Largest |a[i]| over all ranks of comm (MPI_MAX of the per-rank maxima).
+  dfloat localMax = 0.0;
+  //  #pragma omp parallel for reduction(max:localMax)
+  for(dlong k=0; k<n; ++k){
+    // manual absolute value of the current entry
+    const dfloat magnitude = (a[k] < 0) ? -a[k] : a[k];
+    if(localMax < magnitude) localMax = magnitude;
+  }
+
+  // combine the per-rank maxima
+  dfloat globalMax = 0.0;
+  MPI_Allreduce(&localMax, &globalMax, 1, MPI_DFLOAT, MPI_MAX, comm);
+  return globalMax;
+}
+
+// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b (each term scaled by w when weighted)
+void kcycleCombinedOp1(const dlong n, dfloat *aDotbc, const dfloat *a,
+                      const dfloat *b, const dfloat *c, const dfloat* w,
+                      const bool weighted, MPI_Comm comm) {
+  dfloat local[3] = {0.,0.,0.};
+  if (!weighted) {
+    // #pragma omp parallel for reduction(+:local[:3])
+    for(dlong k=0; k<n; ++k) {
+      local[0] += a[k]*b[k];
+      local[1] += a[k]*c[k];
+      local[2] += b[k]*b[k];
+    }
+  } else {
+    // #pragma omp parallel for reduction(+:local[:3])
+    for(dlong k=0; k<n; ++k) {
+      local[0] += w[k]*a[k]*b[k];
+      local[1] += w[k]*a[k]*c[k];
+      local[2] += w[k]*b[k]*b[k];
+    }
+  }
+  MPI_Allreduce(local,aDotbc,3,MPI_DFLOAT,MPI_SUM,comm);
+}
+
+// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d (each term scaled by w when weighted)
+void kcycleCombinedOp2(const dlong n, dfloat *aDotbcd, const dfloat *a,
+                       const dfloat *b, const dfloat *c, const dfloat* d,
+                       const dfloat *w, const bool weighted, MPI_Comm comm) {
+  dfloat local[3] = {0.,0.,0.};
+  if (!weighted) {
+    // #pragma omp parallel for reduction(+:local[:3])
+    for(dlong k=0; k<n; ++k) {
+      local[0] += a[k]*b[k];
+      local[1] += a[k]*c[k];
+      local[2] += a[k]*d[k];
+    }
+  } else {
+    // #pragma omp parallel for reduction(+:local[:3])
+    for(dlong k=0; k<n; ++k) {
+      local[0] += w[k]*a[k]*b[k];
+      local[1] += w[k]*a[k]*c[k];
+      local[2] += w[k]*a[k]*d[k];
+    }
+  }
+  MPI_Allreduce(local,aDotbcd,3,MPI_DFLOAT,MPI_SUM,comm);
+}
+
+
+
+// y = beta*y + alpha*x, and return y\dot y (w-weighted when requested), summed over comm
+dfloat vectorAddInnerProd(const dlong n, const dfloat alpha, const dfloat *x,
+                          const dfloat beta, dfloat *y,
+                          const dfloat *w, const bool weighted, MPI_Comm comm){
+  dfloat localDot = 0.;
+  dfloat globalDot = 0.;
+  if (!beta) {
+    if (!weighted) {
+      // #pragma omp parallel for reduction(+:localDot)
+      for(dlong k=0; k<n; ++k) {
+        y[k] = alpha*x[k];
+        localDot += y[k]*y[k];
+      }
+    } else {
+      // #pragma omp parallel for reduction(+:localDot)
+      for(dlong k=0; k<n; ++k) {
+        y[k] = alpha*x[k];
+        localDot += w[k]*y[k]*y[k];
+      }
+    }
+  } else {
+    if (!weighted) {
+      // #pragma omp parallel for reduction(+:localDot)
+      for(dlong k=0; k<n; ++k) {
+        y[k] = beta*y[k] + alpha*x[k];
+        localDot += y[k]*y[k];
+      }
+    } else {
+      // #pragma omp parallel for reduction(+:localDot)
+      for(dlong k=0; k<n; ++k) {
+        y[k] = beta*y[k] + alpha*x[k];
+        localDot += w[k]*y[k]*y[k];
+      }
+    }
+  }
+  MPI_Allreduce(&localDot,&globalDot,1,MPI_DFLOAT,MPI_SUM,comm);
+  return globalDot;
+}
+
+
+//------------------------------------------------------------------------
+//
+//  Device vector operations
+//
+//------------------------------------------------------------------------
+
+void vectorSet(const dlong N, const dfloat alpha, occa::memory o_a){
+  if (N != 0) { vectorSetKernel(N, alpha, o_a); } // launch skipped for an empty vector
+}
+
+//void vectorRandomize(const dlong m, occa::memory o_a)
+
+void vectorScale(const dlong N, const dfloat alpha, occa::memory o_a){
+  if (N != 0) { vectorScaleKernel(N, alpha, o_a); } // launch skipped for an empty vector
+}
+
+void vectorAddScalar(const dlong N, const dfloat alpha, occa::memory o_a){
+  if (N != 0) { vectorAddScalarKernel(N, alpha, o_a); } // launch skipped for an empty vector
+}
+
+void vectorAdd(const dlong N, const dfloat alpha, occa::memory o_x,
+               const dfloat beta, occa::memory o_y){
+  if (N != 0) { vectorAddKernel1(N, alpha, beta, o_x, o_y); } // kernel takes both scalars before the buffers
+}
+
+void vectorAdd(const dlong N, const dfloat alpha, occa::memory o_x,
+               const dfloat beta, occa::memory o_y, occa::memory o_z){
+  if (N != 0) { vectorAddKernel2(N, alpha, beta, o_x, o_y, o_z); } // kernel takes both scalars before the buffers
+}
+
+void vectorDotStar(const dlong N, occa::memory o_a, occa::memory o_b){
+  if (N != 0) { vectorDotStarKernel1(N, o_a, o_b); } // launch skipped for an empty vector
+}
+
+void vectorDotStar(const dlong N, const dfloat alpha, occa::memory o_a,
+                   occa::memory o_b, const dfloat beta, occa::memory o_c){
+  if (N != 0) { vectorDotStarKernel2(N, alpha, beta, o_a, o_b, o_c); } // kernel takes both scalars before the buffers
+}
+
+//dfloat vectorNorm(const dlong n, occa::memory o_a, MPI_Comm comm)
+
+dfloat vectorInnerProd(const dlong N, occa::memory o_x, occa::memory o_y,
+                       MPI_Comm comm){
+  // The kernel is handed o_reductionScratch; the first numBlocks entries are
+  // copied to the host, summed, and combined across comm with MPI_SUM.
+  dlong numBlocks = (N < NBLOCKS) ? N : NBLOCKS;
+  if (N) { // as in the other device wrappers here: no launch/copy for an empty vector
+    vectorInnerProdKernel(numBlocks,N,o_x,o_y,o_reductionScratch);
+    o_reductionScratch.copyTo(reductionScratch,numBlocks*sizeof(dfloat),0);
+  }
+  dfloat result =0., gresult = 0.;
+  //#pragma omp parallel for reduction(+:result)
+  for (dlong i=0; i<numBlocks; i++) result += ((dfloat*)reductionScratch)[i];
+  // collective: still executed on ranks with N == 0
+  MPI_Allreduce(&result, &gresult, 1, MPI_DFLOAT, MPI_SUM, comm);
+  return gresult;
+}
+
+//dfloat vectorMaxAbs(const dlong n, occa::memory o_a)
+
+// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b (weighted kernel used when requested)
+void kcycleCombinedOp1(const dlong N, dfloat *aDotbc, occa::memory o_a,
+                       occa::memory o_b, occa::memory o_c, occa::memory o_w,
+                       const bool weighted, MPI_Comm comm) {
+  dfloat result[3] = {0.,0.,0.};
+  dlong numBlocks = (N < NBLOCKS) ? N : NBLOCKS;
+
+  // as in the other device wrappers here: no launch/copy for an empty vector
+  if (N && weighted) {
+    kcycleWeightedCombinedOp1Kernel(numBlocks,N,o_a,o_b,o_c,o_w,o_reductionScratch);
+  } else if (N) {
+    kcycleCombinedOp1Kernel(numBlocks,N,o_a,o_b,o_c,o_reductionScratch);
+  }
+  if (N) o_reductionScratch.copyTo(reductionScratch,3*numBlocks*sizeof(dfloat),0);
+
+  // host-side sum of the three interleaved per-block partials
+  for(dlong i=0; i<numBlocks; i++) {
+    result[0] += ((dfloat*)reductionScratch)[3*i+0];
+    result[1] += ((dfloat*)reductionScratch)[3*i+1];
+    result[2] += ((dfloat*)reductionScratch)[3*i+2];
+  }
+  MPI_Allreduce(result,aDotbc,3,MPI_DFLOAT,MPI_SUM,comm); // collective: runs even when N == 0
+}
+
+// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d (weighted kernel used when requested)
+void kcycleCombinedOp2(const dlong N, dfloat *aDotbcd,
+                        occa::memory o_a, occa::memory o_b,
+                        occa::memory o_c, occa::memory o_d,
+                        occa::memory o_w, const bool weighted, MPI_Comm comm) {
+
+  dfloat result[3] = {0.,0.,0.};
+  dlong numBlocks = (N < NBLOCKS) ? N : NBLOCKS;
+
+  // as in the other device wrappers here: no launch/copy for an empty vector
+  if (N && weighted) {
+    kcycleWeightedCombinedOp2Kernel(numBlocks,N,o_a,o_b,o_c,o_d,o_w,o_reductionScratch);
+  } else if (N) {
+    kcycleCombinedOp2Kernel(numBlocks,N,o_a,o_b,o_c,o_d,o_reductionScratch);
+  }
+  if (N) o_reductionScratch.copyTo(reductionScratch,3*numBlocks*sizeof(dfloat),0);
+
+  // host-side sum of the three interleaved per-block partials
+  for(dlong i=0; i<numBlocks; i++) {
+    result[0] += ((dfloat*)reductionScratch)[3*i+0];
+    result[1] += ((dfloat*)reductionScratch)[3*i+1];
+    result[2] += ((dfloat*)reductionScratch)[3*i+2];
+  }
+  MPI_Allreduce(result,aDotbcd,3,MPI_DFLOAT,MPI_SUM,comm); // collective: runs even when N == 0
+}
+
+// y = beta*y + alpha*x, and return y\dot y (weighted kernel used when requested)
+dfloat vectorAddInnerProd(const dlong N, const dfloat alpha, occa::memory o_x,
+                          const dfloat beta, occa::memory o_y,
+                          occa::memory o_w, const bool weighted, MPI_Comm comm){
+
+  dfloat result = 0.;
+  dfloat gresult = 0.;
+  dlong numBlocks = (N < NBLOCKS) ? N : NBLOCKS;
+
+  // as in the other device wrappers here: no launch/copy for an empty vector
+  if (N && weighted) {
+    vectorAddWeightedInnerProdKernel(numBlocks,N,alpha,beta,o_x,o_y,o_w,o_reductionScratch);
+  } else if (N) {
+    vectorAddInnerProdKernel(numBlocks,N,alpha,beta,o_x,o_y,o_reductionScratch);
+  }
+  if (N) o_reductionScratch.copyTo(reductionScratch,numBlocks*sizeof(dfloat),0);
+
+  // #pragma omp parallel for reduction(+:result)
+  for (dlong i=0; i<numBlocks; i++) result += ((dfloat*)reductionScratch)[i];
+  // collective: still executed on ranks with N == 0
+  MPI_Allreduce(&result,&gresult,1,MPI_DFLOAT,MPI_SUM,comm);
+  return gresult;
+}
+
+
+
+} //namespace parAlmond
\ No newline at end of file
diff --git a/solvers/elliptic/elliptic.h b/solvers/elliptic/elliptic.h
index 6b6a50597..6c9a74352 100644
--- a/solvers/elliptic/elliptic.h
+++ b/solvers/elliptic/elliptic.h
@@ -34,7 +34,7 @@ SOFTWARE.
 #include "mpi.h"
 #include "mesh2D.h"
 #include "mesh3D.h"
-#include "parAlmond.h"
+#include "parAlmond.hpp"
 #include "ellipticPrecon.h"
 
 // block size for reduction (hard coded)
@@ -134,6 +134,8 @@ typedef struct {
 
 }elliptic_t;
 
+#include "ellipticMultiGrid.h"
+
 elliptic_t *ellipticSetup(mesh2D *mesh, dfloat lambda, occa::properties &kernelInfo, setupAide options);
 
 void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda, occa::memory &o_r, occa::memory &o_z);
@@ -158,10 +160,10 @@ dfloat ellipticCascadingWeightedInnerProduct(elliptic_t *elliptic, occa::memory
 void ellipticOperator(elliptic_t *elliptic, dfloat lambda, occa::memory &o_q, occa::memory &o_Aq, const char *precision);
 
 dfloat ellipticWeightedNorm2(elliptic_t *elliptic, occa::memory &o_w, occa::memory &o_a);
-void ellipticBuildIpdg(elliptic_t* elliptic, int basisNp, dfloat *basis, dfloat lambda, 
+void ellipticBuildIpdg(elliptic_t* elliptic, int basisNp, dfloat *basis, dfloat lambda,
                         nonZero_t **A, dlong *nnzA, hlong *globalStarts);
 
-void ellipticBuildContinuous(elliptic_t* elliptic, dfloat lambda, nonZero_t **A, 
+void ellipticBuildContinuous(elliptic_t* elliptic, dfloat lambda, nonZero_t **A,
                                   dlong *nnz, ogs_t **ogs, hlong *globalStarts);
 
 void ellipticBuildJacobi(elliptic_t *elliptic, dfloat lambda, dfloat **invDiagA);
@@ -169,17 +171,17 @@ void ellipticBuildJacobi(elliptic_t *elliptic, dfloat lambda, dfloat **invDiagA)
 void ellipticBuildLocalPatches(elliptic_t *elliptic, dfloat lambda, dfloat rateTolerance,
                                dlong *Npataches, dlong **patchesIndex, dfloat **patchesInvA);
 
-//smoother setups
-void ellipticSetupSmoother(elliptic_t *elliptic, precon_t *precon, dfloat lambda);
-void ellipticSetupSmootherDampedJacobi    (elliptic_t *elliptic, precon_t *precon, agmgLevel *level, dfloat lambda);
-void ellipticSetupSmootherLocalPatch(elliptic_t *elliptic, precon_t *precon, agmgLevel *level, dfloat lambda, dfloat rateTolerance);
+// //smoother setups
+// void ellipticSetupSmoother(elliptic_t *elliptic, precon_t *precon, dfloat lambda);
+// void ellipticSetupSmootherDampedJacobi    (elliptic_t *elliptic, precon_t *precon, agmgLevel *level, dfloat lambda);
+// void ellipticSetupSmootherLocalPatch(elliptic_t *elliptic, precon_t *precon, agmgLevel *level, dfloat lambda, dfloat rateTolerance);
 
 void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda);
 elliptic_t *ellipticBuildMultigridLevel(elliptic_t *baseElliptic, int Nc, int Nf);
 
 void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda);
 
-dfloat maxEigSmoothAx(elliptic_t* elliptic, agmgLevel *level);
+// dfloat maxEigSmoothAx(elliptic_t* elliptic, agmgLevel *level);
 
 #define maxNthreads 256
 
diff --git a/solvers/elliptic/ellipticMultiGrid.h b/solvers/elliptic/ellipticMultiGrid.h
new file mode 100644
index 000000000..a59904ed3
--- /dev/null
+++ b/solvers/elliptic/ellipticMultiGrid.h
@@ -0,0 +1,121 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef ELLIPTIC_MGLEVEL_HPP
+#define ELLIPTIC_MGLEVEL_HPP
+
+typedef enum {RICHARDSON=1,
+              CHEBYSHEV=2} SmoothType; // type of MGLevel::stype; presumably picks smoothRichardson vs smoothChebyshev — confirm
+typedef enum {JACOBI=1,
+              LOCALPATCH=2} SmootherType; // type of MGLevel::smtype; presumably picks smootherJacobi vs smootherLocalPatch — confirm
+
+// One multigrid level of the elliptic solver, derived from parAlmond::multigridLevel.
+class MGLevel: public parAlmond::multigridLevel {
+public:
+
+  elliptic_t* elliptic; // passed, with lambda, to ellipticOperator by Ax() and residual()
+  mesh_t* mesh;
+  dfloat lambda;
+
+  int degree;
+
+  //coarsener
+  dfloat *R;
+  occa::memory o_R;         // handed to precon->coarsenKernel in coarsen()
+  int NpF;                  // coarsen() scales mesh->Nelements*NpF entries by o_invDegree
+  occa::memory o_invDegree; // only applied when DISCRETIZATION is CONTINUOUS
+
+  //smoothing params
+  SmoothType stype;
+  SmootherType smtype;
+
+  dfloat lambda1, lambda0;
+  int ChebyshevIterations;
+
+  // static members: a single copy shared by every MGLevel instance
+  static size_t smootherResidualBytes;
+  static dfloat *smootherResidual;
+  static occa::memory o_smootherResidual;
+  static occa::memory o_smootherResidual2;
+  static occa::memory o_smootherUpdate;
+
+  //jacobi data
+  occa::memory o_invDiagA;
+
+  //local patch data
+  occa::memory o_invAP, o_patchesIndex, o_invDegreeAP;
+
+  setupAide options;
+
+  //build a single level
+  MGLevel(elliptic_t *ellipticBase, dfloat lambda_, int Nc,
+           setupAide options_, parAlmond::KrylovType ktype_, MPI_Comm comm_);
+  //build a level and connect it to the previous one
+  MGLevel(elliptic_t *ellipticBase, //finest level
+                   mesh_t **meshLevels,
+                   elliptic_t *ellipticFine, //previous level
+                   elliptic_t *ellipticCoarse, //current level
+                   dfloat lambda_,
+                   int Nf, int Nc,
+                   setupAide options_,
+                   parAlmond::KrylovType ktype_,
+                   MPI_Comm comm_);
+
+  // In each pair below the host-pointer overload has an empty body (a no-op);
+  void Ax(dfloat        *x, dfloat        *Ax) {};
+  void Ax(occa::memory o_x, occa::memory o_Ax);
+  // the occa::memory overload is only declared here.
+  void residual(dfloat        *rhs, dfloat        *x, dfloat        *res) {};
+  void residual(occa::memory o_rhs, occa::memory o_x, occa::memory o_res);
+
+  void coarsen(dfloat        *x, dfloat        *Cx) {};
+  void coarsen(occa::memory o_x, occa::memory o_Cx);
+
+  void prolongate(dfloat        *x, dfloat        *Px) {};
+  void prolongate(occa::memory o_x, occa::memory o_Px);
+
+  //smoother ops
+  void smooth(dfloat        *rhs, dfloat        *x, bool x_is_zero) {};
+  void smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero);
+
+  void smoother(occa::memory o_x, occa::memory o_Sx);
+
+  void smoothRichardson(occa::memory &o_r, occa::memory &o_x, bool xIsZero);
+  void smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZero);
+
+  void smootherLocalPatch(occa::memory &o_r, occa::memory &o_Sr);
+  void smootherJacobi    (occa::memory &o_r, occa::memory &o_Sr);
+
+  void Report();
+
+  void setupSmoother();
+  dfloat maxEigSmoothAx();
+
+  void buildCoarsenerTriTet(mesh_t **meshLevels, int Nf, int Nc);
+  void buildCoarsenerQuadHex(mesh_t **meshLevels, int Nf, int Nc);
+};
+
+void MGLevelAllocateStorage(MGLevel *level, int k, parAlmond::CycleType ctype);
+
+#endif
\ No newline at end of file
diff --git a/solvers/elliptic/ellipticPrecon.h b/solvers/elliptic/ellipticPrecon.h
index 8a6b6e8bf..18f0735e7 100644
--- a/solvers/elliptic/ellipticPrecon.h
+++ b/solvers/elliptic/ellipticPrecon.h
@@ -43,6 +43,9 @@ typedef struct {
   dfloat *zP;
   occa::memory o_zP;
 
+  dfloat *xG, *rhsG;
+  occa::memory o_xG, o_rhsG;
+
   occa::memory o_Gr;
   occa::memory o_Gz;
   occa::memory o_Sr;
@@ -109,7 +112,7 @@ typedef struct {
   dfloat *B, *tmp2;
   occa::memory *o_B, o_tmp2;
   void *xxt2;
-  parAlmond_t *parAlmond;
+  parAlmond::solver_t *parAlmond;
 
   // block Jacobi precon
   occa::memory o_invMM;
@@ -117,24 +120,10 @@ typedef struct {
   occa::kernel partialblockJacobiKernel;
 
   //dummy almond level to store the OAS smoothing op
-  agmgLevel *OASLevel;
-  void **OASsmoothArgs;
+  // agmgLevel *OASLevel;
+  // void **OASsmoothArgs;
 
   //SEMFEM variables
   mesh_t *femMesh;
 
 } precon_t;
-
-
-//Multigrid function callbacks
-void AxTri2D        (void **args, occa::memory &o_x, occa::memory &o_Ax);
-void coarsenTri2D   (void **args, occa::memory &o_x, occa::memory &o_Rx);
-void prolongateTri2D(void **args, occa::memory &o_x, occa::memory &o_Px);
-void ellipticGather (void **args, occa::memory &o_x, occa::memory &o_Gx);
-void ellipticScatter(void **args, occa::memory &o_x, occa::memory &o_Sx);
-void ellipticMultigridSmooth         (void **args, occa::memory &o_r, occa::memory &o_x, bool xIsZero);
-void ellipticMultigridSmoothChebyshev(void **args, occa::memory &o_r, occa::memory &o_x, bool xIsZero);
-
-//smoother ops
-void LocalPatch  (void **args, occa::memory &o_r, occa::memory &o_Sr);
-void dampedJacobi(void **args, occa::memory &o_r, occa::memory &o_Sr);
\ No newline at end of file
diff --git a/solvers/elliptic/makefile b/solvers/elliptic/makefile
index 531dcac9b..e59fd1190 100644
--- a/solvers/elliptic/makefile
+++ b/solvers/elliptic/makefile
@@ -11,7 +11,7 @@ include ${OCCA_DIR}/scripts/Makefile
 HDRDIR = ../../include
 GSDIR  = ../../3rdParty/gslib
 OGSDIR  = ../../libs/gatherScatter
-ALMONDDIR = ../parALMOND
+ALMONDDIR = ../../libs/parAlmond
 
 # set options for this machine
 # specify which compilers to use for c, fortran and linking
@@ -20,25 +20,22 @@ CC	= mpic++
 LD	= mpic++
 
 # compiler flags to be used (set to compile with debugging on)
-CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -g  -D DHOLMES='"${CURDIR}/../.."' -D DELLIPTIC='"${CURDIR}"'
-
+CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -I$(ALMONDDIR) -g  -D DHOLMES='"${CURDIR}/../.."' -D DELLIPTIC='"${CURDIR}"'
 
 # link flags to be used
-LDFLAGS	= -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g 
+LDFLAGS	= -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g
 
 # libraries to be linked in
-LIBS	=   -L$(ALMONDDIR) -lparALMOND  -L$(OGSDIR) -logs -L$(GSDIR)/lib -lgs \
+LIBS	=   -L$(ALMONDDIR) -lparAlmond  -L$(OGSDIR) -logs -L$(GSDIR)/lib -lgs \
 			-L$(OCCA_DIR)/lib  $(links) -L../../3rdParty/BlasLapack -lBlasLapack -lgfortran
 
-#-llapack -lblas
-
 INCLUDES = elliptic.h ellipticPrecon.h
 DEPS = $(INCLUDES) \
 $(HDRDIR)/mesh.h \
 $(HDRDIR)/mesh2D.h \
 $(HDRDIR)/mesh3D.h \
-$(HDRDIR)/ogs_t.h \
-$(ALMONDDIR)/parALMOND.h \
+$(OGSDIR)/ogs.hpp \
+$(ALMONDDIR)/parAlmond.hpp \
 
 # types of files we are going to construct rules for
 .SUFFIXES: .c
@@ -57,17 +54,17 @@ AOBJS    = \
 ./src/ellipticBuildLocalPatches.o \
 ./src/ellipticBuildMultigridLevel.o \
 ./src/ellipticHaloExchange.o\
-./src/ellipticMultiGridSetup.o \
 ./src/ellipticOperator.o \
 ./src/ellipticPreconditioner.o\
 ./src/ellipticPreconditionerSetup.o\
-./src/ellipticSEMFEMSetup.o\
 ./src/ellipticSetup.o \
-./src/ellipticSmoother.o \
-./src/ellipticSmootherSetup.o \
 ./src/ellipticSolve.o\
 ./src/ellipticSolveSetup.o\
 ./src/ellipticVectors.o \
+./src/ellipticSEMFEMSetup.o\
+./src/ellipticMultiGridSetup.o \
+./src/ellipticMultiGridLevel.o \
+./src/ellipticMultiGridLevelSetup.o \
 
 # library objects
 LOBJS = \
@@ -127,7 +124,7 @@ LOBJS = \
 ../../src/occaHostMallocPinned.o \
 ../../src/timer.o
 
-ellipticMain:$(AOBJS) $(LOBJS) ./src/ellipticMain.o libblas libogs libparALMOND
+ellipticMain:$(AOBJS) $(LOBJS) ./src/ellipticMain.o libblas libogs libparAlmond
 	$(LD)  $(LDFLAGS)  -o ellipticMain ./src/ellipticMain.o $(COBJS) $(AOBJS) $(LOBJS) $(paths) $(LIBS)
 
 lib:$(AOBJS)
@@ -139,14 +136,14 @@ libogs:
 libblas:
 	cd ../../3rdParty/BlasLapack; make -j lib; cd ../../solvers/elliptic
 
-libparALMOND:
-	cd ../parALMOND; make -j lib; cd ../elliptic
+libparAlmond:
+	cd ../../libs/parAlmond; make -j lib; cd ../../solvers/elliptic
 
 all: lib ellipticMain
 
 # what to do if user types "make clean"
 clean:
-	cd ../parALMOND; make clean; cd ../elliptic
+	cd ../../libs/parAlmond; make clean; cd ../../solvers/elliptic
 	cd ../../src; rm *.o; cd ../solvers/elliptic
 	cd ../../libs/gatherScatter; make clean; cd ../../solvers/elliptic
 	rm src/*.o ellipticMain libelliptic.a
@@ -154,7 +151,7 @@ clean:
 realclean:
 	cd ../../3rdParty/BlasLapack; make clean; cd ../../solvers/elliptic
 	cd ../../libs/gatherScatter; make realclean; cd ../../solvers/elliptic
-	cd ../parALMOND; make clean; cd ../elliptic
+	cd ../../libs/parAlmond; make clean; cd ../../solvers/elliptic
 	cd ../../src; rm *.o; cd ../solvers/elliptic
 	rm src/*.o ellipticMain libelliptic.a
 
diff --git a/solvers/elliptic/setups/setupTri2D.rc b/solvers/elliptic/setups/setupTri2D.rc
index 8c0c2eaa1..293f19fc0 100644
--- a/solvers/elliptic/setups/setupTri2D.rc
+++ b/solvers/elliptic/setups/setupTri2D.rc
@@ -1,6 +1,11 @@
 [FORMAT]
 1.0
 
+[BENCHMARK]
+SOLVE
+#NONE
+#BP5
+
 [DATA FILE]
 data/ellipticHomogeneous2D.h
 
@@ -14,10 +19,10 @@ data/ellipticHomogeneous2D.h
 3
 
 [POLYNOMIAL DEGREE]
-4
+6
 
 [THREAD MODEL]
-Serial
+CUDA
 
 [PLATFORM NUMBER]
 0
diff --git a/solvers/elliptic/src/ellipticMain.c b/solvers/elliptic/src/ellipticMain.c
index 64d0f5be0..9a1c375e5 100644
--- a/solvers/elliptic/src/ellipticMain.c
+++ b/solvers/elliptic/src/ellipticMain.c
@@ -40,7 +40,7 @@ int main(int argc, char **argv){
 
   // if argv > 2 then should load input data from argv
   setupAide options(argv[1]);
-  
+
   // set up mesh stuff
   string fileName;
   int N, dim, elementType;
@@ -65,7 +65,7 @@ int main(int argc, char **argv){
 
   if(mesh->Nelements<10)
     meshPrint3D(mesh);
-  
+
   // parameter for elliptic problem (-laplacian + lambda)*q = f
   dfloat lambda;
   options.getArgs("LAMBDA", lambda);
@@ -81,12 +81,12 @@ int main(int argc, char **argv){
 
   if(options.compareArgs("BENCHMARK", "BK5") ||
      options.compareArgs("BENCHMARK", "BP5")){
-    
+
     // test Ax throughput
     occa::streamTag startAx = mesh->device.tagStream();
-    
+
     int NAx = 1;
-    
+
     for(int it=0;it<NAx;++it){
       // include gather-scatter
       if(options.compareArgs("BENCHMARK", "BP5"))
@@ -94,60 +94,62 @@ int main(int argc, char **argv){
 
       if(options.compareArgs("BENCHMARK", "BK5")){
         if(!options.compareArgs("ELEMENT MAP", "TRILINEAR")){
-          elliptic->partialAxKernel(mesh->NlocalGatherElements,                           
+          elliptic->partialAxKernel(mesh->NlocalGatherElements,
                                     mesh->o_localGatherElementList,
                                     mesh->o_ggeo, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM,
                                     lambda, elliptic->o_x, elliptic->o_Ax);
         }
         else{
-          elliptic->partialAxKernel(mesh->NlocalGatherElements,                           
+          elliptic->partialAxKernel(mesh->NlocalGatherElements,
                                     mesh->o_localGatherElementList,
                                     elliptic->o_EXYZ, elliptic->o_gllzw, mesh->o_Dmatrices, mesh->o_Smatrices, mesh->o_MM,
                                     lambda, elliptic->o_x, elliptic->o_Ax);
         }
       }
     }
-      
+
     occa::streamTag stopAx = mesh->device.tagStream();
-      
+
     mesh->device.finish();
-      
+
     double elapsedAx = mesh->device.timeBetween(startAx, stopAx);
     elapsedAx /= NAx;
-      
-      
-    printf("%d, %d, %g, %d, %g, %g; \%\%elemental: N, dofs, elapsed, dummy, time per node, nodes/time %s\n",
+
+
+    if (mesh->rank==0)
+      printf("%d, %d, %g, %d, %g, %g; \%\%elemental: N, dofs, elapsed, dummy, time per node, nodes/time %s\n",
            mesh->N,
            mesh->NlocalGatherElements*mesh->Np,
-           0,
            elapsedAx,
+           0,
            elapsedAx/(mesh->Np*mesh->Nelements),
            mesh->Nelements*mesh->Np/elapsedAx,
-           options.getArgs("DISCRETIZATION").c_str());
-      
+           (char*) options.getArgs("DISCRETIZATION").c_str());
+
   }
   else{
-    
+
     // convergence tolerance
     dfloat tol = 1e-8;
-  
+
     occa::streamTag startTag = mesh->device.tagStream();
-  
+
     int it = ellipticSolve(elliptic, lambda, tol, elliptic->o_r, elliptic->o_x);
 
     occa::streamTag stopTag = mesh->device.tagStream();
     mesh->device.finish();
-  
+
     double elapsed = mesh->device.timeBetween(startTag, stopTag);
 
-    printf("%d, %d, %g, %d, %g, %g; \%\%global: N, dofs, elapsed, iterations, time per node, nodes*iterations/time %s\n",
+    if (mesh->rank==0)
+      printf("%d, %d, %g, %d, %g, %g; \%\%global: N, dofs, elapsed, iterations, time per node, nodes*iterations/time %s\n",
            mesh->N,
            mesh->Nelements*mesh->Np,
            elapsed,
            it,
            elapsed/(mesh->Np*mesh->Nelements),
            mesh->Nelements*(it*mesh->Np/elapsed),
-           options.getArgs("PRECONDITIONER").c_str());
+           (char*) options.getArgs("PRECONDITIONER").c_str());
 
     if(options.compareArgs("DISCRETIZATION","CONTINUOUS")){
       dfloat zero = 0.;
@@ -159,13 +161,13 @@ int main(int argc, char **argv){
                             elliptic->o_mapB,
                             elliptic->o_x);
     }
-      
+
     // copy solution from DEVICE to HOST
     elliptic->o_x.copyTo(mesh->q);
-      
+
     if (options.compareArgs("BASIS","BERN"))
       meshApplyElementMatrix(mesh,mesh->VB,mesh->q,mesh->q);
-      
+
     dfloat maxError = 0;
     for(dlong e=0;e<mesh->Nelements;++e){
       for(int n=0;n<mesh->Np;++n){
@@ -173,23 +175,23 @@ int main(int argc, char **argv){
         dfloat xn = mesh->x[id];
         dfloat yn = mesh->y[id];
         dfloat zn = mesh->z[id];
-      
+
         dfloat exact;
         if (elliptic->dim==2)
           exact = sin(M_PI*xn)*sin(M_PI*yn);
-        else 
+        else
           exact = cos(M_PI*xn)*cos(M_PI*yn)*cos(M_PI*zn);
         dfloat error = fabs(exact-mesh->q[id]);
-          
+
         maxError = mymax(maxError, error);
       }
     }
-      
+
     dfloat globalMaxError = 0;
     MPI_Allreduce(&maxError, &globalMaxError, 1, MPI_DFLOAT, MPI_MAX, mesh->comm);
     if(mesh->rank==0)
-      fprintf(stderr,"globalMaxError = %g\n", globalMaxError);
-      
+      printf("globalMaxError = %g\n", globalMaxError);
+
 #if 0
     char fname[BUFSIZ];
     string outName;
@@ -197,7 +199,7 @@ int main(int argc, char **argv){
     sprintf(fname, "%s_%04d.vtu",(char*)outName.c_str(), rank);
     if(elliptic->dim==3)
       meshPlotVTU3D(mesh, fname, 0);
-    else 
+    else
       meshPlotVTU2D(mesh, fname, 0);
 #endif
   }
@@ -217,10 +219,9 @@ int main(int argc, char **argv){
     ellipticPlotVTUHex3D(mesh, "bah", 0);
   }
 #endif
-  
+
   // close down MPI
   MPI_Finalize();
-  
-  exit(0);
+
   return 0;
 }
diff --git a/solvers/elliptic/src/ellipticMultiGridLevel.c b/solvers/elliptic/src/ellipticMultiGridLevel.c
new file mode 100644
index 000000000..6838a1318
--- /dev/null
+++ b/solvers/elliptic/src/ellipticMultiGridLevel.c
@@ -0,0 +1,164 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "elliptic.h"
+
+void MGLevel::Ax(occa::memory o_x, occa::memory o_Ax) { // apply this level's matrix-free operator via ellipticOperator (input o_x, output o_Ax)
+  ellipticOperator(elliptic,lambda,
+                    o_x,o_Ax, dfloatString); // "float" ); // hard coded for testing (should make an option)
+}
+
+void MGLevel::residual(occa::memory o_rhs, occa::memory o_x, occa::memory o_res) { // residual of the level problem: o_res = o_rhs - A*o_x
+  ellipticOperator(elliptic,lambda,
+                    o_x,o_res, dfloatString); // "float" ); // hard coded for testing (should make an option)
+
+  // o_res holds A*x at this point; overwrite it with r = b - A*x
+  ellipticScaledAdd(elliptic, 1.f, o_rhs, -1.f, o_res);
+}
+
+void MGLevel::coarsen(occa::memory o_x, occa::memory o_Rx) { // restrict the finer-level vector o_x into this level's vector o_Rx
+  if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) // NOTE: scales o_x in place by the finer level's inverse degree (NpF, o_invDegree are set in the constructor)
+    elliptic->dotMultiplyKernel(mesh->Nelements*NpF, o_invDegree, o_x, o_x);
+
+  elliptic->precon->coarsenKernel(mesh->Nelements, o_R, o_x, o_Rx); // element-local restriction with o_R
+
+  if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) { // continuous case: sum shared nodes, then apply the mask ids
+    ogsGatherScatter(o_Rx, ogsDfloat, ogsAdd, elliptic->ogs);
+    if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Rx);
+  }
+}
+
+void MGLevel::prolongate(occa::memory o_x, occa::memory o_Px) { // interpolate this level's vector o_x to the finer level (result in o_Px)
+  elliptic->precon->prolongateKernel(mesh->Nelements, o_R, o_x, o_Px); // reuses the restriction matrix o_R; presumably the kernel applies its transpose - TODO confirm
+}
+
+void MGLevel::smooth(occa::memory o_rhs, occa::memory o_x, bool x_is_zero) { // one smoothing pass on o_x, dispatched on the iteration type stype
+  if (stype==RICHARDSON) {
+    this->smoothRichardson(o_rhs, o_x, x_is_zero);
+  } else if (stype==CHEBYSHEV) {
+    this->smoothChebyshev(o_rhs, o_x, x_is_zero);
+  } // any other stype value: o_x is left untouched
+}
+
+void MGLevel::smoother(occa::memory o_x, occa::memory o_Sx) { // apply the smoothing operator S (o_Sx = S*o_x), dispatched on smtype
+  if (smtype==JACOBI) {
+    this->smootherJacobi(o_x, o_Sx);
+  } else if (smtype==LOCALPATCH) {
+    this->smootherLocalPatch(o_x, o_Sx);
+  } // any other smtype value: o_Sx is left untouched
+}
+
+void MGLevel::smoothRichardson(occa::memory &o_r, occa::memory &o_x, bool xIsZero) { // one Richardson step: x = x + S*(r - A*x)
+
+  occa::memory o_res = o_smootherResidual; // scratch buffer (static member of MGLevel)
+
+  if (xIsZero) { // with x == 0 the update reduces to x = S*r, so the A*x product is skipped
+    this->smoother(o_r, o_x);
+    return;
+  }
+
+  dfloat one = 1.; dfloat mone = -1.;
+
+  //res = r-Ax
+  this->Ax(o_x,o_res);
+  elliptic->scaledAddKernel(Nrows, one, o_r, mone, o_res);
+
+  //smooth the fine problem x = x + S(r-Ax)
+  this->smoother(o_res, o_res); // applied in place on the scratch buffer
+  elliptic->scaledAddKernel(Nrows, one, o_res, one, o_x);
+}
+
+void MGLevel::smoothChebyshev (occa::memory &o_r, occa::memory &o_x, bool xIsZero) { // Chebyshev-accelerated smoothing of o_x for right-hand side o_r; xIsZero means o_x must be treated as zero and not read
+
+  const dfloat theta = 0.5*(lambda1+lambda0); // midpoint of [lambda0, lambda1] (bounds chosen in setupSmoother)
+  const dfloat delta = 0.5*(lambda1-lambda0); // half-width of [lambda0, lambda1]
+  const dfloat invTheta = 1.0/theta;
+  const dfloat sigma = theta/delta;
+  dfloat rho_n = 1./sigma;
+  dfloat rho_np1;
+
+  dfloat one = 1., mone = -1., zero = 0.0;
+
+  occa::memory o_res = o_smootherResidual; // scratch: smoothed residual
+  occa::memory o_Ad  = o_smootherResidual2; // scratch: S*A*d
+  occa::memory o_d   = o_smootherUpdate; // scratch: current update direction
+
+  if(xIsZero){ //skip the Ax if x is zero
+    //res = Sr
+    this->smoother(o_r, o_res);
+
+    //d = invTheta*res
+    elliptic->scaledAddKernel(Nrows, invTheta, o_res, zero, o_d);
+  } else {
+    //res = S(r-Ax)
+    this->Ax(o_x,o_res);
+    elliptic->scaledAddKernel(Nrows, one, o_r, mone, o_res);
+    this->smoother(o_res, o_res);
+
+    //d = invTheta*res
+    elliptic->scaledAddKernel(Nrows, invTheta, o_res, zero, o_d);
+  }
+
+  for (int k=0;k<ChebyshevIterations;k++) {
+    //x_k+1 = x_k + d_k
+    if (xIsZero&&(k==0)) // first update overwrites x so its stale contents are never read
+      elliptic->scaledAddKernel(Nrows, one, o_d, zero, o_x);
+    else
+      elliptic->scaledAddKernel(Nrows, one, o_d, one, o_x);
+
+    //r_k+1 = r_k - SAd_k
+    this->Ax(o_d,o_Ad);
+    this->smoother(o_Ad, o_Ad);
+    elliptic->scaledAddKernel(Nrows, mone, o_Ad, one, o_res);
+
+    rho_np1 = 1.0/(2.*sigma-rho_n);
+    dfloat rhoDivDelta = 2.0*rho_np1/delta;
+
+    //d_k+1 = rho_k+1*rho_k*d_k  + 2*rho_k+1*r_k+1/delta
+    elliptic->scaledAddKernel(Nrows, rhoDivDelta, o_res, rho_np1*rho_n, o_d);
+
+    rho_n = rho_np1;
+  }
+  //x_k+1 = x_k + d_k (if x is flagged zero and the loop never ran, overwrite x instead of accumulating into stale data)
+  elliptic->scaledAddKernel(Nrows, one, o_d, (xIsZero&&ChebyshevIterations==0) ? zero : one, o_x);
+}
+
+void MGLevel::smootherLocalPatch(occa::memory &o_r, occa::memory &o_Sr) { // local-patch smoother: o_Sr = S*o_r using the patch data built by setupSmoother
+
+  // occaTimerTic(mesh->device,"approxBlockJacobiSolveKernel");
+  elliptic->precon->approxBlockJacobiSolverKernel(mesh->Nelements,
+                            o_patchesIndex, // this level's buffers: setupSmoother fills these members, not the precon copies
+                            o_invAP,
+                            o_invDegreeAP, // carries the Richardson damping weight written in setupSmoother
+                            o_r,
+                            o_Sr);
+  // occaTimerToc(mesh->device,"approxBlockJacobiSolveKernel");
+}
+
+void MGLevel::smootherJacobi(occa::memory &o_r, occa::memory &o_Sr) { // Jacobi smoother: o_Sr = o_invDiagA .* o_r (entrywise)
+  elliptic->dotMultiplyKernel(mesh->Np*mesh->Nelements,o_invDiagA,o_r,o_Sr); // o_invDiagA is built in setupSmoother (already damped in the Richardson case)
+}
+
diff --git a/solvers/elliptic/src/ellipticMultiGridLevelSetup.c b/solvers/elliptic/src/ellipticMultiGridLevelSetup.c
new file mode 100644
index 000000000..183a2b87a
--- /dev/null
+++ b/solvers/elliptic/src/ellipticMultiGridLevelSetup.c
@@ -0,0 +1,453 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "elliptic.h"
+
+size_t  MGLevel::smootherResidualBytes; // definitions of MGLevel's static scratch storage; not allocated in this file (presumably by MGLevelAllocateStorage - TODO confirm)
+dfloat* MGLevel::smootherResidual; // host-side scratch pointer
+occa::memory MGLevel::o_smootherResidual; // residual scratch used by smoothRichardson/smoothChebyshev
+occa::memory MGLevel::o_smootherResidual2; // S*A*d scratch used by smoothChebyshev
+occa::memory MGLevel::o_smootherUpdate; // update-direction scratch used by smoothChebyshev
+
+//build a single level
+MGLevel::MGLevel(elliptic_t *ellipticBase, dfloat lambda_, int Nc, // level wrapping ellipticBase itself; no coarsening operator is built here
+                setupAide options_, parAlmond::KrylovType ktype_, MPI_Comm comm_):
+  multigridLevel(ellipticBase->mesh->Nelements*ellipticBase->mesh->Np, // local dof count (presumably Nrows - TODO confirm against multigridLevel)
+                (ellipticBase->mesh->Nelements+ellipticBase->mesh->totalHaloPairs)*ellipticBase->mesh->Np, // local + halo dof count (presumably Ncols)
+                ktype_,
+                comm_)   {
+
+  elliptic = ellipticBase;
+  mesh = elliptic->mesh;
+  options = options_;
+  lambda = lambda_;
+  degree = Nc; // polynomial degree of this level (printed by Report)
+  weighted = false;
+
+  //use weighted inner products
+  if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {
+    weighted = true;
+    o_weight = elliptic->o_invDegree;
+    weight   = elliptic->invDegree;
+  }
+
+  this->setupSmoother(); // needs elliptic, mesh, options and lambda, all assigned above
+}
+
+//build a level and connect it to the previous one
+MGLevel::MGLevel(elliptic_t *ellipticBase, //finest level (not referenced in this constructor body)
+                 mesh_t **meshLevels, // reference meshes, indexed by polynomial degree in the coarsener builders
+                 elliptic_t *ellipticFine, //previous level
+                 elliptic_t *ellipticCoarse, //current level
+                 dfloat lambda_,
+                 int Nf, int Nc, // degree of the previous (finer) level and of this level
+                 setupAide options_,
+                 parAlmond::KrylovType ktype_,
+                 MPI_Comm comm_):
+  multigridLevel(ellipticCoarse->mesh->Nelements*ellipticCoarse->mesh->Np, // local dof count (presumably Nrows - TODO confirm against multigridLevel)
+                (ellipticCoarse->mesh->Nelements+ellipticCoarse->mesh->totalHaloPairs)*ellipticCoarse->mesh->Np, // local + halo dof count (presumably Ncols)
+                ktype_,
+                comm_)   {
+
+  elliptic = ellipticCoarse;
+  mesh = elliptic->mesh;
+  options = options_;
+  lambda = lambda_;
+  degree = Nc;
+  weighted = false;
+
+  //use weighted inner products
+  if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {
+    weighted = true;
+    o_weight = elliptic->o_invDegree;
+    weight   = elliptic->invDegree;
+
+    NpF = ellipticFine->mesh->Np; // finer-level node count and inverse degree, used by coarsen() to pre-scale its input
+    o_invDegree = ellipticFine->ogs->o_invDegree;
+  }
+
+  this->setupSmoother();
+
+  /* build coarsening and prolongation operators to connect levels */
+  if (elliptic->elementType==TRIANGLES||elliptic->elementType==TETRAHEDRA){
+    this->buildCoarsenerTriTet(meshLevels, Nf, Nc);
+  } else {
+    this->buildCoarsenerQuadHex(meshLevels, Nf, Nc);
+  }
+}
+
+void MGLevel::setupSmoother() { // choose smtype/stype from the "MULTIGRID SMOOTHER" option and build the device data the smoother needs
+
+  //set up the fine problem smoothing
+  if(options.compareArgs("MULTIGRID SMOOTHER","LOCALPATCH")){
+    smtype = LOCALPATCH;
+
+    dfloat *invAP;
+    dlong Npatches;
+    dlong *patchesIndex;
+
+    dfloat rateTolerance;    // 0 - accept no approximate patches, 1 - accept all approximate patches
+    if(options.compareArgs("MULTIGRID SMOOTHER","EXACT")){
+      rateTolerance = 0.0;
+    } else {
+      rateTolerance = 1.0;
+    }
+
+    //initialize the full inverse operators on each 4 element patch
+    ellipticBuildLocalPatches(elliptic, lambda, rateTolerance, &Npatches, &patchesIndex, &invAP);
+
+    o_invAP = mesh->device.malloc(Npatches*mesh->Np*mesh->Np*sizeof(dfloat),invAP);
+    o_patchesIndex = mesh->device.malloc(mesh->Nelements*sizeof(dlong), patchesIndex);
+
+    dfloat *invDegree = (dfloat*) calloc(mesh->Nelements,sizeof(dfloat)); // per-element weight, starts at 1
+    for (dlong e=0;e<mesh->Nelements;e++) invDegree[e] = 1.0;
+
+    o_invDegreeAP = mesh->device.malloc(mesh->Nelements*sizeof(dfloat),invDegree);
+
+    if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) {
+      stype = CHEBYSHEV;
+
+      if (!options.getArgs("MULTIGRID CHEBYSHEV DEGREE", ChebyshevIterations))
+        ChebyshevIterations = 2; //default to degree 2
+
+      //estimate the max eigenvalue of S*A
+      dfloat rho = this->maxEigSmoothAx();
+
+      lambda1 = rho; // Chebyshev interval: [rho/10, rho]
+      lambda0 = rho/10.;
+    } else {
+      stype = RICHARDSON;
+
+      //estimate the max eigenvalue of S*A
+      dfloat rho = this->maxEigSmoothAx();
+
+      //set the stability weight (jacobi-type iteration)
+      lambda0 = (4./3.)/rho;
+
+      for (dlong n=0;n<mesh->Nelements;n++)
+        invDegree[n] *= lambda0;
+
+      //update diagonal with weight
+      o_invDegreeAP.copyFrom(invDegree);
+    }
+    free(invDegree); free(invAP); free(patchesIndex); // host copies are no longer needed once uploaded
+
+  } else { //default to damped jacobi (taken for DAMPEDJACOBI and for any unrecognised smoother name, so smtype/stype are always set)
+    smtype = JACOBI;
+    dfloat *invDiagA;
+    ellipticBuildJacobi(elliptic,lambda, &invDiagA);
+
+    o_invDiagA = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat), invDiagA);
+
+    if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) {
+      stype = CHEBYSHEV;
+
+      if (!options.getArgs("MULTIGRID CHEBYSHEV DEGREE", ChebyshevIterations))
+        ChebyshevIterations = 2; //default to degree 2
+
+      //estimate the max eigenvalue of S*A
+      dfloat rho = this->maxEigSmoothAx();
+
+      lambda1 = rho; // Chebyshev interval: [rho/10, rho]
+      lambda0 = rho/10.;
+    } else {
+      stype = RICHARDSON;
+
+      //estimate the max eigenvalue of S*A
+      dfloat rho = this->maxEigSmoothAx();
+
+      //set the stability weight (jacobi-type iteration)
+      lambda0 = (4./3.)/rho;
+
+      for (dlong n=0;n<mesh->Np*mesh->Nelements;n++)
+        invDiagA[n] *= lambda0;
+
+      //update diagonal with weight
+      o_invDiagA.copyFrom(invDiagA);
+    }
+    free(invDiagA);
+  }
+}
+
+void MGLevel::Report() { // collective over mesh->comm: rank 0 prints this level's min/max/avg row count, degree and smoother name
+
+  hlong hNrows = (hlong) Nrows;
+
+  dlong minNrows=0, maxNrows=0, localNrows=Nrows; // localNrows: scratch copy so the member Nrows is never overwritten
+  hlong totalNrows=0;
+  dfloat avgNrows;
+
+  MPI_Allreduce(&localNrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, mesh->comm);
+  MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, mesh->comm);
+  avgNrows = (dfloat) totalNrows/mesh->size;
+
+  if (localNrows==0) localNrows=maxNrows; //set this so it's ignored for the global min
+  MPI_Allreduce(&localNrows, &minNrows, 1, MPI_DLONG, MPI_MIN, mesh->comm);
+
+  char smootherString[BUFSIZ] = ""; // stays empty if no stype/smtype combination below matches
+  if (stype==RICHARDSON&&smtype==JACOBI)
+    strcpy(smootherString, "Damped Jacobi   ");
+  else if (stype==CHEBYSHEV&&smtype==JACOBI)
+    strcpy(smootherString, "Chebyshev       ");
+  else if (stype==RICHARDSON&&smtype==LOCALPATCH)
+    strcpy(smootherString, "Local Patch     ");
+  else if (stype==CHEBYSHEV&&smtype==LOCALPATCH) // was a duplicate of the RICHARDSON test, which made this label unreachable
+    strcpy(smootherString, "Local Patch+Cheb");
+
+  if (mesh->rank==0){
+    printf(     "|    pMG     |    %10d  |   Matrix-free   |   %s|\n",minNrows, smootherString);
+    printf("     |            |    %10d  |     Degree %2d   |                   |\n", maxNrows, degree);
+    printf("     |            |    %10d  |                 |                   |\n", (int) avgNrows);
+  }
+}
+
+
+void MGLevel::buildCoarsenerTriTet(mesh_t **meshLevels, int Nf, int Nc) { // build the element restriction matrix R (host) and o_R (device) from degree Nf down to degree Nc
+
+  int NpFine   = meshLevels[Nf]->Np; // meshLevels is indexed by polynomial degree
+  int NpCoarse = meshLevels[Nc]->Np;
+  dfloat *P    = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); // interpolation matrix, row-major with NpCoarse columns
+  dfloat *Ptmp = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); // copy of the previous P while it is overwritten
+
+  //initialize P as identity (which it is for SPARSE)
+  for (int i=0;i<NpCoarse;i++) P[i*NpCoarse+i] = 1.0;
+
+  for (int n=Nc;n<Nf;n++) { // compose the degree-raising operators one degree at a time: P <- interpRaise(n -> n+1) * P
+    int Npp1 = meshLevels[n+1]->Np;
+    int Np   = meshLevels[n  ]->Np;
+
+    //copy P
+    for (int i=0;i<Np*NpCoarse;i++) Ptmp[i] = P[i];
+
+    //Multiply by the raise op
+    for (int i=0;i<Npp1;i++) {
+      for (int j=0;j<NpCoarse;j++) {
+        P[i*NpCoarse + j] = 0.;
+        for (int k=0;k<Np;k++) {
+          P[i*NpCoarse + j] += meshLevels[n]->interpRaise[i*Np+k]*Ptmp[k*NpCoarse + j];
+        }
+      }
+    }
+  }
+
+  if (elliptic->options.compareArgs("BASIS","BERN")) { // Bernstein basis: replace P by invVB(fine) * P * VB(coarse)
+    dfloat* BBP = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat));
+    for (int j=0;j<NpFine;j++) {
+      for (int i=0;i<NpCoarse;i++) {
+        for (int k=0;k<NpCoarse;k++) {
+          for (int l=0;l<NpFine;l++) {
+            BBP[i+j*NpCoarse] += meshLevels[Nf]->invVB[l+j*NpFine]*P[k+l*NpCoarse]*meshLevels[Nc]->VB[i+k*NpCoarse];
+          }
+        }
+      }
+    }
+    for (int j=0;j<NpFine;j++) {
+      for (int i=0;i<NpCoarse;i++) {
+        P[i+j*NpCoarse] = BBP[i+j*NpCoarse];
+      }
+    }
+    free(BBP);
+  }
+
+  //the coarsen matrix is P^T
+  R = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat)); // stored in the member R; not freed in this function
+  for (int i=0;i<NpCoarse;i++) {
+    for (int j=0;j<NpFine;j++) {
+      R[i*NpFine+j] = P[j*NpCoarse+i];
+    }
+  }
+  o_R = elliptic->mesh->device.malloc(NpFine*NpCoarse*sizeof(dfloat), R);
+
+  free(P); free(Ptmp);
+}
+
+void MGLevel::buildCoarsenerQuadHex(mesh_t **meshLevels, int Nf, int Nc) { // build the (Nc+1) x (Nf+1) restriction matrix R (host) and o_R (device) for quad/hex elements
+
+  int NqFine   = Nf+1; // matrix dimensions are degree+1 (presumably the 1D node count, applied per direction by the kernels - TODO confirm)
+  int NqCoarse = Nc+1;
+  dfloat *P    = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat)); // interpolation matrix, row-major with NqCoarse columns
+  dfloat *Ptmp = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat)); // copy of the previous P while it is overwritten
+
+  //initialize P as identity
+  for (int i=0;i<NqCoarse;i++) P[i*NqCoarse+i] = 1.0;
+
+  for (int n=Nc;n<Nf;n++) { // compose the degree-raising operators one degree at a time: P <- interpRaise(n -> n+1) * P
+
+    int Nqp1 = n+2;
+    int Nq   = n+1;
+
+    //copy P
+    for (int i=0;i<Nq*NqCoarse;i++) Ptmp[i] = P[i];
+
+    //Multiply by the raise op
+    for (int i=0;i<Nqp1;i++) {
+      for (int j=0;j<NqCoarse;j++) {
+        P[i*NqCoarse + j] = 0.;
+        for (int k=0;k<Nq;k++) {
+          P[i*NqCoarse + j] += meshLevels[n]->interpRaise[i*Nq+k]*Ptmp[k*NqCoarse + j];
+        }
+      }
+    }
+  }
+
+  //the coarsen matrix is P^T
+  R = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat)); // stored in the member R; not freed in this function
+  for (int i=0;i<NqCoarse;i++) {
+    for (int j=0;j<NqFine;j++) {
+      R[i*NqFine+j] = P[j*NqCoarse+i];
+    }
+  }
+  o_R = elliptic->mesh->device.malloc(NqFine*NqCoarse*sizeof(dfloat), R);
+
+  free(P); free(Ptmp);
+}
+
+
+static void eig(const int Nrows, double *A, double *WR, double *WI){ // eigenvalues of the Nrows x Nrows matrix A via LAPACK dgeev_: real parts in WR, imaginary parts in WI
+
+  int NB  = 256; // factor used only to size the workspace below
+  char JOBVL  = 'V'; // left and right eigenvectors are requested but discarded at the end of this function
+  char JOBVR  = 'V';
+  int     N = Nrows;
+  int   LDA = Nrows;
+  int  LWORK  = (NB+2)*N;
+
+  double *WORK  = new double[LWORK];
+  double *VL  = new double[Nrows*Nrows];
+  double *VR  = new double[Nrows*Nrows];
+
+  int INFO = -999;
+
+  dgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI, // per the LAPACK documentation, A is overwritten by this call
+    VL, &LDA, VR, &LDA, WORK, &LWORK, &INFO);
+
+
+  assert(INFO == 0); // the only error check; it compiles away when NDEBUG is defined
+
+  delete [] VL;
+  delete [] VR;
+  delete [] WORK;
+}
+
+dfloat MGLevel::maxEigSmoothAx(){ // estimate the largest eigenvalue modulus of S*A (smoother times operator) with a short Arnoldi iteration
+
+  const dlong N = Nrows;
+  const dlong M = Ncols;
+
+  int k = 10; // number of Arnoldi steps, capped below by the global row count
+
+  hlong Nlocal = (hlong) Nrows;
+  hlong Ntotal = 0;
+  MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, mesh->comm);
+  if(k > Ntotal) k = (int) Ntotal;
+
+  // do an arnoldi
+
+  // allocate memory for Hessenberg matrix
+  double *H = (double *) calloc(k*k,sizeof(double)); // k x k, entry (i,j) stored at H[i + j*k]
+
+  // allocate memory for basis
+  dfloat *Vx = (dfloat*) calloc(M, sizeof(dfloat));
+  occa::memory *o_V = (occa::memory *) calloc(k+1, sizeof(occa::memory)); // k+1 basis vectors; zero-filled storage, each entry assigned below
+
+  occa::memory o_Vx  = mesh->device.malloc(M*sizeof(dfloat),Vx);
+  occa::memory o_AVx = mesh->device.malloc(M*sizeof(dfloat),Vx);
+
+  for(int i=0; i<=k; i++)
+    o_V[i] = mesh->device.malloc(M*sizeof(dfloat),Vx);
+
+  // generate a random vector for initial basis vector
+  for (dlong i=0;i<N;i++) Vx[i] = (dfloat) drand48(); // only the first N of the M entries are randomised; the rest stay zero
+
+  //gather-scatter
+  if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {
+    ogsGatherScatter(Vx, ogsDfloat, ogsAdd, mesh->ogs);
+
+    for (dlong i=0;i<elliptic->Nmasked;i++) Vx[elliptic->maskIds[i]] = 0.;
+  }
+
+  o_Vx.copyFrom(Vx); //copy to device
+  dfloat norm_vo = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_Vx, o_Vx);
+  norm_vo = sqrt(norm_vo);
+
+  ellipticScaledAdd(elliptic, 1./norm_vo, o_Vx, 0. , o_V[0]); // v[0] = Vx/||Vx||
+
+  for(int j=0; j<k; j++){
+    // v[j+1] = invD*(A*v[j])
+    this->Ax(o_V[j],o_AVx);
+    this->smoother(o_AVx, o_V[j+1]);
+
+    // modified Gram-Schmidt
+    for(int i=0; i<=j; i++){
+      // H(i,j) = v[i]'*A*v[j]
+      dfloat hij = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_V[i], o_V[j+1]);
+
+      // v[j+1] = v[j+1] - hij*v[i]
+      ellipticScaledAdd(elliptic, -hij, o_V[i], 1., o_V[j+1]);
+
+      H[i + j*k] = (double) hij;
+    }
+
+    if(j+1 < k){ // the last column has no subdiagonal entry inside the k x k matrix
+      // v[j+1] = v[j+1]/||v[j+1]||
+      dfloat norm_vj = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_V[j+1], o_V[j+1]);
+      norm_vj = sqrt(norm_vj);
+      ellipticScaledAdd(elliptic, 1/norm_vj, o_V[j+1], 0., o_V[j+1]);
+
+      H[j+1+ j*k] = (double) norm_vj;
+    }
+  }
+
+  double *WR = (double *) calloc(k,sizeof(double)); // real parts of the eigenvalues of H
+  double *WI = (double *) calloc(k,sizeof(double)); // imaginary parts of the eigenvalues of H
+
+  eig(k, H, WR, WI);
+
+  double rho = 0.;
+
+  for(int i=0; i<k; i++){ // rho = largest eigenvalue modulus
+    double rho_i  = sqrt(WR[i]*WR[i] + WI[i]*WI[i]);
+
+    if(rho < rho_i) {
+      rho = rho_i;
+    }
+  }
+
+  // free memory
+  free(H);
+  free(WR);
+  free(WI);
+
+  free(Vx);
+  o_Vx.free();
+  o_AVx.free();
+  for(int i=0; i<=k; i++) o_V[i].free();
+  free((void*)o_V);
+
+  // if((mesh->rank==0)&&(options.compareArgs("VERBOSE","TRUE"))) printf("weight = %g \n", rho);
+
+  return rho;
+}
diff --git a/solvers/elliptic/src/ellipticMultiGridSetup.c b/solvers/elliptic/src/ellipticMultiGridSetup.c
index 72db6933c..7534a7018 100644
--- a/solvers/elliptic/src/ellipticMultiGridSetup.c
+++ b/solvers/elliptic/src/ellipticMultiGridSetup.c
@@ -26,74 +26,6 @@ SOFTWARE.
 
 #include "elliptic.h"
 
-void ellipticMultigridAx(void **args, occa::memory &o_x, occa::memory &o_Ax) {
-
-  elliptic_t *elliptic = (elliptic_t *) args[0];
-  dfloat *lambda = (dfloat *) args[1];
-
-  ellipticOperator(elliptic,*lambda,o_x,o_Ax, dfloatString); // "float" ); // hard coded for testing (should make an option)
-}
-
-void ellipticMultigridCoarsen(void **args, occa::memory &o_x, occa::memory &o_Rx) {
-
-  elliptic_t *elliptic = (elliptic_t *) args[0];
-  elliptic_t *Felliptic = (elliptic_t *) args[1];
-  setupAide options = elliptic->options;
-
-  mesh_t *mesh = elliptic->mesh;
-  mesh_t *Fmesh = Felliptic->mesh;
-  precon_t *precon = elliptic->precon;
-  occa::memory o_R = elliptic->o_R;
-
-  if (options.compareArgs("DISCRETIZATION","CONTINUOUS"))
-    Felliptic->dotMultiplyKernel(Fmesh->Nelements*Fmesh->Np, Fmesh->ogs->o_invDegree, o_x, o_x);
-
-  precon->coarsenKernel(mesh->Nelements, o_R, o_x, o_Rx);
-
-  if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {
-    ogsGatherScatter(o_Rx, ogsDfloat, ogsAdd, mesh->ogs);  
-    if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_Rx);
-  }
-}
-
-void ellipticMultigridProlongate(void **args, occa::memory &o_x, occa::memory &o_Px) {
-
-  elliptic_t *elliptic = (elliptic_t *) args[0];
-  mesh_t *mesh = elliptic->mesh;
-  precon_t *precon = elliptic->precon;
-  occa::memory o_R = elliptic->o_R;
-
-  precon->prolongateKernel(mesh->Nelements, o_R, o_x, o_Px);
-}
-
-void ellipticGather(void **args, occa::memory &o_x, occa::memory &o_Gx) {
-
-  elliptic_t *elliptic = (elliptic_t *) args[0];
-  ogs_t *ogs       = (ogs_t *) args[1];
-  occa::memory *o_s= (occa::memory *) args[2];
-  
-  mesh_t *mesh      = elliptic->mesh;
-  setupAide options = elliptic->options;
-
-  ogsGather(o_Gx, o_x, ogsDfloat, ogsAdd, ogs);
-  elliptic->dotMultiplyKernel(ogs->Ngather, ogs->o_gatherInvDegree, o_Gx, o_Gx);
-}
-
-void ellipticScatter(void **args, occa::memory &o_x, occa::memory &o_Sx) {
-
-  elliptic_t *elliptic = (elliptic_t *) args[0];
-  ogs_t *ogs       = (ogs_t *) args[1];
-  occa::memory *o_s= (occa::memory *) args[2];
-  
-  mesh_t *mesh      = elliptic->mesh;
-  setupAide options = elliptic->options;
-
-  ogsScatter(o_Sx, o_x, ogsDfloat, ogsAdd, ogs);
-}
-
-void buildCoarsenerTriTet(elliptic_t* elliptic, mesh_t **meshLevels, int Nf, int Nc);
-void buildCoarsenerQuadHex(elliptic_t* elliptic, mesh_t **meshLevels, int Nf, int Nc);
-
 void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda) {
 
   mesh_t *mesh = elliptic->mesh;
@@ -105,7 +37,7 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd
     meshLevels[n] = (mesh_t *) calloc(1,sizeof(mesh_t));
     meshLevels[n]->Nverts = mesh->Nverts;
     meshLevels[n]->Nfaces = mesh->Nfaces;
-    
+
     switch(elliptic->elementType){
     case TRIANGLES:
       meshLoadReferenceNodesTri2D(meshLevels[n], n); break;
@@ -119,35 +51,35 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd
   }
 
   //set the number of MG levels and their degree
-  int numLevels;
+  int numMGLevels;
   int *levelDegree;
 
   if (options.compareArgs("MULTIGRID COARSENING","ALLDEGREES")) {
-    numLevels = mesh->N;
-    levelDegree= (int *) calloc(numLevels,sizeof(int));
-    for (int n=0;n<numLevels;n++) levelDegree[n] = mesh->N - n; //all degrees
+    numMGLevels = mesh->N;
+    levelDegree= (int *) calloc(numMGLevels,sizeof(int));
+    for (int n=0;n<numMGLevels;n++) levelDegree[n] = mesh->N - n; //all degrees
   } else if (options.compareArgs("MULTIGRID COARSENING","HALFDEGREES")) {
-    numLevels = floor(mesh->N/2.)+1;
-    levelDegree= (int *) calloc(numLevels,sizeof(int));
-    for (int n=0;n<numLevels;n++) levelDegree[n] = mesh->N - 2*n; //decrease by two
-    levelDegree[numLevels-1] = 1; //ensure the last level is degree 1
+    numMGLevels = floor(mesh->N/2.)+1;
+    levelDegree= (int *) calloc(numMGLevels,sizeof(int));
+    for (int n=0;n<numMGLevels;n++) levelDegree[n] = mesh->N - 2*n; //decrease by two
+    levelDegree[numMGLevels-1] = 1; //ensure the last level is degree 1
   } else { //default "HALFDOFS"
     // pick the degrees so the dofs of each level halfs (roughly)
     //start by counting the number of levels neccessary
-    numLevels = 1;
+    numMGLevels = 1;
     int degree = mesh->N;
     int dofs = meshLevels[degree]->Np;
     int basedofs = mesh->Nverts;
     while (dofs>basedofs) {
-      numLevels++;
+      numMGLevels++;
       for (;degree>0;degree--)
         if (meshLevels[degree]->Np<=dofs/2)
           break;
       dofs = meshLevels[degree]->Np;
     }
-    levelDegree= (int *) calloc(numLevels,sizeof(int));
+    levelDegree= (int *) calloc(numMGLevels,sizeof(int));
     degree = mesh->N;
-    numLevels = 1;
+    numMGLevels = 1;
     levelDegree[0] = degree;
     dofs = meshLevels[degree]->Np;
     while (dofs>basedofs) {
@@ -155,133 +87,48 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd
         if (meshLevels[degree]->Np<=dofs/2)
           break;
       dofs = meshLevels[degree]->Np;
-      levelDegree[numLevels] = degree;
-      numLevels++;
+      levelDegree[numMGLevels] = degree;
+      numMGLevels++;
     }
   }
 
-  //storage for lambda parameter
-  dfloat *vlambda = (dfloat *) calloc(1,sizeof(dfloat));
-  *vlambda = lambda;
+  int Nmax = levelDegree[0];
+  int Nmin = levelDegree[numMGLevels-1];
 
   //initialize parAlmond
-  precon->parAlmond = parAlmondInit(mesh, elliptic->options);
-  agmgLevel **levels = precon->parAlmond->levels;
+  precon->parAlmond = parAlmond::Init(mesh->device, mesh->comm, options);
+  parAlmond::multigridLevel **levels = precon->parAlmond->levels;
+
+  //set up the finest level
+  if (Nmax>Nmin) {
+    levels[0] = new MGLevel(elliptic, lambda, Nmax, options,
+                            precon->parAlmond->ktype, mesh->comm);
+    MGLevelAllocateStorage((MGLevel*) levels[0], 0,
+                            precon->parAlmond->ctype);
+    precon->parAlmond->numLevels++;
+  }
 
-  //build a elliptic struct for every degree
-  elliptic_t **ellipticsN = (elliptic_t**) calloc(mesh->N+1,sizeof(elliptic_t*));
-  ellipticsN[mesh->N] = elliptic; //top level
-  for (int n=1;n<numLevels;n++) {  //build elliptic for this degree
-    int Nf = levelDegree[n-1];
+  //build a MGLevel for every degree (except degree 1)
+  for (int n=1;n<numMGLevels-1;n++) {
     int Nc = levelDegree[n];
-    printf("=============BUILDING MULTIGRID LEVEL OF DEGREE %d==================\n", Nc);
-    ellipticsN[Nc] = ellipticBuildMultigridLevel(elliptic,Nc,Nf);
-  }
+    int Nf = levelDegree[n-1];
 
-  // set multigrid operators for fine levels
-  for (int n=0;n<numLevels-1;n++) {
-    int N = levelDegree[n];
-    elliptic_t *ellipticL = ellipticsN[N];
+    //build elliptic struct for this degree
+    printf("=============BUILDING MULTIGRID LEVEL OF DEGREE %d==================\n", Nc);
+    elliptic_t *ellipticC = ellipticBuildMultigridLevel(elliptic,Nc,Nf);
 
     //add the level manually
+    levels[n] = new MGLevel(elliptic,
+                           meshLevels,
+                           ((MGLevel*) levels[n-1])->elliptic,
+                           ellipticC,
+                           lambda,
+                           Nf, Nc,
+                           options,
+                           precon->parAlmond->ktype, mesh->comm);
+    MGLevelAllocateStorage((MGLevel*) levels[n], n,
+                            precon->parAlmond->ctype);
     precon->parAlmond->numLevels++;
-    levels[n] = (agmgLevel *) calloc(1,sizeof(agmgLevel));
-    levels[n]->gatherLevel = false;   //dont gather this level
-    if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {//use weighted inner products
-      precon->parAlmond->levels[n]->weightedInnerProds = true;
-      precon->parAlmond->levels[n]->o_weight = ellipticL->o_invDegree;
-      precon->parAlmond->levels[n]->weight = ellipticL->invDegree;
-    }
-
-    //use the matrix free Ax
-    levels[n]->AxArgs = (void **) calloc(2,sizeof(void*));
-    levels[n]->AxArgs[0] = (void *) ellipticL;
-    levels[n]->AxArgs[1] = (void *) vlambda;
-    levels[n]->device_Ax = ellipticMultigridAx;
-
-    levels[n]->smoothArgs = (void **) calloc(2,sizeof(void*));
-    levels[n]->smoothArgs[0] = (void *) ellipticL;
-    levels[n]->smoothArgs[1] = (void *) levels[n];
-
-    levels[n]->Nrows = mesh->Nelements*ellipticL->mesh->Np;
-    levels[n]->Ncols = (mesh->Nelements+mesh->totalHaloPairs)*ellipticL->mesh->Np;
-
-    if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) {
-      if (!options.getArgs("MULTIGRID CHEBYSHEV DEGREE", levels[n]->ChebyshevIterations))
-        levels[n]->ChebyshevIterations = 2; //default to degree 2
-
-      levels[n]->device_smooth = ellipticMultigridSmoothChebyshev;
-
-      levels[n]->smootherResidual = (dfloat *) calloc(levels[n]->Ncols,sizeof(dfloat));
-
-      // extra storage for smoothing op
-      levels[n]->o_smootherResidual = mesh->device.malloc(levels[n]->Ncols*sizeof(dfloat),levels[n]->smootherResidual);
-      levels[n]->o_smootherResidual2 = mesh->device.malloc(levels[n]->Ncols*sizeof(dfloat),levels[n]->smootherResidual);
-      levels[n]->o_smootherUpdate = mesh->device.malloc(levels[n]->Ncols*sizeof(dfloat),levels[n]->smootherResidual);
-    } else {
-      levels[n]->device_smooth = ellipticMultigridSmooth;
-
-      // extra storage for smoothing op
-      levels[n]->o_smootherResidual = mesh->device.malloc(levels[n]->Ncols*sizeof(dfloat));
-    }
-
-    levels[n]->smootherArgs = (void **) calloc(2,sizeof(void*));
-    levels[n]->smootherArgs[0] = (void *) ellipticL;
-    levels[n]->smootherArgs[1] = (void *) vlambda;
-
-    dfloat rateTolerance;    // 0 - accept not approximate patches, 1 - accept all approximate patches
-    if(options.compareArgs("MULTIGRID SMOOTHER","EXACT")){
-      rateTolerance = 0.0;
-    } else {
-      rateTolerance = 1.0;
-    }
-
-    //set up the fine problem smoothing
-    if(options.compareArgs("MULTIGRID SMOOTHER","LOCALPATCH")){
-      ellipticSetupSmootherLocalPatch(ellipticL, ellipticL->precon, levels[n], lambda, rateTolerance);
-    } else { //default to damped jacobi
-      ellipticSetupSmootherDampedJacobi(ellipticL, ellipticL->precon, levels[n], lambda);
-    }
-  }
-
-  //report top levels
-  if (options.compareArgs("VERBOSE","TRUE")) {
-    if((mesh->rank==0)&&(numLevels>0)) { //report the upper multigrid levels
-      printf("------------------Multigrid Report---------------------\n");
-      printf("-------------------------------------------------------\n");
-      printf("level|  Degree  |    dimension   |      Smoother       \n");
-      printf("     |  Degree  |  (min,max,avg) |      Smoother       \n");
-      printf("-------------------------------------------------------\n");
-    }
-
-    for(int lev=0; lev<numLevels; lev++){
-
-      dlong Nrows = (lev==numLevels-1) ? mesh->Nverts*mesh->Nelements: levels[lev]->Nrows;
-      hlong hNrows = (hlong) Nrows;
-
-      dlong minNrows=0, maxNrows=0;
-      hlong totalNrows=0;
-      dfloat avgNrows;
-
-      MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, mesh->comm);
-      MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, mesh->comm);
-      avgNrows = (dfloat) totalNrows/mesh->size;
-
-      if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min
-      MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, mesh->comm);
-
-      char smootherString[BUFSIZ];
-      strcpy(smootherString, (char*) (options.getArgs("MULTIGRID SMOOTHER")).c_str());
-
-      if (mesh->rank==0){
-        printf(" %3d |   %3d    |    %10.2f  |   %s  \n",
-          lev, levelDegree[lev], (dfloat)minNrows, smootherString);
-        printf("     |          |    %10.2f  |   \n", (dfloat)maxNrows);
-        printf("     |          |    %10.2f  |   \n", avgNrows);
-      }
-    }
-    if((mesh->rank==0)&&(numLevels>0)) 
-      printf("-------------------------------------------------------\n");
   }
 
   /* build degree 1 problem and pass to AMG */
@@ -289,18 +136,27 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd
   dlong nnzCoarseA;
   ogs_t *coarseogs;
 
-  elliptic_t* ellipticL = ellipticsN[1];
-  int basisNp = ellipticL->mesh->Np;
+  //set up the base level
+  elliptic_t* ellipticCoarse;
+  if (Nmax>Nmin) {
+    int Nc = levelDegree[numMGLevels-1];
+    int Nf = levelDegree[numMGLevels-2];
+    printf("=============BUILDING MULTIGRID LEVEL OF DEGREE %d==================\n", Nmin);
+    ellipticCoarse = ellipticBuildMultigridLevel(elliptic,Nc,Nf);
+  } else {
+    ellipticCoarse = elliptic;
+  }
+  int basisNp = ellipticCoarse->mesh->Np;
   dfloat *basis = NULL;
 
-  if (options.compareArgs("BASIS","BERN")) basis = ellipticL->mesh->VB;
+  if (options.compareArgs("BASIS","BERN")) basis = ellipticCoarse->mesh->VB;
 
   hlong *coarseGlobalStarts = (hlong*) calloc(mesh->size+1, sizeof(hlong));
 
   if (options.compareArgs("DISCRETIZATION","IPDG")) {
-    ellipticBuildIpdg(ellipticL, basisNp, basis, lambda, &coarseA, &nnzCoarseA,coarseGlobalStarts);
+    ellipticBuildIpdg(ellipticCoarse, basisNp, basis, lambda, &coarseA, &nnzCoarseA,coarseGlobalStarts);
   } else if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {
-    ellipticBuildContinuous(ellipticL,lambda,&coarseA,&nnzCoarseA,&coarseogs,coarseGlobalStarts);
+    ellipticBuildContinuous(ellipticCoarse,lambda,&coarseA,&nnzCoarseA,&coarseogs,coarseGlobalStarts);
   }
 
   hlong *Rows = (hlong *) calloc(nnzCoarseA, sizeof(hlong));
@@ -312,168 +168,124 @@ void ellipticMultiGridSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambd
     Cols[i] = coarseA[i].col;
     Vals[i] = coarseA[i].val;
   }
+  free(coarseA);
 
   // build amg starting at level N=1
-  parAlmondAgmgSetup(precon->parAlmond,
-                     coarseGlobalStarts,
-                     nnzCoarseA,
-                     Rows,
-                     Cols,
-                     Vals,
-                     elliptic->allNeumann,
-                     elliptic->allNeumannPenalty);
-  free(coarseA); free(Rows); free(Cols); free(Vals);
-
-  //tell parAlmond to gather this level
-  agmgLevel *coarseLevel = precon->parAlmond->levels[numLevels-1];
-  if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {
-    coarseLevel->gatherLevel = true;
-    coarseLevel->weightedInnerProds = false;
-    
-    coarseLevel->Srhs = (dfloat*) calloc(ellipticL->mesh->Np*ellipticL->mesh->Nelements,sizeof(dfloat));
-    coarseLevel->Sx   = (dfloat*) calloc(ellipticL->mesh->Np*ellipticL->mesh->Nelements,sizeof(dfloat));
-    coarseLevel->o_Srhs = ellipticL->mesh->device.malloc(ellipticL->mesh->Np*ellipticL->mesh->Nelements*sizeof(dfloat),coarseLevel->Srhs);
-    coarseLevel->o_Sx   = ellipticL->mesh->device.malloc(ellipticL->mesh->Np*ellipticL->mesh->Nelements*sizeof(dfloat),coarseLevel->Sx);
-
-    coarseLevel->gatherArgs = (void **) calloc(3,sizeof(void*));  
-    coarseLevel->gatherArgs[0] = (void *) ellipticL;
-    coarseLevel->gatherArgs[1] = (void *) ellipticL->ogs;
-    coarseLevel->gatherArgs[2] = (void *) &(coarseLevel->o_Sx);
-    coarseLevel->scatterArgs = coarseLevel->gatherArgs;
-
-    coarseLevel->device_gather  = ellipticGather;
-    coarseLevel->device_scatter = ellipticScatter;        
+  parAlmond::AMGSetup(precon->parAlmond,
+                       coarseGlobalStarts,
+                       nnzCoarseA,
+                       Rows,
+                       Cols,
+                       Vals,
+                       elliptic->allNeumann,
+                       elliptic->allNeumannPenalty);
+  free(Rows); free(Cols); free(Vals);
+
+  //overwrite the finest AMG level with the degree 1 matrix free level
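+  // NOTE(review): "delete levels[numMGLevels-1];" is disabled, so the level replaced below appears not to be freed - confirm ownership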
+  if (Nmax>Nmin) {
+    int Nc = levelDegree[numMGLevels-1];
+    int Nf = levelDegree[numMGLevels-2];
+    elliptic_t *ellipticFine = ((MGLevel*) levels[numMGLevels-2])->elliptic;
+    levels[numMGLevels-1] = new MGLevel(elliptic,
+                                       meshLevels,
+                                       ellipticFine,
+                                       ellipticCoarse,
+                                       lambda,
+                                       Nf, Nc,
+                                       options,
+                                       precon->parAlmond->ktype, mesh->comm);
+  } else {
+    levels[numMGLevels-1] = new MGLevel(ellipticCoarse, lambda, Nmin, options,
+                                       precon->parAlmond->ktype, mesh->comm);
   }
+  MGLevelAllocateStorage((MGLevel*) levels[numMGLevels-1], numMGLevels-1,
+                            precon->parAlmond->ctype);
 
-  /* build coarsening and prologation operators to connect levels */
-  for(int n=1; n<numLevels; n++) {
-    //build coarsen and prologation ops
-    int Nf = levelDegree[n-1]; //higher degree
-    int Nc = levelDegree[n];  
-
-    elliptic_t *ellipticL = ellipticsN[Nc];
-    elliptic_t *ellipticF = ellipticsN[Nf];
-
-    if (elliptic->elementType==TRIANGLES||elliptic->elementType==TETRAHEDRA){
-      buildCoarsenerTriTet(ellipticL, meshLevels, Nf, Nc);
+  //tell parAlmond to gather when going to the next level
+  if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {
+    if (precon->parAlmond->numLevels > numMGLevels) {
+      parAlmond::agmgLevel *nextLevel
+            = (parAlmond::agmgLevel*)precon->parAlmond->levels[numMGLevels];
+
+      nextLevel->gatherLevel = true;
+      nextLevel->ogs = ellipticCoarse->ogs;
+      nextLevel->Gx = (dfloat*) calloc(levels[numMGLevels-1]->Ncols,sizeof(dfloat));
+      nextLevel->Sx = (dfloat*) calloc(ellipticCoarse->mesh->Np*ellipticCoarse->mesh->Nelements,sizeof(dfloat));
+      nextLevel->o_Gx = ellipticCoarse->mesh->device.malloc(levels[numMGLevels-1]->Ncols*sizeof(dfloat),nextLevel->Gx);
+      nextLevel->o_Sx = ellipticCoarse->mesh->device.malloc(ellipticCoarse->mesh->Np*ellipticCoarse->mesh->Nelements*sizeof(dfloat),nextLevel->Sx);
     } else {
-      buildCoarsenerQuadHex(ellipticL, meshLevels, Nf, Nc);
+      //this level is the base
+      parAlmond::coarseSolver *coarseLevel = precon->parAlmond->coarseLevel;
+
+      coarseLevel->gatherLevel = true;
+      coarseLevel->ogs = ellipticCoarse->ogs;
+      coarseLevel->Gx = (dfloat*) calloc(coarseLevel->ogs->Ngather,sizeof(dfloat));
+      coarseLevel->Sx = (dfloat*) calloc(ellipticCoarse->mesh->Np*ellipticCoarse->mesh->Nelements,sizeof(dfloat));
+      coarseLevel->o_Gx = ellipticCoarse->mesh->device.malloc(coarseLevel->ogs->Ngather*sizeof(dfloat),coarseLevel->Gx);
+      coarseLevel->o_Sx = ellipticCoarse->mesh->device.malloc(ellipticCoarse->mesh->Np*ellipticCoarse->mesh->Nelements*sizeof(dfloat),coarseLevel->Sx);
     }
-    
-    levels[n]->coarsenArgs = (void **) calloc(2,sizeof(void*));
-    levels[n]->coarsenArgs[0] = (void *) ellipticL;
-    levels[n]->coarsenArgs[1] = (void *) ellipticF;
-
-    levels[n]->prolongateArgs = levels[n]->coarsenArgs;
-    
-    levels[n]->device_coarsen = ellipticMultigridCoarsen;
-    levels[n]->device_prolongate = ellipticMultigridProlongate;
   }
 
   for (int n=1;n<mesh->N+1;n++) free(meshLevels[n]);
   free(meshLevels);
-}
-
-
-
-void buildCoarsenerTriTet(elliptic_t* elliptic, mesh_t **meshLevels, int Nf, int Nc) {
-
-  int NpFine   = meshLevels[Nf]->Np;
-  int NpCoarse = meshLevels[Nc]->Np;
-  dfloat *P    = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat));
-  dfloat *Ptmp = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat));
-
-  //initialize P as identity (which it is for SPARSE)
-  for (int i=0;i<NpCoarse;i++) P[i*NpCoarse+i] = 1.0;
 
-
-  for (int n=Nc;n<Nf;n++) {
-
-    int Npp1 = meshLevels[n+1]->Np;
-    int Np   = meshLevels[n  ]->Np;
-
-    //copy P
-    for (int i=0;i<Np*NpCoarse;i++) Ptmp[i] = P[i];
-
-    //Multiply by the raise op
-    for (int i=0;i<Npp1;i++) {
-      for (int j=0;j<NpCoarse;j++) {
-        P[i*NpCoarse + j] = 0.;
-        for (int k=0;k<Np;k++) {
-          P[i*NpCoarse + j] += meshLevels[n]->interpRaise[i*Np+k]*Ptmp[k*NpCoarse + j];
-        }
-      }
+  //report top levels
+  if (options.compareArgs("VERBOSE","TRUE")) {
+    if (mesh->rank==0) { //report the upper multigrid levels
+      printf("------------------Multigrid Report----------------------------------------\n");
+      printf("--------------------------------------------------------------------------\n");
+      printf("level|    Type    |    dimension   |   nnz per row   |   Smoother        |\n");
+      printf("     |            |  (min,max,avg) |  (min,max,avg)  |                   |\n");
+      printf("--------------------------------------------------------------------------\n");
     }
-  }
 
-  if (elliptic->options.compareArgs("BASIS","BERN")) {
-    dfloat* BBP = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat));
-    for (int j=0;j<NpFine;j++) {
-      for (int i=0;i<NpCoarse;i++) {
-        for (int k=0;k<NpCoarse;k++) {
-          for (int l=0;l<NpFine;l++) {
-            BBP[i+j*NpCoarse] += meshLevels[Nf]->invVB[l+j*NpFine]*P[k+l*NpCoarse]*meshLevels[Nc]->VB[i+k*NpCoarse];
-          }
-        }
-      }
-    }
-    for (int j=0;j<NpFine;j++) {
-      for (int i=0;i<NpCoarse;i++) {
-        P[i+j*NpCoarse] = BBP[i+j*NpCoarse];
-      }
+    for(int lev=0; lev<precon->parAlmond->numLevels; lev++) {
+      if(mesh->rank==0) {printf(" %3d ", lev);fflush(stdout);}
+      levels[lev]->Report();
     }
-    free(BBP);
-  }
 
-  //the coarsen matrix is P^T
-  elliptic->R = (dfloat *) calloc(NpFine*NpCoarse,sizeof(dfloat));
-  for (int i=0;i<NpCoarse;i++) {
-    for (int j=0;j<NpFine;j++) {
-      elliptic->R[i*NpFine+j] = P[j*NpCoarse+i];
-    }
+    if (mesh->rank==0)
+      printf("--------------------------------------------------------------------------\n");
   }
-  elliptic->o_R = elliptic->mesh->device.malloc(NpFine*NpCoarse*sizeof(dfloat), elliptic->R);
-
-  free(P); free(Ptmp);
 }
 
-void buildCoarsenerQuadHex(elliptic_t* elliptic, mesh_t **meshLevels, int Nf, int Nc) {
-
-  int NqFine   = Nf+1;
-  int NqCoarse = Nc+1;
-  dfloat *P    = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat));
-  dfloat *Ptmp = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat));
 
-  //initialize P as identity
-  for (int i=0;i<NqCoarse;i++) P[i*NqCoarse+i] = 1.0;
-
-  for (int n=Nc;n<Nf;n++) {
-
-    int Nqp1 = n+2;
-    int Nq   = n+1;
-
-    //copy P
-    for (int i=0;i<Nq*NqCoarse;i++) Ptmp[i] = P[i];
-
-    //Multiply by the raise op
-    for (int i=0;i<Nqp1;i++) {
-      for (int j=0;j<NqCoarse;j++) {
-        P[i*NqCoarse + j] = 0.;
-        for (int k=0;k<Nq;k++) {
-          P[i*NqCoarse + j] += meshLevels[n]->interpRaise[i*Nq+k]*Ptmp[k*NqCoarse + j];
-        }
-      }
+void MGLevelAllocateStorage(MGLevel *level, int k, parAlmond::CycleType ctype) {
+  // extra storage for smoothing op
+  size_t Nbytes = level->Ncols*sizeof(dfloat);
+  if (MGLevel::smootherResidualBytes < Nbytes) {
+    if (MGLevel::o_smootherResidual.size()) {
+      free(MGLevel::smootherResidual);
+      MGLevel::o_smootherResidual.free();
+      MGLevel::o_smootherResidual2.free();
+      MGLevel::o_smootherUpdate.free();
     }
+
+    MGLevel::smootherResidual = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
+    MGLevel::o_smootherResidual = level->mesh->device.malloc(Nbytes,MGLevel::smootherResidual);
+    MGLevel::o_smootherResidual2 = level->mesh->device.malloc(Nbytes,MGLevel::smootherResidual);
+    MGLevel::o_smootherUpdate = level->mesh->device.malloc(Nbytes,MGLevel::smootherResidual);
+    MGLevel::smootherResidualBytes = Nbytes;
   }
 
-  //the coarsen matrix is P^T
-  elliptic->R = (dfloat *) calloc(NqFine*NqCoarse,sizeof(dfloat));
-  for (int i=0;i<NqCoarse;i++) {
-    for (int j=0;j<NqFine;j++) {
-      elliptic->R[i*NqFine+j] = P[j*NqCoarse+i];
+  if (k) level->x    = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
+  if (k) level->rhs  = (dfloat *) calloc(level->Nrows,sizeof(dfloat));
+  if (k) level->o_x   = level->mesh->device.malloc(level->Ncols*sizeof(dfloat),level->x);
+  if (k) level->o_rhs = level->mesh->device.malloc(level->Nrows*sizeof(dfloat),level->rhs);
+
+  level->res  = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
+  level->o_res = level->mesh->device.malloc(level->Ncols*sizeof(dfloat),level->res);
+
+  //kcycle vectors
+  if (ctype==parAlmond::KCYCLE) {
+    if ((k>0) && (k<NUMKCYCLES+1)) {
+      level->ck = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
+      level->vk = (dfloat *) calloc(level->Nrows,sizeof(dfloat));
+      level->wk = (dfloat *) calloc(level->Nrows,sizeof(dfloat));
+      level->o_ck = level->mesh->device.malloc(level->Ncols*sizeof(dfloat),level->ck);
+      level->o_vk = level->mesh->device.malloc(level->Nrows*sizeof(dfloat),level->vk);
+      level->o_wk = level->mesh->device.malloc(level->Nrows*sizeof(dfloat),level->wk);
     }
   }
-  elliptic->o_R = elliptic->mesh->device.malloc(NqFine*NqCoarse*sizeof(dfloat), elliptic->R);
-
-  free(P); free(Ptmp);
 }
diff --git a/solvers/elliptic/src/ellipticPreconditioner.c b/solvers/elliptic/src/ellipticPreconditioner.c
index c824aa7ec..7a3a3f1fd 100644
--- a/solvers/elliptic/src/ellipticPreconditioner.c
+++ b/solvers/elliptic/src/ellipticPreconditioner.c
@@ -32,14 +32,29 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda,
   mesh_t *mesh = elliptic->mesh;
   precon_t *precon = elliptic->precon;
   setupAide options = elliptic->options;
-  
-  if (   options.compareArgs("PRECONDITIONER", "FULLALMOND")
-      || options.compareArgs("PRECONDITIONER", "MULTIGRID")) {
+
+  if (options.compareArgs("PRECONDITIONER", "MULTIGRID")) {
 
     occaTimerTic(mesh->device,"parALMOND");
-    parAlmondPrecon(precon->parAlmond, o_z, o_r);
+    parAlmond::Precon(precon->parAlmond, o_z, o_r);
     occaTimerToc(mesh->device,"parALMOND");
 
+  } else if (options.compareArgs("PRECONDITIONER", "FULLALMOND")) {
+
+    if (options.compareArgs("DISCRETIZATION", "IPDG")) {
+      occaTimerTic(mesh->device,"parALMOND");
+      parAlmond::Precon(precon->parAlmond, o_z, o_r);
+      occaTimerToc(mesh->device,"parALMOND");
+    } else if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) {
+      ogsGather(precon->o_rhsG, o_r, ogsDfloat, ogsAdd, elliptic->ogs);
+      elliptic->dotMultiplyKernel(elliptic->ogs->Ngather,
+                      elliptic->ogs->o_gatherInvDegree, precon->o_rhsG, precon->o_rhsG);
+      occaTimerTic(mesh->device,"parALMOND");
+      parAlmond::Precon(precon->parAlmond, precon->o_xG, precon->o_rhsG);
+      occaTimerToc(mesh->device,"parALMOND");
+      ogsScatter(o_z, precon->o_xG, ogsDfloat, ogsAdd, elliptic->ogs);
+    }
+
   } else if(options.compareArgs("PRECONDITIONER", "MASSMATRIX")){
 
     dfloat invLambda = 1./lambda;
@@ -53,20 +68,20 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda,
 
       elliptic->dotMultiplyKernel(mesh->Nelements*mesh->Np, ogs->o_invDegree, o_r, elliptic->o_rtmp);
 
-      if(mesh->NglobalGatherElements) 
-        precon->partialblockJacobiKernel(mesh->NglobalGatherElements, 
+      if(mesh->NglobalGatherElements)
+        precon->partialblockJacobiKernel(mesh->NglobalGatherElements,
                                 mesh->o_globalGatherElementList,
                                 invLambda, mesh->o_vgeo, precon->o_invMM, elliptic->o_rtmp, o_z);
 
       ogsGatherScatterStart(o_z, ogsDfloat, ogsAdd, ogs);
 
       if(mesh->NlocalGatherElements)
-        precon->partialblockJacobiKernel(mesh->NlocalGatherElements, 
+        precon->partialblockJacobiKernel(mesh->NlocalGatherElements,
                                 mesh->o_localGatherElementList,
                                 invLambda, mesh->o_vgeo, precon->o_invMM, elliptic->o_rtmp, o_z);
-      
+
       ogsGatherScatterFinish(o_z, ogsDfloat, ogsAdd, ogs);
-      
+
       elliptic->dotMultiplyKernel(mesh->Nelements*mesh->Np, ogs->o_invDegree, o_z, o_z);
 
       //post-mask
@@ -81,7 +96,7 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda,
       precon->SEMFEMInterpKernel(mesh->Nelements,mesh->o_SEMFEMAnterp,o_z,precon->o_rFEM);
       ogsGather(precon->o_GrFEM, precon->o_rFEM, ogsDfloat, ogsAdd, precon->FEMogs);
       occaTimerTic(mesh->device,"parALMOND");
-      parAlmondPrecon(precon->parAlmond, precon->o_GzFEM, precon->o_GrFEM);
+      parAlmond::Precon(precon->parAlmond, precon->o_GzFEM, precon->o_GrFEM);
       occaTimerToc(mesh->device,"parALMOND");
       ogsScatter(precon->o_zFEM, precon->o_GzFEM, ogsDfloat, ogsAdd, precon->FEMogs);
       precon->SEMFEMAnterpKernel(mesh->Nelements,mesh->o_SEMFEMAnterp,precon->o_zFEM,o_z);
@@ -90,9 +105,13 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda,
       ogsGatherScatter(o_z, ogsDfloat, ogsAdd, elliptic->ogs);
       if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, o_z);
     } else {
+      ogsGather(precon->o_rhsG, o_r, ogsDfloat, ogsAdd, precon->FEMogs);
+      elliptic->dotMultiplyKernel(precon->FEMogs->Ngather,
+                      precon->FEMogs->o_gatherInvDegree, precon->o_rhsG, precon->o_rhsG);
       occaTimerTic(mesh->device,"parALMOND");
-      parAlmondPrecon(precon->parAlmond, o_z, o_r);
+      parAlmond::Precon(precon->parAlmond, precon->o_xG, precon->o_rhsG);
       occaTimerToc(mesh->device,"parALMOND");
+      ogsScatter(o_z, precon->o_xG, ogsDfloat, ogsAdd, precon->FEMogs);
     }
 
   } else if(options.compareArgs("PRECONDITIONER", "JACOBI")){
@@ -102,7 +121,7 @@ void ellipticPreconditioner(elliptic_t *elliptic, dfloat lambda,
     occaTimerTic(mesh->device,"dotDivideKernel");
     elliptic->dotMultiplyKernel(Ntotal, o_r, precon->o_invDiagA, o_z);
     occaTimerToc(mesh->device,"dotDivideKernel");
-  
+
   } else{ // turn off preconditioner
     o_z.copyFrom(o_r);
   }
diff --git a/solvers/elliptic/src/ellipticPreconditionerSetup.c b/solvers/elliptic/src/ellipticPreconditionerSetup.c
index fe7d5ab24..934addc3e 100644
--- a/solvers/elliptic/src/ellipticPreconditionerSetup.c
+++ b/solvers/elliptic/src/ellipticPreconditionerSetup.c
@@ -52,15 +52,16 @@ void ellipticPreconditionerSetup(elliptic_t *elliptic, ogs_t *ogs, dfloat lambda
     hlong *Rows = (hlong *) calloc(nnz, sizeof(hlong));
     hlong *Cols = (hlong *) calloc(nnz, sizeof(hlong));
     dfloat *Vals = (dfloat*) calloc(nnz,sizeof(dfloat));
-    
+
     for (dlong n=0;n<nnz;n++) {
       Rows[n] = A[n].row;
       Cols[n] = A[n].col;
       Vals[n] = A[n].val;
     }
+    free(A);
 
-    precon->parAlmond = parAlmondInit(mesh, options);
-    parAlmondAgmgSetup(precon->parAlmond,
+    precon->parAlmond = parAlmond::Init(mesh->device, mesh->comm, options);
+    parAlmond::AMGSetup(precon->parAlmond,
                        globalStarts,
                        nnz,
                        Rows,
@@ -68,70 +69,20 @@ void ellipticPreconditionerSetup(elliptic_t *elliptic, ogs_t *ogs, dfloat lambda
                        Vals,
                        elliptic->allNeumann,
                        elliptic->allNeumannPenalty);
-    free(A); free(Rows); free(Cols); free(Vals);
-
-    if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) {//tell parAlmond to gather this level
-      agmgLevel *baseLevel = precon->parAlmond->levels[0];
-
-      baseLevel->gatherLevel = true;
-      baseLevel->Srhs = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat));
-      baseLevel->Sx   = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat));
-      baseLevel->o_Srhs = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat));
-      baseLevel->o_Sx   = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat));
+    free(Rows); free(Cols); free(Vals);
 
-      baseLevel->weightedInnerProds = false;
+    if (options.compareArgs("VERBOSE", "TRUE"))
+      parAlmond::Report(precon->parAlmond);
 
-      baseLevel->gatherArgs = (void **) calloc(3,sizeof(void*));  
-      baseLevel->gatherArgs[0] = (void *) elliptic;
-      baseLevel->gatherArgs[1] = (void *) precon->ogs;
-      baseLevel->gatherArgs[2] = (void *) &(baseLevel->o_Sx);
-      baseLevel->scatterArgs = baseLevel->gatherArgs;
+    if (options.compareArgs("DISCRETIZATION", "CONTINUOUS")) {//tell parAlmond to gather this level
+      parAlmond::multigridLevel *baseLevel = precon->parAlmond->levels[0];
 
-      baseLevel->device_gather  = ellipticGather;
-      baseLevel->device_scatter = ellipticScatter;        
+      precon->rhsG = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat));
+      precon->xG   = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat));
+      precon->o_rhsG = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat));
+      precon->o_xG   = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat));
     }
 
-/*
-    if (strstr(options,"MATRIXFREE")&&strstr(options,"IPDG")) { //swap the top AMG level ops for matrix free versions
-      agmgLevel *baseLevel = precon->parAlmond->levels[0];
-
-      dfloat *vlambda = (dfloat *) calloc(1,sizeof(dfloat));
-      *vlambda = lambda;
-      baseLevel->AxArgs = (void **) calloc(3,sizeof(void*));
-      baseLevel->AxArgs[0] = (void *) elliptic;
-      baseLevel->AxArgs[1] = (void *) vlambda;
-      baseLevel->AxArgs[2] = (void *) options;
-      baseLevel->device_Ax = AxTri2D;
-
-      baseLevel->smoothArgs = (void **) calloc(2,sizeof(void*));
-      baseLevel->smoothArgs[0] = (void *) elliptic;
-      baseLevel->smoothArgs[1] = (void *) baseLevel;
-      baseLevel->device_smooth = smoothTri2D;
-
-      baseLevel->smootherArgs = (void **) calloc(1,sizeof(void*));
-      baseLevel->smootherArgs[0] = (void *) elliptic;
-
-      baseLevel->Nrows = mesh->Nelements*mesh->Np;
-      baseLevel->Ncols = (mesh->Nelements+mesh->totalHaloPairs)*mesh->Np;
-
-      // extra storage for smoothing op
-      baseLevel->o_smootherResidual = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat),baseLevel->x);
-
-      dfloat rateTolerance;    // 0 - accept not approximate patches, 1 - accept all approximate patches
-      if(strstr(options, "EXACT")){
-        rateTolerance = 0.0;
-      } else {
-        rateTolerance = 1.0;
-      }
-
-      //set up the fine problem smoothing
-      if(strstr(options, "LOCALPATCH")){
-        ellipticSetupSmootherLocalPatch(elliptic, precon, baseLevel, tau, lambda, BCType, rateTolerance, options);
-      } else { //default to damped jacobi
-        ellipticSetupSmootherDampedJacobi(elliptic, precon, baseLevel, tau, lambda, BCType, options);
-      }
-    }
-*/
   } else if (options.compareArgs("PRECONDITIONER", "MASSMATRIX")){
 
     precon->o_invMM = mesh->device.malloc(mesh->Np*mesh->Np*sizeof(dfloat), mesh->invMM);
diff --git a/solvers/elliptic/src/ellipticSEMFEMSetup.c b/solvers/elliptic/src/ellipticSEMFEMSetup.c
index 59a8cd9a7..a09d996e0 100644
--- a/solvers/elliptic/src/ellipticSEMFEMSetup.c
+++ b/solvers/elliptic/src/ellipticSEMFEMSetup.c
@@ -108,7 +108,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
   memcpy(femMesh,mesh,sizeof(mesh_t));
 
   if (elliptic->elementType==TRIANGLES) {
-  
+
     //set semfem nodes as the grid points
     pmesh->Np = mesh->NpFEM;
     pmesh->r  = mesh->rFEM;
@@ -139,7 +139,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
       if( (pmesh->r[n]+1)*(pmesh->r[n]+1)+(pmesh->s[n]-1)*(pmesh->s[n]-1)<NODETOL)
         pmesh->vertexNodes[2] = n;
     }
-  
+
     // connect elements using parallel sort
     meshParallelConnect(pmesh);
 
@@ -208,7 +208,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
 
     // global nodes
     meshParallelConnectNodes(pmesh);
-    //pmesh->globalIds is now populated    
+    //pmesh->globalIds is now populated
   }
 
 
@@ -220,32 +220,32 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
   femMesh->EToV = (hlong*) calloc(femMesh->Nelements*femMesh->Nverts, sizeof(hlong));
   femMesh->EX = (dfloat*) calloc(femMesh->Nverts*femMesh->Nelements, sizeof(dfloat));
   femMesh->EY = (dfloat*) calloc(femMesh->Nverts*femMesh->Nelements, sizeof(dfloat));
-  if (elliptic->dim==3) 
+  if (elliptic->dim==3)
     femMesh->EZ = (dfloat*) calloc(femMesh->Nverts*femMesh->Nelements, sizeof(dfloat));
-  
+
   dlong *localIds = (dlong *) calloc(femMesh->Nverts*femMesh->Nelements,sizeof(dlong));
 
   // dlong NFEMverts = mesh->Nelements*mesh->NpFEM;
   for(dlong e=0;e<mesh->Nelements;++e){
     for (int n=0;n<mesh->NelFEM;n++) {
       dlong id[femMesh->Nverts];
-      
+
       dlong femId = e*mesh->NelFEM*mesh->Nverts+n*mesh->Nverts;
 
       for (int i=0;i<femMesh->Nverts;i++) {
         //local ids in the subelement fem grid
-        id[i] = e*mesh->NpFEM + mesh->FEMEToV[n*mesh->Nverts+i];  
-        
+        id[i] = e*mesh->NpFEM + mesh->FEMEToV[n*mesh->Nverts+i];
+
         /* read vertex triplet for triangle */
         femMesh->EToV[femId+i] = pmesh->globalIds[id[i]];
-        
+
         femMesh->EX[femId+i] = pmesh->x[id[i]];
         femMesh->EY[femId+i] = pmesh->y[id[i]];
-        if (elliptic->dim==3) 
+        if (elliptic->dim==3)
           femMesh->EZ[femId+i] = pmesh->z[id[i]];
 
       }
-      
+
       switch(elliptic->elementType){
       case TRIANGLES:
         localIds[femId+0] = id[0];
@@ -310,9 +310,9 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
 
   for (int n=0;n<mesh->NelFEM;n++) {
     for (int f=0;f<femMesh->Nfaces;f++) {
-      
+
       for (int face=0; face<pmesh->Nfaces;face++) {
-        
+
         //count the nodes on this face which are on a macro face
         int NvertsOnFace = 0;
         for (int i=0;i<femMesh->Nfp;i++){
@@ -322,7 +322,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
         }
         if (NvertsOnFace == femMesh->Nfp)
           femFaceMap[n*femMesh->Nfaces+f] = face; //on macro face
-      }      
+      }
     }
   }
 
@@ -401,19 +401,19 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
         }
       }
     }
-    ogsGatherScatter(mapB, ogsInt, ogsMin, pmesh->ogs); 
+    ogsGatherScatter(mapB, ogsInt, ogsMin, pmesh->ogs);
 
     //use the bc flags to find masked ids
     for (dlong n=0;n<pmesh->Nelements*pmesh->Np;n++) {
       if (mapB[n] == 1) { //Dirichlet boundary
         pmesh->maskedGlobalIds[n] = 0;
       }
-    } 
-    free(mapB);   
+    }
+    free(mapB);
   } else {
 
     //mask using the original mask
-    for (dlong n=0;n<elliptic->Nmasked;n++) 
+    for (dlong n=0;n<elliptic->Nmasked;n++)
       pmesh->maskedGlobalIds[elliptic->maskIds[n]] = 0;
 
   }
@@ -475,7 +475,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
           }
         }
       }
-    }  
+    }
   } else if (elliptic->elementType==TETRAHEDRA) {
     //build stiffness matrices
     femMesh->Srr = (dfloat *) calloc(femMesh->Np*femMesh->Np,sizeof(dfloat));
@@ -505,7 +505,7 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
       }
     }
   }
-  
+
   if (mesh->rank==0) printf("Building full SEMFEM matrix..."); fflush(stdout);
 
   // Build non-zeros of stiffness matrix (unassembled)
@@ -528,8 +528,8 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
     BuildFEMMatrixTet3D(femMesh,pmesh,lambda, localIds, globalNumbering, globalOwners,&cnt,sendNonZeros); break;
   case HEXAHEDRA:
     BuildFEMMatrixHex3D(femMesh,pmesh,lambda, localIds, globalNumbering, globalOwners,&cnt,sendNonZeros); break;
-  }  
-  
+  }
+
   // Make the MPI_NONZERO_T data type
   MPI_Datatype MPI_NONZERO_T;
   MPI_Datatype dtype[4] = {MPI_HLONG, MPI_HLONG, MPI_INT, MPI_DFLOAT};
@@ -601,9 +601,10 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
     Cols[n] = A[n].col;
     Vals[n] = A[n].val;
   }
+  free(A);
 
-  precon->parAlmond = parAlmondInit(mesh, options);
-  parAlmondAgmgSetup(precon->parAlmond,
+  precon->parAlmond = parAlmond::Init(mesh->device, mesh->comm, options);
+  parAlmond::AMGSetup(precon->parAlmond,
                      globalStarts,
                      nnz,
                      Rows,
@@ -611,13 +612,16 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
                      Vals,
                      elliptic->allNeumann,
                      elliptic->allNeumannPenalty);
-  free(A); free(Rows); free(Cols); free(Vals);
+  free(Rows); free(Cols); free(Vals);
+
+  if (options.compareArgs("VERBOSE", "TRUE"))
+      parAlmond::Report(precon->parAlmond);
 
   if (elliptic->elementType==TRIANGLES||elliptic->elementType==TETRAHEDRA) {
-    //tell parAlmond not to gather this level (its done manually)
-    agmgLevel *baseLevel = precon->parAlmond->levels[0];
-    baseLevel->gatherLevel = false;
-    baseLevel->weightedInnerProds = false;
+    // //tell parAlmond not to gather this level (it's done manually)
+    // agmgLevel *baseLevel = precon->parAlmond->levels[0];
+    // baseLevel->gatherLevel = false;
+    // baseLevel->weightedInnerProds = false;
 
     // build interp and anterp
     dfloat *SEMFEMAnterp = (dfloat*) calloc(mesh->NpFEM*mesh->Np, sizeof(dfloat));
@@ -639,30 +643,36 @@ void ellipticSEMFEMSetup(elliptic_t *elliptic, precon_t* precon, dfloat lambda)
     precon->o_GzFEM = mesh->device.malloc(precon->FEMogs->Ngather*sizeof(dfloat));
   } else {
 
-    //tell parAlmond to gather this level
-    agmgLevel *baseLevel = precon->parAlmond->levels[0];
+    // //tell parAlmond to gather this level
+    // agmgLevel *baseLevel = precon->parAlmond->levels[0];
+
+    // baseLevel->gatherLevel = true;
+    parAlmond::multigridLevel *baseLevel = precon->parAlmond->levels[0];
+    precon->rhsG = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat));
+    precon->xG   = (dfloat*) calloc(baseLevel->Ncols,sizeof(dfloat));
+    precon->o_rhsG = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat));
+    precon->o_xG   = mesh->device.malloc(baseLevel->Ncols*sizeof(dfloat));
 
-    baseLevel->gatherLevel = true;
-    baseLevel->Srhs = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat));
-    baseLevel->Sx   = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat));
-    baseLevel->o_Srhs = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat));
-    baseLevel->o_Sx   = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat));
+    // baseLevel->Srhs = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat));
+    // baseLevel->Sx   = (dfloat*) calloc(mesh->Np*mesh->Nelements,sizeof(dfloat));
+    // baseLevel->o_Srhs = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat));
+    // baseLevel->o_Sx   = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat));
 
-    baseLevel->weightedInnerProds = false;
+    // baseLevel->weightedInnerProds = false;
 
-    baseLevel->gatherArgs = (void **) calloc(3,sizeof(void*));  
-    baseLevel->gatherArgs[0] = (void *) elliptic;
-    baseLevel->gatherArgs[1] = (void *) precon->FEMogs;  //use the gs made from the partial gathered femgrid 
-    baseLevel->gatherArgs[2] = (void *) &(baseLevel->o_Sx);
-    baseLevel->scatterArgs = baseLevel->gatherArgs;
+    // baseLevel->gatherArgs = (void **) calloc(3,sizeof(void*));
+    // baseLevel->gatherArgs[0] = (void *) elliptic;
+    // baseLevel->gatherArgs[1] = (void *) precon->FEMogs;  //use the gs made from the partially gathered FEM grid
+    // baseLevel->gatherArgs[2] = (void *) &(baseLevel->o_Sx);
+    // baseLevel->scatterArgs = baseLevel->gatherArgs;
 
-    baseLevel->device_gather  = ellipticGather;
-    baseLevel->device_scatter = ellipticScatter;  
+    // baseLevel->device_gather  = ellipticGather;
+    // baseLevel->device_scatter = ellipticScatter;
   }
 }
 
 
-void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, 
+void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
                         dlong *localIds, hlong* globalNumbering, int *globalOwners,
                         dlong *cnt, nonZero_t *A) {
 
@@ -705,7 +715,7 @@ void BuildFEMMatrixTri2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
   }
 }
 
-void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, 
+void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
                           dlong *localIds, hlong* globalNumbering, int *globalOwners,
                           dlong *cnt, nonZero_t *A) {
 
@@ -777,7 +787,7 @@ void BuildFEMMatrixQuad2D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
   }
 }
 
-void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, 
+void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
                         dlong *localIds, hlong* globalNumbering, int *globalOwners,
                         dlong *cnt, nonZero_t *A) {
 
@@ -828,7 +838,7 @@ void BuildFEMMatrixTet3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
   }
 }
 
-void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda, 
+void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
                         dlong *localIds, hlong* globalNumbering, int *globalOwners,
                         dlong *cnt, nonZero_t *A) {
 
@@ -840,17 +850,17 @@ void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
 	  dlong nn = nx+ny*femMesh->Nq+nz*femMesh->Nq*femMesh->Nq;
 	  dlong idn = localIds[e*femMesh->Np + nn];
 	  if (globalNumbering[idn]<0) continue; //skip masked nodes
-      
+
 	  for (int mz=0;mz<femMesh->Nq;mz++) {
 	    for (int my=0;my<femMesh->Nq;my++) {
 	      for (int mx=0;mx<femMesh->Nq;mx++) {
 		dlong mm = mx+my*femMesh->Nq+mz*femMesh->Nq*femMesh->Nq;
 		dlong idm = localIds[e*femMesh->Np + mm];
 		if (globalNumbering[idm]<0) continue; //skip masked nodes
-      
+
 		int id;
 		dfloat val = 0.;
-        
+
 		if ((ny==my)&&(nz==mz)) {
 		  for (int k=0;k<femMesh->Nq;k++) {
 		    id = k+ny*femMesh->Nq+nz*femMesh->Nq*femMesh->Nq;
@@ -888,7 +898,7 @@ void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
 		    val += Gss*femMesh->D[ny+k*femMesh->Nq]*femMesh->D[my+k*femMesh->Nq];
 		  }
 		}
-        
+
 		if (nx==mx) {
 		  id = nx+my*femMesh->Nq+nz*femMesh->Nq*femMesh->Nq;
 		  dfloat Gst = femMesh->ggeo[e*femMesh->Np*femMesh->Nggeo + id + G12ID*femMesh->Np];
@@ -907,13 +917,13 @@ void BuildFEMMatrixHex3D(mesh_t *femMesh, mesh_t *pmesh, dfloat lambda,
 		    val += Gtt*femMesh->D[nz+k*femMesh->Nq]*femMesh->D[mz+k*femMesh->Nq];
 		  }
 		}
-        
+
 		if ((nx==mx)&&(ny==my)&&(nz==mz)) {
 		  id = nx + ny*femMesh->Nq+nz*femMesh->Nq*femMesh->Nq;
 		  dfloat JW = femMesh->ggeo[e*femMesh->Np*femMesh->Nggeo + id + GWJID*femMesh->Np];
 		  val += JW*lambda;
 		}
-        
+
 		// pack non-zero
 		dfloat nonZeroThreshold = 1e-7;
 		if (fabs(val) >= nonZeroThreshold) {
diff --git a/solvers/elliptic/src/ellipticSetup.c b/solvers/elliptic/src/ellipticSetup.c
index 6b08a95b8..eaab788fa 100644
--- a/solvers/elliptic/src/ellipticSetup.c
+++ b/solvers/elliptic/src/ellipticSetup.c
@@ -31,7 +31,7 @@ SOFTWARE.
 void reportMemoryUsage(occa::device &device, const char *mess);
 
 elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelInfo, setupAide options){
- 
+
   elliptic_t *elliptic = (elliptic_t*) calloc(1, sizeof(elliptic_t));
 
   options.getArgs("MESH DIMENSION", elliptic->dim);
@@ -50,8 +50,9 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
   else
     meshOccaSetup2D(mesh, options, kernelInfo);
 
-  reportMemoryUsage(mesh->device, "after occa setup");
-  
+  if (mesh->rank==0)
+    reportMemoryUsage(mesh->device, "after occa setup");
+
   // Boundary Type translation. Just default from the mesh file.
   int BCType[3] = {0,1,2};
   elliptic->BCType = (int*) calloc(3,sizeof(int));
@@ -62,12 +63,12 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
     if(options.compareArgs("DISCRETIZATION","CONTINUOUS")){
       if(options.compareArgs("ELEMENT MAP", "TRILINEAR")){
 	printf("mesh->dim = %d, mesh->Nverts = %d\n", mesh->dim, mesh->Nverts);
-	
+
 	// pack gllz, gllw, and elementwise EXYZ
 	hlong Nxyz = mesh->Nelements*mesh->dim*mesh->Nverts;
 	dfloat *EXYZ = (dfloat*) calloc(Nxyz, sizeof(dfloat));
 	dfloat *gllzw = (dfloat*) calloc(2*mesh->Nq, sizeof(dfloat));
-	
+
 	int sk = 0;
 	for(int n=0;n<mesh->Nq;++n)
 	  gllzw[sk++] = mesh->gllz[n];
@@ -83,7 +84,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
 	  for(int v=0;v<mesh->Nverts;++v)
 	    EXYZ[sk++] = mesh->EZ[e*mesh->Nverts+v];
 	}
-	
+
 	// nodewise ggeo with element coordinates and gauss node info
 	elliptic->o_EXYZ = mesh->device.malloc(Nxyz*sizeof(dfloat), EXYZ);
 	elliptic->o_gllzw = mesh->device.malloc(2*mesh->Nq*sizeof(dfloat), gllzw);
@@ -94,7 +95,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
     }
   }
 
-  // 
+  // set up solver storage, kernels and the preconditioner
   ellipticSolveSetup(elliptic, lambda, kernelInfo);
 
 
@@ -105,12 +106,12 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
   // load forcing into r
   for(dlong e=0;e<mesh->Nelements;++e){
     for(int n=0;n<mesh->Np;++n){
-      
+
       dfloat J;
       if (elliptic->elementType==TRIANGLES || elliptic->elementType==TETRAHEDRA) {
         J = mesh->vgeo[e*mesh->Nvgeo+JID];
       } else {
-        J = mesh->vgeo[mesh->Np*(e*mesh->Nvgeo + JID) + n];  
+        J = mesh->vgeo[mesh->Np*(e*mesh->Nvgeo + JID) + n];
       }
       dlong id = n+e*mesh->Np;
       dfloat xn = mesh->x[id];
@@ -119,7 +120,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
 
       if(elliptic->dim==2)
         elliptic->r[id] = J*(2*M_PI*M_PI+lambda)*sin(M_PI*xn)*sin(M_PI*yn);
-      else 
+      else
         elliptic->r[id] = J*(3*M_PI*M_PI+lambda)*cos(M_PI*xn)*cos(M_PI*yn)*cos(M_PI*zn);
       elliptic->x[id] = 0;
     }
@@ -135,13 +136,13 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
   elliptic->o_x   = mesh->device.malloc(Nall*sizeof(dfloat), elliptic->x);
 
 
-  string boundaryHeaderFileName; 
+  string boundaryHeaderFileName;
   options.getArgs("DATA FILE", boundaryHeaderFileName);
   kernelInfo["includes"] += (char*)boundaryHeaderFileName.c_str();
 
   // set kernel name suffix
   char *suffix;
-  
+
   if(elliptic->elementType==TRIANGLES)
     suffix = strdup("Tri2D");
   if(elliptic->elementType==QUADRILATERALS)
@@ -159,7 +160,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
       if(r==mesh->rank){
 	sprintf(fileName, DELLIPTIC "/okl/ellipticRhsBCIpdg%s.okl", suffix);
 	sprintf(kernelName, "ellipticRhsBCIpdg%s", suffix);
-	
+
 	elliptic->rhsBCIpdgKernel = mesh->device.buildKernel(fileName,kernelName, kernelInfo);
       }
       MPI_Barrier(mesh->comm);
@@ -186,17 +187,17 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
       if(r==mesh->rank){
 	sprintf(fileName, DELLIPTIC "/okl/ellipticRhsBC%s.okl", suffix);
 	sprintf(kernelName, "ellipticRhsBC%s", suffix);
-	
+
 	elliptic->rhsBCKernel = mesh->device.buildKernel(fileName,kernelName, kernelInfo);
-	
+
 	sprintf(fileName, DELLIPTIC "/okl/ellipticAddBC%s.okl", suffix);
 	sprintf(kernelName, "ellipticAddBC%s", suffix);
-	
+
 	elliptic->addBCKernel = mesh->device.buildKernel(fileName,kernelName, kernelInfo);
       }
       MPI_Barrier(mesh->comm);
     }
-    
+
     dfloat zero = 0.f;
     elliptic->rhsBCKernel(mesh->Nelements,
                         mesh->o_ggeo,
@@ -217,7 +218,7 @@ elliptic_t *ellipticSetup(mesh_t *mesh, dfloat lambda, occa::properties &kernelI
 
   // gather-scatter
   if(options.compareArgs("DISCRETIZATION","CONTINUOUS")){
-    ogsGatherScatter(elliptic->o_r, ogsDfloat, ogsAdd, mesh->ogs);  
+    ogsGatherScatter(elliptic->o_r, ogsDfloat, ogsAdd, mesh->ogs);
     if (elliptic->Nmasked) mesh->maskKernel(elliptic->Nmasked, elliptic->o_maskIds, elliptic->o_r);
   }
 
diff --git a/solvers/elliptic/src/ellipticSmoother.c b/solvers/elliptic/src/ellipticSmoother.c
deleted file mode 100644
index 6441e9d30..000000000
--- a/solvers/elliptic/src/ellipticSmoother.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "elliptic.h"
-
-void ellipticMultigridSmooth(void **args, occa::memory &o_r, occa::memory &o_x, bool xIsZero) {
-
-  elliptic_t *elliptic = (elliptic_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  occa::memory o_res = level->o_smootherResidual;
-
-  if (xIsZero) {
-    level->device_smoother(level->smootherArgs, o_r, o_x);
-    return;
-  }
-
-  dfloat one = 1.; dfloat mone = -1.;
-
-  //res = r-Ax
-  level->device_Ax(level->AxArgs,o_x,o_res);
-  elliptic->scaledAddKernel(level->Nrows,one, o_r, mone, o_res);
-
-  //smooth the fine problem x = x + S(r-Ax)
-  level->device_smoother(level->smootherArgs, o_res, o_res);
-  elliptic->scaledAddKernel(level->Nrows,one, o_res, one, o_x);
-}
-
-void ellipticMultigridSmoothChebyshev(void **args, occa::memory &o_r, occa::memory &o_x, bool xIsZero) {
-
-  elliptic_t *elliptic = (elliptic_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  dfloat lambdaN = level->smoother_params[0];
-  dfloat lambda1 = level->smoother_params[1];
-
-  dfloat theta = 0.5*(lambdaN+lambda1);
-  dfloat delta = 0.5*(lambdaN-lambda1);
-  dfloat invTheta = 1.0/theta;
-  dfloat sigma = theta/delta;
-  dfloat rho_n = 1./sigma;
-  dfloat rho_np1;
-
-  dfloat one = 1., mone = -1., zero = 0.0;
-
-  occa::memory o_res = level->o_smootherResidual;
-  occa::memory o_Ad  = level->o_smootherResidual2;
-  occa::memory o_d   = level->o_smootherUpdate;
-
-  if(xIsZero){ //skip the Ax if x is zero
-    //res = Sr
-    level->device_smoother(level->smootherArgs, o_r, o_res);
-
-    //d = invTheta*res
-    elliptic->scaledAddKernel(level->Nrows, invTheta, o_res, zero, o_d);
-  } else {
-    //res = S(r-Ax)
-    level->device_Ax(level->AxArgs,o_x,o_res);
-    elliptic->scaledAddKernel(level->Nrows, one, o_r, mone, o_res);
-    level->device_smoother(level->smootherArgs, o_res, o_res);
-
-    //d = invTheta*res
-    elliptic->scaledAddKernel(level->Nrows, invTheta, o_res, zero, o_d);
-  }
-
-  for (int k=0;k<level->ChebyshevIterations;k++) {
-    //x_k+1 = x_k + d_k
-    if (xIsZero&&(k==0))
-      elliptic->scaledAddKernel(level->Nrows, one, o_d, zero, o_x);
-    else
-      elliptic->scaledAddKernel(level->Nrows, one, o_d, one, o_x);
-
-    //r_k+1 = r_k - SAd_k
-    level->device_Ax(level->AxArgs,o_d,o_Ad);
-    level->device_smoother(level->smootherArgs, o_Ad, o_Ad);
-    elliptic->scaledAddKernel(level->Nrows, mone, o_Ad, one, o_res);
-
-    rho_np1 = 1.0/(2.*sigma-rho_n);
-    dfloat rhoDivDelta = 2.0*rho_np1/delta;
-
-    //d_k+1 = rho_k+1*rho_k*d_k  + 2*rho_k+1*r_k+1/delta
-    elliptic->scaledAddKernel(level->Nrows, rhoDivDelta, o_res, rho_np1*rho_n, o_d);
-
-    rho_n = rho_np1;
-  }
-  //x_k+1 = x_k + d_k
-  elliptic->scaledAddKernel(level->Nrows, one, o_d, one, o_x);
-
-}
-
-void LocalPatch(void **args, occa::memory &o_r, occa::memory &o_Sr) {
-
-  elliptic_t *elliptic = (elliptic_t*) args[0];
-  mesh_t *mesh = elliptic->mesh;
-  precon_t *precon = elliptic->precon;
-
-  occaTimerTic(mesh->device,"approxBlockJacobiSolveKernel");
-  precon->approxBlockJacobiSolverKernel(mesh->Nelements,
-                            precon->o_patchesIndex,
-                            precon->o_invAP,
-                            precon->o_invDegreeAP,
-                            o_r,
-                            o_Sr);
-  occaTimerToc(mesh->device,"approxBlockJacobiSolveKernel");
-}
-
-void dampedJacobi(void **args, occa::memory &o_r, occa::memory &o_Sr) {
-
-  elliptic_t *elliptic = (elliptic_t *) args[0];
-  mesh_t *mesh = elliptic->mesh;
-
-  occa::memory o_invDiagA = elliptic->precon->o_invDiagA;
-
-  elliptic->dotMultiplyKernel(mesh->Np*mesh->Nelements,o_invDiagA,o_r,o_Sr);
-}
\ No newline at end of file
diff --git a/solvers/elliptic/src/ellipticSmootherSetup.c b/solvers/elliptic/src/ellipticSmootherSetup.c
deleted file mode 100644
index 3202a68aa..000000000
--- a/solvers/elliptic/src/ellipticSmootherSetup.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "elliptic.h"
-
-typedef struct{
-
-  dlong localId;
-  hlong baseId;
-  int haloFlag;
-
-} preconGatherInfo_t;
-
-int parallelCompareBaseId(const void *a, const void *b){
-
-  preconGatherInfo_t *fa = (preconGatherInfo_t*) a;
-  preconGatherInfo_t *fb = (preconGatherInfo_t*) b;
-
-  if(fa->baseId < fb->baseId) return -1;
-  if(fa->baseId > fb->baseId) return +1;
-
-  return 0;
-}
-
-void ellipticSetupSmootherLocalPatch(elliptic_t *elliptic, precon_t *precon, 
-                                      agmgLevel *level, dfloat lambda, 
-                                      dfloat rateTolerance) {
-
-  dfloat *invAP;
-  dlong Npatches;
-  dlong *patchesIndex;
-
-  mesh_t *mesh = elliptic->mesh;
-  setupAide options = elliptic->options;
-
-  int NpP = mesh->Np;
-
-  //initialize the full inverse operators on each 4 element patch
-  ellipticBuildLocalPatches(elliptic, lambda, rateTolerance, &Npatches, &patchesIndex, &invAP);
-
-  precon->o_invAP = mesh->device.malloc(Npatches*NpP*NpP*sizeof(dfloat),invAP);
-  precon->o_patchesIndex = mesh->device.malloc(mesh->Nelements*sizeof(dlong), patchesIndex);
-
-  dfloat *invDegree = (dfloat*) calloc(mesh->Nelements,sizeof(dfloat));
-  for (dlong e=0;e<mesh->Nelements;e++) {
-    invDegree[e] = 1.0;
-  }
-  precon->o_invDegreeAP = mesh->device.malloc(mesh->Nelements*sizeof(dfloat),invDegree);
-
-  level->device_smoother = LocalPatch;
-
-  //estimate the max eigenvalue of S*A
-  dfloat rho = maxEigSmoothAx(elliptic, level);
-
-  if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) {
-
-    level->smoother_params = (dfloat *) calloc(2,sizeof(dfloat));
-
-    level->smoother_params[0] = rho;
-    level->smoother_params[1] = rho/10.;
-
-  } else {
-
-    //set the stabilty weight (jacobi-type interation)
-    dfloat weight = (4./3.)/rho;
-
-    for (dlong e=0;e<mesh->Nelements;e++)
-      invDegree[e] *= weight;
-
-    //update with weight
-    precon->o_invDegreeAP.copyFrom(invDegree);
-  }
-  free(invDegree);
-}
-
-void ellipticSetupSmootherDampedJacobi(elliptic_t *elliptic, precon_t *precon, 
-                                       agmgLevel *level, dfloat lambda) {
-
-  dfloat *invDiagA;
-  mesh_t *mesh = elliptic->mesh;
-  setupAide options = elliptic->options;
-
-  ellipticBuildJacobi(elliptic,lambda, &invDiagA);
-
-  precon->o_invDiagA = mesh->device.malloc(mesh->Np*mesh->Nelements*sizeof(dfloat), invDiagA);
-    
-  level->device_smoother = dampedJacobi;
-
-  //estimate the max eigenvalue of S*A
-  dfloat rho = maxEigSmoothAx(elliptic, level);
-
-  if (options.compareArgs("MULTIGRID SMOOTHER","CHEBYSHEV")) {
-
-    level->smoother_params = (dfloat *) calloc(2,sizeof(dfloat));
-
-    level->smoother_params[0] = rho;
-    level->smoother_params[1] = rho/10.;
-
-  } else {
-
-    //set the stabilty weight (jacobi-type interation)
-    dfloat weight = (4./3.)/rho;
-
-    for (dlong n=0;n<mesh->Np*mesh->Nelements;n++)
-      invDiagA[n] *= weight;
-
-    //update diagonal with weight
-    precon->o_invDiagA.copyFrom(invDiagA);
-  }
-
-  free(invDiagA);
-}
-
-static void eig(const int Nrows, double *A, double *WR, double *WI){
-
-  int NB  = 256;
-  char JOBVL  = 'V';
-  char JOBVR  = 'V';
-  int     N = Nrows;
-  int   LDA = Nrows;
-  int  LWORK  = (NB+2)*N;
-
-  double *WORK  = new double[LWORK];
-  double *VL  = new double[Nrows*Nrows];
-  double *VR  = new double[Nrows*Nrows];
-
-  int INFO = -999;
-
-  dgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI,
-    VL, &LDA, VR, &LDA, WORK, &LWORK, &INFO);
-
-
-  assert(INFO == 0);
-
-  delete [] VL;
-  delete [] VR;
-  delete [] WORK;
-}
-
-dfloat maxEigSmoothAx(elliptic_t* elliptic, agmgLevel *level){
-
-  mesh_t *mesh = elliptic->mesh;
-  setupAide options = elliptic->options;
-
-  const dlong N = level->Nrows;
-  const dlong M = level->Ncols;
-
-  int k = 10;
-
-  hlong Nlocal = (hlong) level->Nrows;
-  hlong Ntotal = 0;
-  MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, mesh->comm);
-  if(k > Ntotal) k = (int) Ntotal;
-
-  // do an arnoldi
-
-  // allocate memory for Hessenberg matrix
-  double *H = (double *) calloc(k*k,sizeof(double));
-
-  // allocate memory for basis
-  dfloat *Vx = (dfloat*) calloc(M, sizeof(dfloat));
-  occa::memory *o_V = (occa::memory *) calloc(k+1, sizeof(occa::memory));
-  
-  occa::memory o_Vx  = mesh->device.malloc(M*sizeof(dfloat),Vx);
-  occa::memory o_AVx = mesh->device.malloc(M*sizeof(dfloat),Vx);
-
-  for(int i=0; i<=k; i++)
-    o_V[i] = mesh->device.malloc(M*sizeof(dfloat),Vx);
-
-  // generate a random vector for initial basis vector
-  for (dlong i=0;i<N;i++) Vx[i] = (dfloat) drand48(); 
-
-  //gather-scatter 
-  if (options.compareArgs("DISCRETIZATION","CONTINUOUS")) {
-    ogsGatherScatter(Vx, ogsDfloat, ogsAdd, mesh->ogs); 
-  
-    for (dlong i=0;i<elliptic->Nmasked;i++) Vx[elliptic->maskIds[i]] = 0.;
-  }
-
-  o_Vx.copyFrom(Vx); //copy to device
-  dfloat norm_vo = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_Vx, o_Vx);
-  norm_vo = sqrt(norm_vo);
-
-  ellipticScaledAdd(elliptic, 1./norm_vo, o_Vx, 0. , o_V[0]);
-
-  for(int j=0; j<k; j++){
-    // v[j+1] = invD*(A*v[j])
-    level->device_Ax(level->AxArgs,o_V[j],o_AVx);
-    level->device_smoother(level->smootherArgs, o_AVx, o_V[j+1]);
-
-    // modified Gram-Schmidth
-    for(int i=0; i<=j; i++){
-      // H(i,j) = v[i]'*A*v[j]
-      dfloat hij = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_V[i], o_V[j+1]);
-
-      // v[j+1] = v[j+1] - hij*v[i]
-      ellipticScaledAdd(elliptic, -hij, o_V[i], 1., o_V[j+1]);
-
-      H[i + j*k] = (double) hij;
-    }
-
-    if(j+1 < k){
-      // v[j+1] = v[j+1]/||v[j+1]||
-      dfloat norm_vj = ellipticWeightedInnerProduct(elliptic, elliptic->o_invDegree, o_V[j+1], o_V[j+1]);
-      norm_vj = sqrt(norm_vj);
-      ellipticScaledAdd(elliptic, 1/norm_vj, o_V[j+1], 0., o_V[j+1]);
-      
-      H[j+1+ j*k] = (double) norm_vj;
-    }
-  }
-
-  double *WR = (double *) calloc(k,sizeof(double));
-  double *WI = (double *) calloc(k,sizeof(double));
-
-  eig(k, H, WR, WI);
-
-  double rho = 0.;
-
-  for(int i=0; i<k; i++){
-    double rho_i  = sqrt(WR[i]*WR[i] + WI[i]*WI[i]);
-
-    if(rho < rho_i) {
-      rho = rho_i;
-    }
-  }
-
-  // free memory
-  free(H);
-  free(WR);
-  free(WI);
-
-  free(Vx);
-  o_Vx.free();
-  o_AVx.free();
-  for(int i=0; i<=k; i++) o_V[i].free();
-  free((void*)o_V);
-
-  if((mesh->rank==0)&&(options.compareArgs("VERBOSE","TRUE"))) printf("weight = %g \n", rho);
-
-  return rho;
-}
diff --git a/solvers/elliptic/src/ellipticSolveSetup.c b/solvers/elliptic/src/ellipticSolveSetup.c
index 566e59ba7..7c53e5cce 100644
--- a/solvers/elliptic/src/ellipticSolveSetup.c
+++ b/solvers/elliptic/src/ellipticSolveSetup.c
@@ -38,7 +38,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
     MPI_Finalize();
     exit(-1);
   }
-  if (options.compareArgs("PRECONDITIONER","MASSMATRIX") && elliptic->elementType!=TRIANGLES 
+  if (options.compareArgs("PRECONDITIONER","MASSMATRIX") && elliptic->elementType!=TRIANGLES
                                                          && elliptic->elementType!=TETRAHEDRA ) {
     printf("ERROR: MASSMATRIX preconditioner is only available for triangle and tetrhedra elements. Use JACOBI instead.\n");
     MPI_Finalize();
@@ -56,11 +56,11 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
   dlong Nall   = Ntotal + Nhalo;
 
   dlong Nblock2 = mymax(1,(Nblock+blockSize-1)/blockSize);
-  
+
   //tau
   if (elliptic->elementType==TRIANGLES || elliptic->elementType==QUADRILATERALS)
     elliptic->tau = 2.0*(mesh->N+1)*(mesh->N+2)/2.0;
-  else 
+  else
     elliptic->tau = 2.0*(mesh->N+1)*(mesh->N+3);
 
   elliptic->p   = (dfloat*) calloc(Nall,   sizeof(dfloat));
@@ -97,19 +97,19 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
     elliptic->sendBuffer = (dfloat*) o_sendBuffer.getMappedPointer();
     elliptic->recvBuffer = (dfloat*) o_recvBuffer.getMappedPointer();
 
-    
+
     occa::memory o_gradSendBuffer = mesh->device.mappedAlloc(2*Nbytes, NULL);
     occa::memory o_gradRecvBuffer = mesh->device.mappedAlloc(2*Nbytes, NULL);
 
     elliptic->gradSendBuffer = (dfloat*) o_gradSendBuffer.getMappedPointer();
     elliptic->gradRecvBuffer = (dfloat*) o_gradRecvBuffer.getMappedPointer();
 #endif
-    
+
     elliptic->sendBuffer = (dfloat*) occaHostMallocPinned(mesh->device, Nbytes, NULL, elliptic->o_sendBuffer);
     elliptic->recvBuffer = (dfloat*) occaHostMallocPinned(mesh->device, Nbytes, NULL, elliptic->o_recvBuffer);
     elliptic->gradSendBuffer = (dfloat*) occaHostMallocPinned(mesh->device, 2*Nbytes, NULL, elliptic->o_gradSendBuffer);
     elliptic->gradRecvBuffer = (dfloat*) occaHostMallocPinned(mesh->device, 2*Nbytes, NULL, elliptic->o_gradRecvBuffer);
-    
+
   }else{
     elliptic->sendBuffer = NULL;
     elliptic->recvBuffer = NULL;
@@ -176,13 +176,13 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
       }
     }
   }
-  
 
-  // !!!!!! Removed MPI::BOOL since some mpi versions complains about it !!!!! 
-  int lallNeumann, gallNeumann; 
-  lallNeumann = allNeumann ? 0:1; 
+
+  // !!!!!! Removed MPI::BOOL since some MPI versions complain about it !!!!!
+  int lallNeumann, gallNeumann;
+  lallNeumann = allNeumann ? 0:1;
   MPI_Allreduce(&lallNeumann, &gallNeumann, 1, MPI_INT, MPI_SUM, mesh->comm);
-  elliptic->allNeumann = (gallNeumann>0) ? false: true; 
+  elliptic->allNeumann = (gallNeumann>0) ? false: true;
 
   // MPI_Allreduce(&allNeumann, &(elliptic->allNeumann), 1, MPI::BOOL, MPI_LAND, mesh->comm);
   if (mesh->rank==0&& options.compareArgs("VERBOSE","TRUE")) printf("allNeumann = %d \n", elliptic->allNeumann);
@@ -204,16 +204,16 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
   elliptic->o_EToB = mesh->device.malloc(mesh->Nelements*mesh->Nfaces*sizeof(int), elliptic->EToB);
 
 #if 0
-  if (mesh->rank==0 && options.compareArgs("VERBOSE","TRUE")) 
+  if (mesh->rank==0 && options.compareArgs("VERBOSE","TRUE"))
     occa::setVerboseCompilation(true);
-  else 
+  else
     occa::setVerboseCompilation(false);
 #endif
 
   //setup an unmasked gs handle
   int verbose = options.compareArgs("VERBOSE","TRUE") ? 1:0;
   meshParallelGatherScatterSetup(mesh, Ntotal, mesh->globalIds, mesh->comm, verbose);
-  
+
   //make a node-wise bc flag using the gsop (prioritize Dirichlet boundaries over Neumann)
   elliptic->mapB = (int *) calloc(mesh->Nelements*mesh->Np,sizeof(int));
   for (dlong e=0;e<mesh->Nelements;e++) {
@@ -229,7 +229,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
       }
     }
   }
-  ogsGatherScatter(elliptic->mapB, ogsInt, ogsMin, mesh->ogs); 
+  ogsGatherScatter(elliptic->mapB, ogsInt, ogsMin, mesh->ogs);
 
   //use the bc flags to find masked ids
   elliptic->Nmasked = 0;
@@ -241,7 +241,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
     }
   }
   elliptic->o_mapB = mesh->device.malloc(mesh->Nelements*mesh->Np*sizeof(int), elliptic->mapB);
-  
+
   elliptic->maskIds = (dlong *) calloc(elliptic->Nmasked, sizeof(dlong));
   elliptic->Nmasked =0; //reset
   for (dlong n=0;n<mesh->Nelements*mesh->Np;n++) {
@@ -252,16 +252,16 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
   //make a masked version of the global id numbering
   mesh->maskedGlobalIds = (hlong *) calloc(Ntotal,sizeof(hlong));
   memcpy(mesh->maskedGlobalIds, mesh->globalIds, Ntotal*sizeof(hlong));
-  for (dlong n=0;n<elliptic->Nmasked;n++) 
+  for (dlong n=0;n<elliptic->Nmasked;n++)
     mesh->maskedGlobalIds[elliptic->maskIds[n]] = 0;
 
   //use the masked ids to make another gs handle
   elliptic->ogs = ogsSetup(Ntotal, mesh->maskedGlobalIds, mesh->comm, verbose, mesh->device);
   elliptic->o_invDegree = elliptic->ogs->o_invDegree;
-  
+
   /*preconditioner setup */
   elliptic->precon = (precon_t*) calloc(1, sizeof(precon_t));
-  
+
   kernelInfo["parser/" "automate-add-barriers"] =  "disabled";
 
   if(mesh->device.mode()=="CUDA"){ // add backend compiler optimization for CUDA
@@ -273,7 +273,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
 
   // set kernel name suffix
   char *suffix;
-  
+
   if(elliptic->elementType==TRIANGLES)
     suffix = strdup("Tri2D");
   if(elliptic->elementType==QUADRILATERALS)
@@ -289,7 +289,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
   for (int r=0;r<mesh->size;r++) {
     if (r==mesh->rank) {
 
-      //mesh kernels 
+      //mesh kernels
       mesh->haloExtractKernel =
         mesh->device.buildKernel(DHOLMES "/okl/meshHaloExtract2D.okl",
                                        "meshHaloExtract2D",
@@ -307,7 +307,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
 
 
       kernelInfo["defines/" "p_blockSize"]= blockSize;
-      
+
 
       mesh->sumKernel =
         mesh->device.buildKernel(DHOLMES "/okl/sum.okl",
@@ -338,8 +338,8 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
         mesh->device.buildKernel(DHOLMES "/okl/norm2.okl",
                                            "norm2",
                                            kernelInfo);
-      
-      
+
+
       elliptic->scaledAddKernel =
           mesh->device.buildKernel(DHOLMES "/okl/scaledAdd.okl",
                                          "scaledAdd",
@@ -354,7 +354,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
           mesh->device.buildKernel(DHOLMES "/okl/dotDivide.okl",
                                          "dotDivide",
                                          kernelInfo);
-      
+
       // add custom defines
       kernelInfo["defines/" "p_NpP"]= (mesh->Np+mesh->Nfp*mesh->Nfaces);
       kernelInfo["defines/" "p_Nverts"]= mesh->Nverts;
@@ -412,7 +412,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
       dfloatKernelInfo["defines/" "pfloat"]= dfloatString;
 
       elliptic->AxKernel = mesh->device.buildKernel(fileName,kernelName,dfloatKernelInfo);
-      
+
       if(elliptic->elementType!=HEXAHEDRA){
         sprintf(kernelName, "ellipticPartialAx%s", suffix);
       }
@@ -427,7 +427,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
       elliptic->partialAxKernel = mesh->device.buildKernel(fileName,kernelName,dfloatKernelInfo);
 
       elliptic->partialFloatAxKernel = mesh->device.buildKernel(fileName,kernelName,floatKernelInfo);
-      
+
       if (options.compareArgs("BASIS","BERN")) {
 
         sprintf(fileName, DELLIPTIC "/okl/ellipticGradientBB%s.okl", suffix);
@@ -437,14 +437,14 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
 
         sprintf(kernelName, "ellipticPartialGradientBB%s", suffix);
         elliptic->partialGradientKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-      
+
         sprintf(fileName, DELLIPTIC "/okl/ellipticAxIpdgBB%s.okl", suffix);
         sprintf(kernelName, "ellipticAxIpdgBB%s", suffix);
         elliptic->ipdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
 
         sprintf(kernelName, "ellipticPartialAxIpdgBB%s", suffix);
         elliptic->partialIpdgKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
-          
+
       } else if (options.compareArgs("BASIS","NODAL")) {
 
         sprintf(fileName, DELLIPTIC "/okl/ellipticGradient%s.okl", suffix);
@@ -482,7 +482,7 @@ void ellipticSolveSetup(elliptic_t *elliptic, dfloat lambda, occa::properties &k
       sprintf(kernelName, "ellipticApproxBlockJacobiSolver");
       elliptic->precon->approxBlockJacobiSolverKernel = mesh->device.buildKernel(fileName,kernelName,kernelInfo);
 
-      if (   elliptic->elementType == TRIANGLES 
+      if (   elliptic->elementType == TRIANGLES
           || elliptic->elementType == TETRAHEDRA) {
         elliptic->precon->SEMFEMInterpKernel =
           mesh->device.buildKernel(DELLIPTIC "/okl/ellipticSEMFEMInterp.okl",
diff --git a/solvers/ins/makefile b/solvers/ins/makefile
index ed9ac8158..ad439ae39 100644
--- a/solvers/ins/makefile
+++ b/solvers/ins/makefile
@@ -4,7 +4,7 @@ ERROR:
 	@echo "Error, environment variable [OCCA_DIR] is not set"
 endif
 
-CXXFLAGS = 
+CXXFLAGS =
 
 include ${OCCA_DIR}/scripts/Makefile
 
@@ -12,7 +12,7 @@ include ${OCCA_DIR}/scripts/Makefile
 HDRDIR  = ../../include
 GSDIR  = ../../3rdParty/gslib
 OGSDIR  = ../../libs/gatherScatter
-ALMONDDIR = ../parALMOND
+ALMONDDIR = ../../libs/parAlmond
 ELLIPTICDIR = ../elliptic
 
 # set options for this machine
@@ -22,16 +22,16 @@ CC	= mpic++
 LD	= mpic++
 
 # compiler flags to be used (set to compile with debugging on)
-CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -I$(ELLIPTICDIR) -g  -D DHOLMES='"${CURDIR}/../.."' -D DINS='"${CURDIR}"'
+CFLAGS = -I. -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -I$(HDRDIR) -I$(OGSDIR) -I$(ELLIPTICDIR) -I$(ALMONDDIR) -g  -D DHOLMES='"${CURDIR}/../.."' -D DINS='"${CURDIR}"'
 
-# link flags to be used 
+# link flags to be used
 LDFLAGS	= -DOCCA_VERSION_1_0 $(compilerFlags) $(flags) -g
 
 # libraries to be linked in
-LIBS	=  -L$(ELLIPTICDIR) -lelliptic -L$(ALMONDDIR) -lparALMOND  \
+LIBS	=  -L$(ELLIPTICDIR) -lelliptic -L$(ALMONDDIR) -lparAlmond  \
 		   -L$(OGSDIR) -logs -L$(GSDIR)/lib  -lgs \
 		   -L$(OCCA_DIR)/lib $(links) -L../../3rdParty/BlasLapack -lBlasLapack -lgfortran \
-			
+
 
 INCLUDES = ins.h
 DEPS = $(INCLUDES) \
@@ -39,12 +39,13 @@ $(HDRDIR)/mesh.h \
 $(HDRDIR)/mesh2D.h \
 $(HDRDIR)/mesh3D.h \
 $(OGSDIR)/ogs.hpp \
-$(ALMONDDIR)/parALMOND.h \
+$(ALMONDDIR)/parAlmond.hpp \
 $(ELLIPTICDIR)/elliptic.h \
-$(ELLIPTICDIR)/ellipticPrecon.h  
+$(ELLIPTICDIR)/ellipticPrecon.h \
+$(ELLIPTICDIR)/ellipticMultiGrid.h
 
 # types of files we are going to construct rules for
-.SUFFIXES: .c 
+.SUFFIXES: .c
 
 # rule for .c files
 .c.o: $(DEPS)
@@ -75,7 +76,7 @@ AOBJS    = \
 ./src/insPressureUpdate.o \
 ./src/insRestart.o \
 ./src/insWeldTriVerts.o \
-./src/insIsoPlotVTU.o    
+./src/insIsoPlotVTU.o
 
 # library objects
 LOBJS = \
@@ -136,7 +137,7 @@ LOBJS = \
 ../../src/occaHostMallocPinned.o \
 ../../src/timer.o
 
-insMain:$(AOBJS) $(LOBJS) ./src/insMain.o libblas libogs libparALMOND libelliptic
+insMain:$(AOBJS) $(LOBJS) ./src/insMain.o libblas libogs libparAlmond libelliptic
 	$(LD)  $(LDFLAGS)  -o insMain ./src/insMain.o $(COBJS) $(AOBJS) $(LOBJS) $(paths) $(LIBS)
 
 lib:$(AOBJS)
@@ -148,8 +149,8 @@ libogs:
 libblas:
 	cd ../../3rdParty/BlasLapack; make -j lib; cd ../../solvers/ins
 
-libparALMOND:
-	cd ../parALMOND; make -j lib; cd ../ins
+libparAlmond: # builds the 'lib' target of the makefile in libs/parAlmond
+	cd ../../libs/parAlmond && $(MAKE) -j lib
 
 libelliptic:
 	cd ../elliptic; make -j lib; cd ../ins
diff --git a/solvers/parALMOND/include/agmg.h b/solvers/parALMOND/include/agmg.h
deleted file mode 100644
index cb99d0c6d..000000000
--- a/solvers/parALMOND/include/agmg.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#ifndef AGMG_H
-#define AGMG_H 1
-
-#ifdef OCCA_VERSION_1_0
-#include "occa/modes/opencl/utils.hpp"
-#endif
-
-#include "mesh.h"
-
-#include "parAlmond.h"
-#include "agmgLevel.h"
-#include "agmgMatrices.h"
-#include "vectorPrimitives.h"
-#include <mpi.h>
-
-#define AGMGBDIM 32 //block size
-#define SIMDWIDTH 32 //width of simd blocks
-#define MAX_LEVELS 100
-#define GPU_CPU_SWITCH_SIZE 0 //host-device switch threshold
-
-#define RDIMX 32
-#define RDIMY 8
-#define RLOAD 1
-
-
-void agmgSetup(parAlmond_t *parAlmond, csr *A, dfloat *nullA, hlong *globalRowStarts, setupAide options);
-void parAlmondReport(parAlmond_t *parAlmond);
-void buildAlmondKernels(parAlmond_t *parAlmond);
-
-void kcycle(parAlmond_t *parAlmond, int k);
-void device_kcycle(parAlmond_t *parAlmond, int k);
-
-void vcycle(parAlmond_t *parAlmond, int k);
-void device_vcycle(parAlmond_t *parAlmond, int k);
-
-void pgmres(parAlmond_t *parAlmond, int maxIt, dfloat tol);
-void device_pgmres(parAlmond_t *parAlmond, int maxIt, dfloat tol);
-
-void pcg(parAlmond_t *parAlmond, int maxIt, dfloat tol);
-void device_pcg(parAlmond_t *parAlmond, int maxIt, dfloat tol);
-
-namespace agmg {
-  extern int rank;
-  extern int size;
-  extern MPI_Comm comm;
-};
-#endif
diff --git a/solvers/parALMOND/include/agmgLevel.h b/solvers/parALMOND/include/agmgLevel.h
deleted file mode 100644
index 39fd92ff3..000000000
--- a/solvers/parALMOND/include/agmgLevel.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-
-void agmgAx        (void **args, dfloat *x, dfloat *Ax);
-void agmgCoarsen   (void **args, dfloat *r, dfloat *Rr);
-void agmgProlongate(void **args, dfloat *x, dfloat *Px);
-void agmgSmooth    (void **args, dfloat *rhs, dfloat *x, bool x_is_zero);
-
-void device_agmgAx        (void **args, occa::memory &o_x, occa::memory &o_Ax);
-void device_agmgCoarsen   (void **args, occa::memory &o_r, occa::memory &o_Rr);
-void device_agmgProlongate(void **args, occa::memory &o_x, occa::memory &o_Px);
-void device_agmgSmooth    (void **args, occa::memory &o_r, occa::memory &o_x, bool x_is_zero);
-
-void setupSmoother(parAlmond_t *parAlmond, agmgLevel *level, SmoothType s);
-void setupExactSolve(parAlmond_t *parAlmond, agmgLevel *level, bool nullSpace, dfloat nullSpacePenalty);
-void exactCoarseSolve(parAlmond_t *parAlmond, int N, dfloat *rhs, dfloat *x);
-void device_exactCoarseSolve(parAlmond_t *parAlmond, int N, occa::memory o_rhs, occa::memory o_x);
diff --git a/solvers/parALMOND/include/agmgMatrices.h b/solvers/parALMOND/include/agmgMatrices.h
deleted file mode 100644
index 5b5c663e6..000000000
--- a/solvers/parALMOND/include/agmgMatrices.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-
-//creators
-csr * newCSRfromCOO(dlong N, hlong* globalRowStarts,
-            		dlong NNZ,   hlong *Ai, hlong *Aj, dfloat *Avals);
-void freeCSR(csr *A);
-dcoo *newDCOO(parAlmond_t *parAlmond, csr *B);
-hyb * newHYB(parAlmond_t *parAlmond, csr *csrA);
-
-
-void axpy(csr *A, dfloat alpha, dfloat *x, dfloat beta, dfloat *y, bool nullSpace, dfloat nullSpacePenalty);
-
-void axpy(parAlmond_t *parAlmond, dcoo *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y);
-
-void axpy(parAlmond_t *parAlmond, hyb *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y, bool nullSpace, dfloat nullSpacePenalty);
-
-void axpy(parAlmond_t *parAlmond, ell *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y);
-
-void ax(parAlmond_t *parAlmond, coo *C, dfloat alpha, occa::memory o_x, occa::memory o_y);
-
-
-//smoothing
-void smoothJacobi      (parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero);
-void smoothDampedJacobi(parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero);
-void smoothChebyshev   (parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero);
-void smoothJacobi      (parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero);
-void smoothDampedJacobi(parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero);
-void smoothChebyshev   (parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero);
-
-//halo exchange
-void csrHaloSetup(csr *A, hlong *globalColStarts);
-void csrHaloExchange(csr *A, size_t Nbytes, void *sourceBuffer, void *sendBuffer, void *recvBuffer);
-void csrHaloExchangeStart(csr *A, size_t Nbytes, void *sourceBuffer, void *sendBuffer, void *recvBuffer);
-void csrHaloExchangeFinish(csr *A);
-void dcooHaloExchangeStart(dcoo *A, size_t Nbytes, void *sendBuffer, void *recvBuffer);
-void dcooHaloExchangeFinish(dcoo *A);
-void hybHaloExchangeStart(hyb *A, size_t Nbytes, void *sendBuffer, void *recvBuffer);
-void hybHaloExchangeFinish(hyb *A);
diff --git a/solvers/parALMOND/include/vectorPrimitives.h b/solvers/parALMOND/include/vectorPrimitives.h
deleted file mode 100644
index 6be049f58..000000000
--- a/solvers/parALMOND/include/vectorPrimitives.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-
-
-dfloat innerProd(dlong n, dfloat *a, dfloat *b);
-
-void doubleInnerProd(dlong n, dfloat *aDotbc, dfloat *a, dfloat *b, dfloat *c);
-
-void kcycleCombinedOp1(dlong n, dfloat *aDotbc, dfloat *a, dfloat *b, dfloat *c, dfloat *w, bool weighted);
-
-void kcycleCombinedOp2(dlong n, dfloat *aDotbcd, dfloat *a, dfloat *b, dfloat *c, dfloat* d, dfloat *w, bool weighted);
-
-void vectorAdd(dlong n, dfloat alpha, dfloat *x, dfloat beta, dfloat *y);
-
-dfloat vectorAddInnerProd(dlong n, dfloat alpha, dfloat *x, dfloat beta, dfloat *y, dfloat *w, bool weighted);
-
-void dotStar(dlong m, dfloat *a, dfloat *b);
-
-void scaleVector(dlong m, dfloat *a, dfloat alpha);
-
-void setVector(dlong m, dfloat *a, dfloat alpha);
-
-dfloat sumVector(dlong m, dfloat *a);
-
-void addScalar(dlong m, dfloat alpha, dfloat *a);
-
-void randomize(dlong m, dfloat *a);
-
-dfloat maxEntry(dlong n, dfloat *a);
-
-void scaleVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a, dfloat alpha);
-
-void setVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a, dfloat alpha);
-
-dfloat sumVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a);
-
-void addScalar(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_a);
-
-void dotStar(parAlmond_t *parAlmond, dlong N, occa::memory o_a, occa::memory o_b);
-
-void dotStar(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_a,
-	           occa::memory o_b, dfloat beta, occa::memory o_c);
-
-dfloat innerProd(parAlmond_t *parAlmond, dlong N, occa::memory o_x, occa::memory o_y);
-
-// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b,
-void kcycleCombinedOp1(parAlmond_t *parAlmond, dlong n, dfloat *aDotbc, occa::memory o_a,
-                                        occa::memory o_b, occa::memory o_c, occa::memory o_w, bool weighted);
-
-// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d,
-void kcycleCombinedOp2(parAlmond_t *parAlmond, dlong n, dfloat *aDotbcd, occa::memory o_a,
-                                              occa::memory o_b, occa::memory o_c, occa::memory o_d,
-                                              occa::memory o_w, bool weighted);
-
-// y = beta*y + alpha*x, and return y\dot y
-dfloat vectorAddInnerProd(parAlmond_t *parAlmond, dlong n, dfloat alpha, occa::memory o_x,
-                                                          dfloat beta, occa::memory o_y,
-                                                          occa::memory o_w, bool weighted);
-
-void vectorAdd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y);
-
-void vectorAdd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x,
-	 dfloat beta, occa::memory o_y, occa::memory o_z);
diff --git a/solvers/parALMOND/makefile b/solvers/parALMOND/makefile
deleted file mode 100644
index 385eb8d5d..000000000
--- a/solvers/parALMOND/makefile
+++ /dev/null
@@ -1,42 +0,0 @@
-sDir = ./src
-iDir = ./include
-objDir = ./
-
-include ${OCCA_DIR}/scripts/Makefile
-
-sources = $(wildcard $(sDir)/*c)
-includes = $(wildcard $(iDir)/*h)
-objects = $(subst $(sDir)/,$(objDir)/,$(sources:.c=.o))
-deps = $(includes) \
-../../include/mesh.h \
-../../libs/gatherScatter/ogs.hpp \
-../../include/parAlmond.h
-
-flags = -DOCCA_VERSION_1_0 -I${OCCA_DIR}/include -I$(iDir) -I../../include -I../../libs/gatherScatter
-libs   =  -L${OCCA_DIR}/lib -locca -llapack -lblas
-
-flags  += -D DPWD='"${CURDIR}"'
-CC =  mpic++
-#flags += -fopenmp
-
-# Debug Option
-ifeq ($(DEBUG), 1)
-flags += -g
-else
-# <> For debugging purposes
-flags += -O3 -DNDEBUG  -fopenmp
-endif
-
-#flags += -DINS_MPI=$(INS_MPI) -DINS_RENDER=$(INS_RENDER) -DINS_CLUSTER=$(INS_CLUSTER)
-
-all: lib
-
-lib: $(objects) $(deps)
-	ar -cr libparALMOND.a $(objects)
-
-$(objDir)/%.o:$(sDir)/%.c $(deps) #$(wildcard $(subst $(sDir),$(iDir),$(<:.cpp=.hpp)))
-	$(CC) $(flags) -o $@ $(libs) -c $< $(paths)
-
-clean:
-	rm -f libparALMOND.a
-	rm -f $(objDir)/*.o
diff --git a/solvers/parALMOND/okl/agg_interpolate.okl b/solvers/parALMOND/okl/agg_interpolate.okl
deleted file mode 100644
index 98dd590ea..000000000
--- a/solvers/parALMOND/okl/agg_interpolate.okl
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-// Specialized interpolation operator for aggregation amg
-// Assumes that each row has exactly one non-zero, P_coefs at column index that is
-// stored in P_cols
-// y = y + P*x,  (Px is used to represent y in the @kernel)
-
-@kernel void agg_interpolate(const dlong   n,
-          @restrict const  dlong * P_rows,
-          @restrict const  dlong * P_cols,
-          @restrict const  dfloat * P_coefs,
-          @restrict const  dfloat * x,
-          @restrict dfloat * Px){
-
-  for(dlong i=0;i<n;++i;@tile(256,@outer,@inner)){
-    // 3 co-alesced access and 1 random access
-    if(i<n)
-      Px[P_rows[i]] += (P_coefs[i]*x[P_cols[i]]);
-  }
-}
diff --git a/solvers/parALMOND/okl/ellAXPY.okl b/solvers/parALMOND/okl/ellAXPY.okl
deleted file mode 100644
index e208ebc61..000000000
--- a/solvers/parALMOND/okl/ellAXPY.okl
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-@kernel void ellAXPY(const dlong   numRows,
-        const int             nnzPerRow,
-        const dlong             strideLength,
-        const dfloat           alpha,
-        const dfloat           beta,
-        @restrict const  dlong  * cols,
-        @restrict const  dfloat * coefs,
-        @restrict const  dfloat * x,
-              @restrict dfloat * y){
-
-  // y = alpha * A * x + beta * y
-  for(dlong i=0;i<numRows;++i;@tile(256,@outer,@inner)){
-
-    if(i < numRows){
-      const dfloat betay = beta*y[i];
-
-      dfloat result = 0.;
-      for(int c=0; c<nnzPerRow; c++){
-        // location in the memory
-        const dlong address = c*strideLength+i;
-
-        // access column index
-        const dlong col = cols[address];
-
-        if (col >-1) {
-          const dfloat coeffn = coefs[address];
-          const dfloat xn = x[col];
-
-          result += coeffn*xn;
-        }
-      }
-      y[i] = alpha*result + betay;//beta*y[row];
-    }
-  }
-}
-
-@kernel void ellZeqAXPY(const dlong   numRows,
-           const int   nnzPerRow,
-           const dlong   strideLength,
-           const dfloat   alpha,
-           const dfloat   beta,
-           @restrict const  dlong  * cols,
-           @restrict const  dfloat * coefs,
-           @restrict const  dfloat * x,
-           @restrict const  dfloat * y,
-                 @restrict dfloat * z){
-
-  // z = alpha * A * x + beta * y
-  for(dlong i=0;i<numRows;++i;@tile(256,@outer,@inner)){
-    if(i < numRows){
-      dfloat result = 0.;
-      for(int c=0; c<nnzPerRow; c++){
-        // location in the memory
-        const dlong address = c*strideLength+i;
-
-        // access column index
-        const dlong col = cols[address];
-
-        // dont access coefs[address] if col is -ve
-        if(col > -1) result += coefs[address]*x[col];
-      }
-      z[i] = alpha*result + beta*y[i];
-    }
-  }
-}
-
-@kernel void ellJacobi(const dlong   numRows,
-           const int   nnzPerRow,
-           const dlong   strideLength,
-           @restrict const  dlong  * cols,
-           @restrict const  dfloat * coefs,
-           @restrict const  dfloat * x,
-           @restrict const  dfloat * r,
-                 @restrict dfloat * z){
-
-  // z = r - (A-D)*x
-  for(dlong i=0;i<numRows;++i;@tile(256,@outer,@inner)){
-    if(i < numRows){
-      dfloat result = r[i];
-
-      // skip the first stride since it corresponds to diag
-      for(int c=1; c<nnzPerRow; c++){
-        // location in the memory
-        const dlong address = c*strideLength+i;
-
-        // access column index
-        const dlong col = cols[address];
-
-        // dont access coefs[address] or x[col] if col is -ve
-        if(col > -1) result -= coefs[address]*x[col];
-      }
-      z[i] = result;
-    }
-  }
-}
-
diff --git a/solvers/parALMOND/okl/kcycleCombinedOp.okl b/solvers/parALMOND/okl/kcycleCombinedOp.okl
deleted file mode 100644
index b073b7094..000000000
--- a/solvers/parALMOND/okl/kcycleCombinedOp.okl
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-
-// a.b, a.c, b.b
-@kernel void kcycleCombinedOp1Kernel(const dlong Nblocks,
-					      const dlong   N,
-					      @restrict const  dfloat * a,
-					      @restrict const  dfloat * b,
-					      @restrict const  dfloat * c,
-					            @restrict dfloat * ips){
-
-  for(dlong g=0;g<Nblocks;++g;@outer(0)){
-
-    @shared volatile dfloat s_ip[p_RDIMY][p_RDIMX];
-    @shared volatile dfloat s_res[p_RDIMY];
-
-    @exclusive dfloat abi, aci, bbi;
-    
-    for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){
-      for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){
-	      dlong i = tx + ty*p_RDIMX + g*p_RDIMX*p_RDIMY;
-
-      	abi = 0.f;
-      	aci = 0.f;
-      	bbi = 0.f;
-      	
-      	while(i<N){ // scan through whole array
-      	  const dfloat ai =  a[i];
-      	  const dfloat bi =  b[i];
-      	  const dfloat ci =  c[i];
-      	  abi += ai*bi;
-      	  aci += ai*ci;
-      	  bbi += bi*bi;
-      	  i += Nblocks*p_RDIMX*p_RDIMY; 
-      	}
-      }
-    }
-
-    twoPhaseReduction(abi, s_ip, s_res, ips[3*g+0]);
-    twoPhaseReduction(aci, s_ip, s_res, ips[3*g+1]);
-    twoPhaseReduction(bbi, s_ip, s_res, ips[3*g+2]);
-  }
-}
-
-// a.b, a.c, b.b
-@kernel void kcycleCombinedOp2Kernel(const dlong Nblocks,
-                const dlong   N,
-                @restrict const  dfloat * a,
-                @restrict const  dfloat * b,
-                @restrict const  dfloat * c,
-                @restrict const  dfloat * d,
-                      @restrict dfloat * ips){
-
-  for(dlong g=0;g<Nblocks;++g;@outer(0)){
-
-    @shared volatile dfloat s_ip[p_RDIMY][p_RDIMX];
-    @shared volatile dfloat s_res[p_RDIMY];
-
-    @exclusive dfloat abi, aci, adi;
-    
-    for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){
-      for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){
-        dlong i = tx + ty*p_RDIMX + g*p_RDIMX*p_RDIMY;
-
-        abi = 0.f;
-        aci = 0.f;
-        adi = 0.f;
-        
-        while(i<N){ // scan through whole array
-          const dfloat ai =  a[i];
-          const dfloat bi =  b[i];
-          const dfloat ci =  c[i];
-          const dfloat di =  d[i];
-          abi += ai*bi;
-          aci += ai*ci;
-          adi += ai*di;
-          i += Nblocks*p_RDIMX*p_RDIMY; 
-        }
-      }
-    }
-
-    twoPhaseReduction(abi, s_ip, s_res, ips[3*g+0]);
-    twoPhaseReduction(aci, s_ip, s_res, ips[3*g+1]);
-    twoPhaseReduction(adi, s_ip, s_res, ips[3*g+2]);
-  }
-}
-    
-// w.a.b, w.a.c, w.b.b
-@kernel void kcycleWeightedCombinedOp1Kernel(const dlong Nblocks,
-                const dlong   N,
-                @restrict const  dfloat * a,
-                @restrict const  dfloat * b,
-                @restrict const  dfloat * c,
-                @restrict const  dfloat * w,
-                      @restrict dfloat * ips){
-
-  for(dlong g=0;g<Nblocks;++g;@outer(0)){
-
-    @shared volatile dfloat s_ip[p_RDIMY][p_RDIMX];
-    @shared volatile dfloat s_res[p_RDIMY];
-
-    @exclusive dfloat abi, aci, bbi;
-    
-    for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){
-      for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){
-        dlong i = tx + ty*p_RDIMX + g*p_RDIMX*p_RDIMY;
-
-        abi = 0.f;
-        aci = 0.f;
-        bbi = 0.f;
-        
-        while(i<N){ // scan through whole array
-          const dfloat ai =  a[i];
-          const dfloat bi =  b[i];
-          const dfloat ci =  c[i];
-          const dfloat wi =  w[i];
-          abi += wi*ai*bi;
-          aci += wi*ai*ci;
-          bbi += wi*bi*bi;
-          i += Nblocks*p_RDIMX*p_RDIMY; 
-        }
-      }
-    }
-
-    twoPhaseReduction(abi, s_ip, s_res, ips[3*g+0]);
-    twoPhaseReduction(aci, s_ip, s_res, ips[3*g+1]);
-    twoPhaseReduction(bbi, s_ip, s_res, ips[3*g+2]);
-  }
-}
-
-// w.a.b, w.a.c, w.b.b
-@kernel void kcycleWeightedCombinedOp2Kernel(const dlong Nblocks,
-                const dlong   N,
-                @restrict const  dfloat * a,
-                @restrict const  dfloat * b,
-                @restrict const  dfloat * c,
-                @restrict const  dfloat * d,
-                @restrict const  dfloat * w,
-                      @restrict dfloat * ips){
-
-  for(dlong g=0;g<Nblocks;++g;@outer(0)){
-
-    @shared volatile dfloat s_ip[p_RDIMY][p_RDIMX];
-    @shared volatile dfloat s_res[p_RDIMY];
-
-    @exclusive dfloat abi, aci, adi;
-    
-    for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){
-      for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){
-        dlong i = tx + ty*p_RDIMX + g*p_RDIMX*p_RDIMY;
-
-        abi = 0.f;
-        aci = 0.f;
-        adi = 0.f;
-        
-        while(i<N){ // scan through whole array
-          const dfloat ai =  a[i];
-          const dfloat bi =  b[i];
-          const dfloat ci =  c[i];
-          const dfloat di =  d[i];
-          const dfloat wi =  w[i];
-          abi += wi*ai*bi;
-          aci += wi*ai*ci;
-          adi += wi*ai*di;
-          i += Nblocks*p_RDIMX*p_RDIMY; 
-        }
-      }
-    }
-
-    twoPhaseReduction(abi, s_ip, s_res, ips[3*g+0]);
-    twoPhaseReduction(aci, s_ip, s_res, ips[3*g+1]);
-    twoPhaseReduction(adi, s_ip, s_res, ips[3*g+2]);
-  }
-}
\ No newline at end of file
diff --git a/solvers/parALMOND/okl/sumVector.okl b/solvers/parALMOND/okl/sumVector.okl
deleted file mode 100644
index 0456c1321..000000000
--- a/solvers/parALMOND/okl/sumVector.okl
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-// alpha = sum x 
-@kernel void sumVectorKernel(const dlong Nblocks,
-          const dlong   N,
-          @restrict const  dfloat * x,
-                @restrict dfloat * ip){
-
-  for(dlong b=0;b<Nblocks;++b;@outer(0)){
-
-    @shared volatile dfloat s_ip[p_RDIMY][p_RDIMX];
-    @shared volatile dfloat s_res[p_RDIMY];
-    @exclusive dfloat res;
-    
-    for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){
-      for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){
-        dlong i = tx + ty*p_RDIMX + b*p_RDIMX*p_RDIMY;
-        
-        res = 0;
-        while(i<N){ // scan through whole array
-          res += x[i];
-          i += Nblocks*p_RDIMX*p_RDIMY; 
-        }
-      }
-    }
-    
-    twoPhaseReduction(res, s_ip, s_res, ip[b]);
-  }
-}
diff --git a/solvers/parALMOND/okl/twoPhaseReduction.h b/solvers/parALMOND/okl/twoPhaseReduction.h
deleted file mode 100644
index 98d3d1f4d..000000000
--- a/solvers/parALMOND/okl/twoPhaseReduction.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-
-// used a macro since I am not sure what happens with @exclusive variables in OpenMP mode
-#define twoPhaseReduction(r_ip, s_ip, s_res, g_ip)			\
-									\
-  @barrier("local");						\
-									\
-  for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){					\
-    for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){				\
-      s_ip[ty][tx] = r_ip;						\
-      if(tx>=  1*p_RDIMX/2) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/2];	\
-      if(tx>=  3*p_RDIMX/4) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/4];	\
-      if(tx>=  7*p_RDIMX/8) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/8];	\
-      if(tx>= 15*p_RDIMX/16) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/16];	\
-      if(tx>= 31*p_RDIMX/32) s_ip[ty][tx] += s_ip[ty][tx-p_RDIMX/32];	\
-      if(tx==(p_RDIMX-1)) s_res[ty] = s_ip[ty][tx];			\
-    }									\
-  }									\
-									\
-  @barrier("local");						\
-									\
-  for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){					\
-    for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){				\
-      if(ty==0 && tx<p_RDIMY){						\
-      	if(tx >= 1*p_RDIMY/2) s_res[tx] += s_res[tx-p_RDIMY/2];		\
-      	if(tx >= 3*p_RDIMY/4) s_res[tx] += s_res[tx-p_RDIMY/4];		\
-      	if(tx >= 7*p_RDIMY/8) s_res[tx] += s_res[tx-p_RDIMY/8];		\
-      	if(tx==(p_RDIMY-1)) {						\
-          g_ip = s_res[p_RDIMY-1];				\
-        }								\
-      }									\
-    }									\
-  }
diff --git a/solvers/parALMOND/okl/vectorAddInnerProduct.okl b/solvers/parALMOND/okl/vectorAddInnerProduct.okl
deleted file mode 100644
index a6d7cb2b0..000000000
--- a/solvers/parALMOND/okl/vectorAddInnerProduct.okl
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-  
-// y = beta*y + alpha*x
-// ip = y.y 
-@kernel void vectorAddInnerProductKernel(const dlong Nblocks,
-					const dlong   N,
-					const dfloat   alpha,
-					const dfloat   beta,
-					@restrict const  dfloat * x,
-				        @restrict dfloat * y,
-					      @restrict dfloat * ip){
-
-  for(dlong b=0;b<Nblocks;++b;@outer(0)){
-
-    @shared volatile dfloat s_ip[p_RDIMY][p_RDIMX];
-    @shared volatile dfloat s_res[p_RDIMY];
-    @exclusive dfloat res;
-    
-    for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){
-      for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){
-      	dlong i = tx + ty*p_RDIMX + b*p_RDIMX*p_RDIMY;
-      	
-      	res = 0;
-      	while(i<N){ // scan through whole array
-      	  dfloat yi =  beta*y[i] + alpha*x[i];
-      	  res += (yi*yi);
-      	  y[i] = yi;
-      	  i += Nblocks*p_RDIMX*p_RDIMY; 
-      	}
-      }
-    }
-    
-    twoPhaseReduction(res, s_ip, s_res, ip[b]);
-  }
-}
-
-// y = beta*y + alpha*x
-// ip = w.y.y 
-@kernel void vectorAddWeightedInnerProductKernel(const dlong Nblocks,
-          const dlong   N,
-          const dfloat   alpha,
-          const dfloat   beta,
-          @restrict const  dfloat * x,
-                @restrict dfloat * y,
-          @restrict const  dfloat * w,
-                @restrict dfloat * ip){
-
-  for(dlong b=0;b<Nblocks;++b;@outer(0)){
-
-    @shared volatile dfloat s_ip[p_RDIMY][p_RDIMX];
-    @shared volatile dfloat s_res[p_RDIMY];
-    @exclusive dfloat res;
-    
-    for(int ty=0;ty<p_RDIMY;++ty;@inner(1)){
-      for(int tx=0;tx<p_RDIMX;++tx;@inner(0)){
-        dlong i = tx + ty*p_RDIMX + b*p_RDIMX*p_RDIMY;
-        
-        res = 0;
-        while(i<N){ // scan through whole array
-          dfloat yi =  beta*y[i] + alpha*x[i];
-          res += (w[i]*yi*yi);
-          y[i] = yi;
-          i += Nblocks*p_RDIMX*p_RDIMY; 
-        }
-      }
-    }
-    
-    twoPhaseReduction(res, s_ip, s_res, ip[b]);
-  }
-}
-    
diff --git a/solvers/parALMOND/src/agmg.c b/solvers/parALMOND/src/agmg.c
deleted file mode 100644
index b012ed904..000000000
--- a/solvers/parALMOND/src/agmg.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "agmg.h"
-
-namespace agmg{
-
-  int rank;
-  int size;
-  MPI_Comm comm;
-
-}
-
-void kcycle(parAlmond_t *parAlmond, int k){
-
-  agmgLevel **levels = parAlmond->levels;
-
-  dlong m = levels[k]->Nrows;
-  // dlong n = levels[k]->Ncols;
-
-  //check for base level
-  if(k==parAlmond->numLevels-1) {
-    if (parAlmond->invCoarseA != NULL) {
-      //use exact sovler
-      exactCoarseSolve(parAlmond, m, levels[k]->rhs, levels[k]->x);
-    } else {
-      levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, true);
-    }
-    return;
-  }
-
-  char name[BUFSIZ];
-  sprintf(name, "host kcycle level %d", k);
-  occaTimerTic(parAlmond->device,name);
-
-  dlong mCoarse = levels[k+1]->Nrows;
-  // dlong nCoarse = levels[k+1]->Ncols;
-
-  // zero out x
-  //setVector(m, levels[k]->x, 0.0);
-
-  levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, true);
-
-  // res = r - A*x
-  levels[k]->Ax(levels[k]->AxArgs,levels[k]->x,levels[k]->res);
-  vectorAdd(m, 1.0, levels[k]->rhs, -1.0, levels[k]->res);
-
-  // coarsen the residual to next level, checking if the residual needs to be gathered after
-  if (levels[k+1]->gatherLevel==true) {
-    levels[k+1]->coarsen(levels[k+1]->coarsenArgs, levels[k]->res, levels[k+1]->Srhs);
-    levels[k+1]->gather (levels[k+1]->gatherArgs,  levels[k+1]->Srhs, levels[k+1]->rhs);
-  } else {
-    levels[k+1]->coarsen(levels[k+1]->coarsenArgs, levels[k]->res, levels[k+1]->rhs);
-  }
-
-  if(k>2) {
-    vcycle(parAlmond,k+1);
-    //kcycle(parAlmond, k+1);
-  } else{
-    dfloat *ckp1 = levels[k+1]->ckp1;
-    dfloat *vkp1 = levels[k+1]->vkp1;
-    dfloat *wkp1 = levels[k+1]->wkp1;
-    dfloat *dkp1 = levels[k+1]->x;
-    dfloat *rkp1 = levels[k+1]->rhs;
-    dfloat *w = levels[k+1]->weight;
-    bool weighted = levels[k+1]->weightedInnerProds;
-
-    // first inner krylov iteration
-    kcycle(parAlmond, k+1);
-
-    //ckp1 = x
-    memcpy(ckp1,levels[k+1]->x,mCoarse*sizeof(dfloat));
-
-    // v = A*c
-    levels[k+1]->Ax(levels[k+1]->AxArgs,ckp1,vkp1);
-
-    dfloat rhoLocal[3], rhoGlobal[3];
-    dfloat rho1, alpha1, norm_rkp1;
-    dfloat norm_rktilde_p, norm_rktilde_pGlobal;
-
-    if(parAlmond->ktype == PCG)
-      kcycleCombinedOp1(mCoarse, rhoLocal, ckp1, rkp1, vkp1, w, weighted);
-
-    if(parAlmond->ktype == GMRES)
-      kcycleCombinedOp1(mCoarse, rhoLocal, vkp1, rkp1, vkp1, w, weighted);
-
-    MPI_Allreduce(rhoLocal,rhoGlobal,3,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-    alpha1 = rhoGlobal[0];
-    rho1   = rhoGlobal[1];
-    norm_rkp1 = sqrt(rhoGlobal[2]);
-
-    // rkp1 = rkp1 - (alpha1/rho1)*vkp1
-    norm_rktilde_p = vectorAddInnerProd(mCoarse, -alpha1/rho1, vkp1, 1.0, rkp1, w, weighted);
-    MPI_Allreduce(&norm_rktilde_p,&norm_rktilde_pGlobal,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-    norm_rktilde_pGlobal = sqrt(norm_rktilde_pGlobal);
-
-    dfloat t = 0.2;
-
-    if(norm_rktilde_pGlobal < t*norm_rkp1){
-      // x = (alpha1/rho1)*x
-      scaleVector(mCoarse, levels[k+1]->x, alpha1/rho1);
-    } else{
-
-      kcycle(parAlmond, k+1);
-
-      // w = A*d
-      levels[k+1]->Ax(levels[k+1]->AxArgs,dkp1,wkp1);
-
-      dfloat gamma, beta, alpha2;
-
-      if(parAlmond->ktype == PCG)
-        kcycleCombinedOp2(mCoarse,rhoLocal,dkp1,vkp1,wkp1,rkp1, w, weighted);
-
-      if(parAlmond->ktype == GMRES)
-        kcycleCombinedOp2(mCoarse,rhoLocal,wkp1,vkp1,wkp1,rkp1, w, weighted);
-
-      MPI_Allreduce(rhoLocal,rhoGlobal,3,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-      gamma  = rhoGlobal[0];
-      beta   = rhoGlobal[1];
-      alpha2 = rhoGlobal[2];
-
-      if(fabs(rho1) > (dfloat) 1e-20){
-
-        dfloat rho2 = beta - gamma*gamma/rho1;
-
-        if(fabs(rho2) > (dfloat) 1e-20){
-          // levels[k+1]->x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ckp1 + (alpha2/rho2)*dkp1
-          dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2);
-          dfloat b = alpha2/rho2;
-
-          vectorAdd(mCoarse, a, ckp1, b, levels[k+1]->x);
-        }
-      }
-    }
-  }
-
-  if (levels[k+1]->gatherLevel==true) {
-    levels[k+1]->scatter(levels[k+1]->scatterArgs,  levels[k+1]->x, levels[k+1]->Sx);
-    levels[k+1]->prolongate(levels[k+1]->prolongateArgs, levels[k+1]->Sx, levels[k]->x);
-  } else {
-    levels[k+1]->prolongate(levels[k+1]->prolongateArgs, levels[k+1]->x, levels[k]->x);
-  }
-
-  levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, false);
-
-  occaTimerToc(parAlmond->device,name);
-}
-
-
-void device_kcycle(parAlmond_t *parAlmond, int k){
-
-  agmgLevel **levels = parAlmond->levels;
-
-  dlong m = levels[k]->Nrows;
-  // dlong n = levels[k]->Ncols;
-
-  if(m < GPU_CPU_SWITCH_SIZE){
-    levels[k]->o_rhs.copyTo(levels[k]->rhs, m*sizeof(dfloat));
-    kcycle(parAlmond, k);
-    levels[k]->o_x.copyFrom(levels[k]->x, m*sizeof(dfloat));
-    return;
-  }
-
-  //check for base level
-  if(k==parAlmond->numLevels-1) {
-    if (parAlmond->invCoarseA != NULL) {
-      //use exact sovler
-      device_exactCoarseSolve(parAlmond, m, levels[k]->o_rhs, levels[k]->o_x);
-    } else {
-      levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, true);
-    }
-    return;
-  }
-
-  dlong mCoarse = levels[k+1]->Nrows;
-  // dlong nCoarse = levels[k+1]->Ncols;
-
-  char name[BUFSIZ];
-  sprintf(name, "device kcycle level %d", k);
-  occaTimerTic(parAlmond->device,name);
-
-  // zero out x
-  //setVector(parAlmond, m, levels[k]->o_x, 0.0);
-
-  levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, true);
-
-  // res = rhs - A*x
-  levels[k]->device_Ax(levels[k]->AxArgs,levels[k]->o_x,levels[k]->o_res);
-  vectorAdd(parAlmond, m, 1.0, levels[k]->o_rhs, -1.0, levels[k]->o_res);
-
-  // coarsen the residual to next level, checking if the residual needs to be gathered after
-  if (levels[k+1]->gatherLevel==true) {
-    levels[k+1]->device_coarsen(levels[k+1]->coarsenArgs, levels[k]->o_res, levels[k+1]->o_Srhs);
-    levels[k+1]->device_gather (levels[k+1]->gatherArgs,  levels[k+1]->o_Srhs, levels[k+1]->o_rhs);
-  } else {
-    levels[k+1]->device_coarsen(levels[k+1]->coarsenArgs, levels[k]->o_res, levels[k+1]->o_rhs);
-  }
-
-  if(k>2) {
-    device_vcycle(parAlmond,k+1);
-    //device_kcycle(parAlmond, k+1);
-  } else{
-    // first inner krylov iteration
-    device_kcycle(parAlmond,k+1);
-
-    //ckp1 = levels[k+1]->x;
-    if (mCoarse)
-      levels[k+1]->o_ckp1.copyFrom(levels[k+1]->o_x);
-
-    // v = A*c
-    levels[k+1]->device_Ax(levels[k+1]->AxArgs,levels[k+1]->o_ckp1,levels[k+1]->o_vkp1);
-
-    dfloat rhoLocal[3], rhoGlobal[3];
-    dfloat rho1, alpha1, norm_rkp1;
-    dfloat norm_rktilde_pLocal, norm_rktilde_pGlobal;
-
-    // kcycleCombinedOp1(parAlmond,N,aDotbc,a,b,c,w,bool) 
-    //    returns aDotbc[0] = a.b, aDotbc[1] = a.c, aDotbc[2] = b.b
-    //       or aDotbc[0] = w.a.b, aDotbc[1] = w.a.c, aDotbc[2] = w.b.b
-    if(parAlmond->ktype == PCG)
-      kcycleCombinedOp1(parAlmond, mCoarse, rhoLocal,
-                        levels[k+1]->o_ckp1,
-                        levels[k+1]->o_rhs,
-                        levels[k+1]->o_vkp1,
-                        levels[k+1]->o_weight,
-                        levels[k+1]->weightedInnerProds);
-
-    if(parAlmond->ktype == GMRES)
-      kcycleCombinedOp1(parAlmond, mCoarse, rhoLocal,
-                        levels[k+1]->o_vkp1,
-                        levels[k+1]->o_rhs,
-                        levels[k+1]->o_vkp1,
-                        levels[k+1]->o_weight,
-                        levels[k+1]->weightedInnerProds);
-
-    MPI_Allreduce(rhoLocal,rhoGlobal,3,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-    alpha1 = rhoGlobal[0];
-    rho1   = rhoGlobal[1];
-    norm_rkp1 = sqrt(rhoGlobal[2]);
-
-    // rkp1 = rkp1 - (alpha1/rho1)*vkp1
-    norm_rktilde_pLocal = vectorAddInnerProd(parAlmond, mCoarse, -alpha1/rho1,
-                                              levels[k+1]->o_vkp1, 1.0,
-                                              levels[k+1]->o_rhs,
-                                              levels[k+1]->o_weight,
-                                              levels[k+1]->weightedInnerProds);
-    MPI_Allreduce(&norm_rktilde_pLocal,&norm_rktilde_pGlobal,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-    norm_rktilde_pGlobal = sqrt(norm_rktilde_pGlobal);
-
-    dfloat t = 0.2;
-    if(norm_rktilde_pGlobal < t*norm_rkp1){
-      //      levels[k+1]->x = (alpha1/rho1)*x
-      scaleVector(parAlmond,mCoarse, levels[k+1]->o_x, alpha1/rho1);
-    } else{
-    
-      device_kcycle(parAlmond,k+1);
-
-      // w = A*x
-      levels[k+1]->device_Ax(levels[k+1]->AxArgs,levels[k+1]->o_x,levels[k+1]->o_wkp1);
-
-      dfloat gamma, beta, alpha2;
-
-      // kcycleCombinedOp2(parAlmond,N,aDotbc,a,b,c,d,w,bool) 
-      //   returns aDotbcd[0] = a.b, aDotbcd[1] = a.c, aDotbcd[2] = a.d,
-      //      or aDotbcd[0] = w.a.b, aDotbcd[1] = w.a.c, aDotbcd[2] = w.a.d,
-      if(parAlmond->ktype == PCG)
-        kcycleCombinedOp2(parAlmond,mCoarse,rhoLocal,
-                          levels[k+1]->o_x,
-                          levels[k+1]->o_vkp1,
-                          levels[k+1]->o_wkp1,
-                          levels[k+1]->o_rhs,
-                          levels[k+1]->o_weight,
-                          levels[k+1]->weightedInnerProds);
-
-      if(parAlmond->ktype == GMRES)
-        kcycleCombinedOp2(parAlmond,mCoarse,rhoLocal,
-                          levels[k+1]->o_wkp1,
-                          levels[k+1]->o_vkp1,
-                          levels[k+1]->o_wkp1,
-                          levels[k+1]->o_rhs,
-                          levels[k+1]->o_weight,
-                          levels[k+1]->weightedInnerProds);
-
-      MPI_Allreduce(rhoLocal,rhoGlobal,3,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-      gamma  = rhoGlobal[0];
-      beta   = rhoGlobal[1];
-      alpha2 = rhoGlobal[2];
-
-      if(fabs(rho1) > (dfloat) 1e-20){
-
-        dfloat rho2 = beta - gamma*gamma/rho1;
-
-        if(fabs(rho2) > (dfloat) 1e-20){
-          // levels[k+1]->x = (alpha1/rho1 - (gam*alpha2)/(rho1*rho2))*ckp1 + (alpha2/rho2)*dkp1
-          dfloat a = alpha1/rho1 - gamma*alpha2/(rho1*rho2);
-          dfloat b = alpha2/rho2;
-
-          vectorAdd(parAlmond, mCoarse, a, levels[k+1]->o_ckp1,
-                                        b, levels[k+1]->o_x);
-        }
-      }
-    }
-  }
-
-  if (levels[k+1]->gatherLevel==true) {
-    levels[k+1]->device_scatter   (levels[k+1]->scatterArgs,  levels[k+1]->o_x, levels[k+1]->o_Sx);
-    levels[k+1]->device_prolongate(levels[k+1]->prolongateArgs, levels[k+1]->o_Sx, levels[k]->o_x);
-  } else {
-    levels[k+1]->device_prolongate(levels[k+1]->prolongateArgs, levels[k+1]->o_x, levels[k]->o_x);
-  }
-
-  levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, false);
-
-  occaTimerToc(parAlmond->device,name);
-}
-
-
-
-void vcycle(parAlmond_t *parAlmond, int k) {
-
-  agmgLevel **levels = parAlmond->levels;
-
-  const dlong m = levels[k]->Nrows;
-
-  //check for base level
-  if(k==parAlmond->numLevels-1) {
-    if (parAlmond->invCoarseA != NULL) {
-      //use exact sovler
-      exactCoarseSolve(parAlmond, m, levels[k]->rhs, levels[k]->x);
-    } else {
-      levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, true);
-    }
-    return;
-  }
-
-  char name[BUFSIZ];
-  sprintf(name, "host vcycle level %d", k);
-  occaTimerTic(parAlmond->device,name);
-
-  // const int mCoarse = levels[k+1]->Nrows;
-
-  // zero out x
-  //setVector(m, levels[k]->x,  0.0);
-
-  levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x, true);
-
-  // res = rhs - A*x
-  levels[k]->Ax(levels[k]->AxArgs,levels[k]->x,levels[k]->res);
-  vectorAdd(m, 1.0, levels[k]->rhs, -1.0, levels[k]->res);
-
-  // coarsen the residual to next level, checking if the residual needs to be gathered after
-  if (levels[k+1]->gatherLevel==true) {
-    levels[k+1]->coarsen(levels[k+1]->coarsenArgs, levels[k]->res, levels[k+1]->Srhs);
-    levels[k+1]->gather (levels[k+1]->gatherArgs,  levels[k+1]->Srhs, levels[k+1]->rhs);
-  } else {
-    levels[k+1]->coarsen(levels[k+1]->coarsenArgs, levels[k]->res, levels[k+1]->rhs);
-  }
-
-  vcycle(parAlmond,k+1);
-
-  if (levels[k+1]->gatherLevel==true) {
-    levels[k+1]->scatter(levels[k+1]->scatterArgs,  levels[k+1]->x, levels[k+1]->Sx);
-    levels[k+1]->prolongate(levels[k+1]->prolongateArgs, levels[k+1]->Sx, levels[k]->x);
-  } else {
-    levels[k+1]->prolongate(levels[k+1]->prolongateArgs, levels[k+1]->x, levels[k]->x);
-  }
-
-  levels[k]->smooth(levels[k]->smoothArgs, levels[k]->rhs, levels[k]->x,false);
-
-  occaTimerToc(parAlmond->device,name);
-}
-
-
-void device_vcycle(parAlmond_t *parAlmond, int k){
-
-  agmgLevel **levels = parAlmond->levels;
-
-  const dlong m = levels[k]->Nrows;
-  // const dlong mCoarse = levels[k+1]->Nrows;
-
-  // switch to cpu if the problem size is too small for gpu
-  if(m < GPU_CPU_SWITCH_SIZE){
-    levels[k]->o_rhs.copyTo(levels[k]->rhs, m*sizeof(dfloat));
-    vcycle(parAlmond, k);
-    levels[k]->o_x.copyFrom(levels[k]->x, m*sizeof(dfloat));
-    return;
-  }
-
-  //check for base level
-  if (k==parAlmond->numLevels-1) {
-    if (parAlmond->invCoarseA != NULL) {
-      //use exact sovler
-      device_exactCoarseSolve(parAlmond, m, levels[k]->o_rhs, levels[k]->o_x);
-    } else {
-      levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, true);
-    }
-    return;
-  }
-
-  char name[BUFSIZ];
-  sprintf(name, "device vcycle level %d", k);
-  occaTimerTic(parAlmond->device,name);
-
-  // zero out x
-  //setVector(parAlmond, m, levels[k]->o_x, 0.0);
-
-  levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x, true);
-
-  // res = rhs - A*x
-  levels[k]->device_Ax(levels[k]->AxArgs,levels[k]->o_x,levels[k]->o_res);
-  vectorAdd(parAlmond, m, 1.0, levels[k]->o_rhs, -1.0, levels[k]->o_res);
-
-  // coarsen the residual to next level, checking if the residual needs to be gathered after
-  if (levels[k+1]->gatherLevel==true) {
-    levels[k+1]->device_coarsen(levels[k+1]->coarsenArgs, levels[k]->o_res, levels[k+1]->o_Srhs);
-    levels[k+1]->device_gather (levels[k+1]->gatherArgs,  levels[k+1]->o_Srhs, levels[k+1]->o_rhs);
-  } else {
-    levels[k+1]->device_coarsen(levels[k+1]->coarsenArgs, levels[k]->o_res, levels[k+1]->o_rhs);
-  }
-
-  device_vcycle(parAlmond, k+1);
-
-  if (levels[k+1]->gatherLevel==true) {
-    levels[k+1]->device_scatter   (levels[k+1]->scatterArgs,  levels[k+1]->o_x, levels[k+1]->o_Sx);
-    levels[k+1]->device_prolongate(levels[k+1]->prolongateArgs, levels[k+1]->o_Sx, levels[k]->o_x);
-  } else {
-    levels[k+1]->device_prolongate(levels[k+1]->prolongateArgs, levels[k+1]->o_x, levels[k]->o_x);
-  }
-
-  levels[k]->device_smooth(levels[k]->smoothArgs, levels[k]->o_rhs, levels[k]->o_x,false);
-
-  occaTimerToc(parAlmond->device,name);
-}
diff --git a/solvers/parALMOND/src/agmgLevel.c b/solvers/parALMOND/src/agmgLevel.c
deleted file mode 100644
index 67bd7a66b..000000000
--- a/solvers/parALMOND/src/agmgLevel.c
+++ /dev/null
@@ -1,640 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "agmg.h"
-
-// parAlmond's function call-backs
-void agmgAx(void **args, dfloat *x, dfloat *Ax){
-  parAlmond_t *parAlmond = (parAlmond_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  axpy(level->A, 1.0, x, 0.0, Ax,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-}
-
-void agmgCoarsen(void **args, dfloat *r, dfloat *Rr){
-  // parAlmond_t *parAlmond = (parAlmond_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  axpy(level->R, 1.0, r, 0.0, Rr,false,0.);
-}
-
-void agmgProlongate(void **args, dfloat *x, dfloat *Px){
-  // parAlmond_t *parAlmond = (parAlmond_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  axpy(level->P, 1.0, x, 1.0, Px,false,0.);
-}
-
-void agmgSmooth(void **args, dfloat *rhs, dfloat *x, bool x_is_zero){
-  parAlmond_t *parAlmond = (parAlmond_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  if(level->stype == JACOBI){
-    smoothJacobi(parAlmond, level, level->A, rhs, x, x_is_zero);
-  } else if(level->stype == DAMPED_JACOBI){
-    smoothDampedJacobi(parAlmond, level, level->A, rhs, x, x_is_zero);
-  } else if(level->stype == CHEBYSHEV){
-    smoothChebyshev(parAlmond, level, level->A, rhs, x, x_is_zero);
-  }
-}
-
-void device_agmgAx(void **args, occa::memory &o_x, occa::memory &o_Ax){
-  parAlmond_t *parAlmond = (parAlmond_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  axpy(parAlmond,level->deviceA, 1.0, o_x, 0.0, o_Ax,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-}
-
-void device_agmgCoarsen(void **args, occa::memory &o_r, occa::memory &o_Rr){
-  parAlmond_t *parAlmond = (parAlmond_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  axpy(parAlmond, level->deviceR, 1.0, o_r, 0.0, o_Rr,false,0.);
-}
-
-void device_agmgProlongate(void **args, occa::memory &o_x, occa::memory &o_Px){
-  parAlmond_t *parAlmond = (parAlmond_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  axpy(parAlmond, level->dcsrP, 1.0, o_x, 1.0, o_Px);
-}
-
-void device_agmgSmooth(void **args, occa::memory &o_rhs, occa::memory &o_x, bool x_is_zero){
-  parAlmond_t *parAlmond = (parAlmond_t *) args[0];
-  agmgLevel *level = (agmgLevel *) args[1];
-
-  if(level->stype == JACOBI){
-    smoothJacobi(parAlmond, level, level->deviceA, o_rhs, o_x, x_is_zero);
-  } else if(level->stype == DAMPED_JACOBI){
-    smoothDampedJacobi(parAlmond, level, level->deviceA, o_rhs, o_x, x_is_zero);
-  } else if(level->stype == CHEBYSHEV){
-    smoothChebyshev(parAlmond, level, level->deviceA, o_rhs, o_x, x_is_zero);
-  }
-}
-
-dfloat rhoDinvA(parAlmond_t *parAlmond, csr *A, dfloat *invD);
-
-void setupSmoother(parAlmond_t *parAlmond, agmgLevel *level, SmoothType s){
-
-  level->stype = s;
-
-  if((s == DAMPED_JACOBI)||(s == CHEBYSHEV)){
-    // estimate rho(invD * A)
-    dfloat rho=0;
-
-    if(level->A->Nrows)
-      level->A->diagInv = (dfloat *) calloc(level->A->Nrows, sizeof(dfloat));
-
-    for (dlong i=0;i<level->A->Nrows;i++) {
-      dfloat diag = level->A->diagCoefs[level->A->diagRowStarts[i]];
-      if (parAlmond->nullSpace) {
-        diag += parAlmond->nullSpacePenalty*level->A->null[i]*level->A->null[i];
-      }
-      level->A->diagInv[i] = 1.0/diag;
-    }
-
-    rho = rhoDinvA(parAlmond, level->A, level->A->diagInv);
-
-    if (s == DAMPED_JACOBI) {
-
-      level->smoother_params = (dfloat *) calloc(1,sizeof(dfloat));
-
-      level->smoother_params[0] = (4./3.)/rho;
-
-      //temp storage for smoothing
-      if (level->Ncols) level->smootherResidual = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
-      if (level->Ncols) level->o_smootherResidual = parAlmond->device.malloc(level->Ncols*sizeof(dfloat),level->smootherResidual);
-
-    } else if (s == CHEBYSHEV) {
-
-      level->smoother_params = (dfloat *) calloc(2,sizeof(dfloat));
-
-      level->smoother_params[0] = rho;
-      level->smoother_params[1] = rho/10.;
-
-      //temp storage for smoothing
-      if (level->Ncols) level->smootherResidual = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
-      if (level->Ncols) level->smootherResidual2 = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
-      if (level->Ncols) level->smootherUpdate   = (dfloat *) calloc(level->Ncols,sizeof(dfloat));
-      if (level->Ncols) level->o_smootherResidual  = parAlmond->device.malloc(level->Ncols*sizeof(dfloat),level->smootherResidual);
-      if (level->Ncols) level->o_smootherResidual2 = parAlmond->device.malloc(level->Ncols*sizeof(dfloat),level->smootherResidual);
-      if (level->Ncols) level->o_smootherUpdate    = parAlmond->device.malloc(level->Ncols*sizeof(dfloat),level->smootherUpdate);
-    }
-  }
-}
-
-extern "C"{
-  void dgeev_(char *JOBVL, char *JOBVR, int *N, double *A, int *LDA, double *WR, double *WI,
-  double *VL, int *LDVL, double *VR, int *LDVR, double *WORK, int *LWORK, int *INFO );
-}
-
-
-static void eig(const int Nrows, double *A, double *WR,
-    double *WI){
-
-  if(Nrows){
-  int NB  = 256;
-  char JOBVL  = 'V';
-  char JOBVR  = 'V';
-  int     N = Nrows;
-  int   LDA = Nrows;
-  int  LWORK  = (NB+2)*N;
-
-  double *WORK  = new double[LWORK];
-  double *VL  = new double[Nrows*Nrows];
-  double *VR  = new double[Nrows*Nrows];
-
-  int INFO = -999;
-
-  dgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI,
-    VL, &LDA, VR, &LDA, WORK, &LWORK, &INFO);
-
-
-  assert(INFO == 0);
-
-  delete [] VL;
-  delete [] VR;
-  delete [] WORK;
-  }
-}
-
-dfloat rhoDinvA(parAlmond_t* parAlmond,csr *A, dfloat *invD){
-
-  const dlong N = A->Nrows;
-  const dlong M = A->Ncols;
-
-  int k = 10;
-
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  hlong Nlocal = (hlong) N;
-  hlong Ntotal = 0;
-  MPI_Allreduce(&Nlocal, &Ntotal, 1, MPI_HLONG, MPI_SUM, agmg::comm);
-  if(k > Ntotal)
-    k = (int) Ntotal;
-
-  // do an arnoldi
-
-  // allocate memory for Hessenberg matrix
-  double *H = (double *) calloc(k*k,sizeof(double));
-
-  // allocate memory for basis
-  dfloat **V = (dfloat **) calloc(k+1, sizeof(dfloat *));
-  dfloat *Vx = (dfloat *) calloc(M, sizeof(dfloat));
-
-  for(int i=0; i<=k; i++)
-    V[i] = (dfloat *) calloc(N, sizeof(dfloat));
-
-  // generate a random vector for initial basis vector
-  for (dlong i=0;i<N;i++)
-    Vx[i] = (dfloat) drand48();
-
-  dfloat norm_vo = 0.;
-  for (dlong i=0;i<N;i++)
-    norm_vo += Vx[i]*Vx[i];
-
-  dfloat gNorm_vo = 0;
-  MPI_Allreduce(&norm_vo, &gNorm_vo, 1, MPI_DFLOAT, MPI_SUM, agmg::comm);
-  gNorm_vo = sqrt(gNorm_vo);
-
-  for (dlong i=0;i<N;i++)
-    Vx[i] /= gNorm_vo;
-
-  for (dlong i=0;i<N;i++)
-    V[0][i] = Vx[i];
-
-  for(int j=0; j<k; j++){
-
-    for (dlong i=0;i<N;i++)
-      Vx[i] = V[j][i];
-
-    // v[j+1] = invD*(A*v[j])
-    axpy(A, 1.0, Vx, 0., V[j+1],parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-
-    dotStar(N, invD, V[j+1]);
-
-    // modified Gram-Schmidth
-    for(int i=0; i<=j; i++){
-      // H(i,j) = v[i]'*A*v[j]
-      dfloat hij = innerProd(N, V[i], V[j+1]);
-      dfloat ghij = 0;
-      MPI_Allreduce(&hij, &ghij, 1, MPI_DFLOAT, MPI_SUM, agmg::comm);
-
-      // v[j+1] = v[j+1] - hij*v[i]
-      vectorAdd(N,-ghij, V[i], 1.0, V[j+1]);
-
-      H[i + j*k] = (double) ghij;
-    }
-
-    if(j+1 < k){
-
-      dfloat norm_vj = 0.;
-      for (dlong i=0;i<N;i++)
-        norm_vj += V[j+1][i]*V[j+1][i];
-
-      dfloat gNorm_vj;
-      MPI_Allreduce(&norm_vj, &gNorm_vj, 1, MPI_DFLOAT, MPI_SUM, agmg::comm);
-      gNorm_vj = sqrt(gNorm_vj);
-
-      H[j+1+ j*k] = (double) gNorm_vj;
-
-      scaleVector(N,V[j+1], 1./H[j+1 + j*k]);
-    }
-  }
-
-  double *WR = (double *) calloc(k,sizeof(double));
-  double *WI = (double *) calloc(k,sizeof(double));
-
-  eig(k, H, WR, WI);
-
-  double rho = 0.;
-
-  for(int i=0; i<k; i++){
-    double rho_i  = sqrt(WR[i]*WR[i] + WI[i]*WI[i]);
-
-    if(rho < rho_i) {
-      rho = rho_i;
-    }
-  }
-
-  free(H);
-  free(WR);
-  free(WI);
-
-  // free memory
-  for(int i=0; i<=k; i++){
-    free(V[i]);
-  }
-
-  if ((rank==0)&& (parAlmond->options.compareArgs("VERBOSE","TRUE"))) printf("weight = %g \n", rho);
-
-  return rho;
-}
-
-void matrixInverse(int N, dfloat *A);
-
-//set up exact solver using xxt
-void setupExactSolve(parAlmond_t *parAlmond, agmgLevel *level, bool nullSpace, dfloat nullSpacePenalty) {
-
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  //copy the global coarse partition as ints
-  int *coarseOffsets = (int* ) calloc(size+1,sizeof(int));
-  for (int r=0;r<size+1;r++) coarseOffsets[r] = (int) level->globalRowStarts[r];
-  
-  int  coarseTotal   = coarseOffsets[size];
-  int  coarseOffset  = coarseOffsets[rank];
-
-  csr *A = level->A;
-  int N = (int) level->Nrows;
-
-  int localNNZ;
-  int *rows;
-  int *cols;
-  dfloat *vals;
-
-  if((rank==0)&&(parAlmond->options.compareArgs("VERBOSE","TRUE"))) printf("Setting up coarse solver...");fflush(stdout);
-
-  if(!nullSpace) {
-    //if no nullspace, use sparse A
-    localNNZ = (int) (A->diagNNZ+A->offdNNZ);
-    
-    if (localNNZ) {
-      rows = (int *) calloc(localNNZ,sizeof(int));
-      cols = (int *) calloc(localNNZ,sizeof(int));
-      vals = (dfloat *) calloc(localNNZ,sizeof(dfloat));
-    }
-
-    //populate matrix
-    int cnt = 0;
-    for (int n=0;n<N;n++) {
-      int start = (int) A->diagRowStarts[n];
-      int end   = (int) A->diagRowStarts[n+1];
-      for (int m=start;m<end;m++) {
-        rows[cnt] = n + coarseOffset;
-        cols[cnt] = (int) (A->diagCols[m] + coarseOffset);
-        vals[cnt] = A->diagCoefs[m];
-        cnt++;
-      }
-      start = (int) A->offdRowStarts[n];
-      end   = (int) A->offdRowStarts[n+1];
-      for (dlong m=A->offdRowStarts[n];m<A->offdRowStarts[n+1];m++) {
-        rows[cnt] = n + coarseOffset;
-        cols[cnt] = (int) A->colMap[A->offdCols[m]];
-        vals[cnt] = A->offdCoefs[m];
-        cnt++;
-      }
-    }
-  } else {
-    localNNZ = (int) (A->Nrows*coarseTotal); //A is dense due to nullspace augmentation
-
-    if (localNNZ) {
-      rows = (int *) calloc(localNNZ,sizeof(int));
-      cols = (int *) calloc(localNNZ,sizeof(int));
-      vals = (dfloat *) calloc(localNNZ,sizeof(dfloat));
-    }
-
-    //gather null vector
-    dfloat *nullTotal = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
-    int *nullCounts = (int*) calloc(size,sizeof(int));
-    for (int r=0;r<size;r++) 
-      nullCounts[r] = coarseOffsets[r+1]-coarseOffsets[r];
-    
-    MPI_Allgatherv(A->null, N, MPI_DFLOAT, nullTotal, nullCounts, coarseOffsets, MPI_DFLOAT, agmg::comm);
-
-    //populate matrix
-    for (int n=0;n<N;n++) {
-      for (int m=0;m<coarseTotal;m++) {    
-        rows[n*coarseTotal+m] = n + coarseOffset;
-        cols[n*coarseTotal+m] = m;
-        vals[n*coarseTotal+m] = nullSpacePenalty*nullTotal[n+coarseOffset]*nullTotal[m];
-      }
-    }
-
-    for (int n=0;n<N;n++) {
-      int start = (int) A->diagRowStarts[n];
-      int end   = (int) A->diagRowStarts[n+1];
-      for (int m=start;m<end;m++) {
-        int col = (int) (A->diagCols[m] + coarseOffset);
-        vals[n*coarseTotal+col] += A->diagCoefs[m];
-      }
-      start = (int) A->offdRowStarts[n];
-      end   = (int) A->offdRowStarts[n+1];
-      for (int m=start;m<end;m++) {
-        int col = (int) A->colMap[A->offdCols[m]];
-        vals[n*coarseTotal+col] += A->offdCoefs[m];
-      }
-    }
-  }
-
-  //ge the nonzero counts from all ranks
-  int *NNZ = (int*) calloc(size,sizeof(int));  
-  int *NNZoffsets = (int*) calloc(size+1,sizeof(int));  
-  MPI_Allgather(&localNNZ, 1, MPI_INT, NNZ, 1, MPI_INT, agmg::comm);
-
-  int totalNNZ = 0;
-  for (int r=0;r<size;r++) {
-    totalNNZ += NNZ[r];
-    NNZoffsets[r+1] = NNZoffsets[r] + NNZ[r];
-  }
-
-  int *Arows = (int *) calloc(totalNNZ,sizeof(int));
-  int *Acols = (int *) calloc(totalNNZ,sizeof(int));
-  dfloat *Avals = (dfloat *) calloc(totalNNZ,sizeof(dfloat));
-
-  MPI_Allgatherv(rows, localNNZ, MPI_INT, Arows, NNZ, NNZoffsets, MPI_INT, agmg::comm);
-  MPI_Allgatherv(cols, localNNZ, MPI_INT, Acols, NNZ, NNZoffsets, MPI_INT, agmg::comm);
-  MPI_Allgatherv(vals, localNNZ, MPI_DFLOAT, Avals, NNZ, NNZoffsets, MPI_DFLOAT, agmg::comm);
-
-  //assemble the full matrix
-  dfloat *coarseA = (dfloat *) calloc(coarseTotal*coarseTotal,sizeof(dfloat));
-  for (int i=0;i<totalNNZ;i++) {
-    int n = Arows[i];
-    int m = Acols[i];
-    coarseA[n*coarseTotal+m] = Avals[i];
-  }
-
-  matrixInverse(coarseTotal, coarseA);
-
-  //store only the local rows of the full inverse
-  parAlmond->invCoarseA = (dfloat *) calloc(A->Nrows*coarseTotal,sizeof(dfloat));
-  for (int n=0;n<N;n++) {
-    for (int m=0;m<coarseTotal;m++) {
-      parAlmond->invCoarseA[n*coarseTotal+m] = coarseA[(n+coarseOffset)*coarseTotal+m];
-    } 
-  }
-
-  parAlmond->coarseTotal = coarseTotal;
-  parAlmond->coarseOffset = coarseOffset;
-  parAlmond->coarseOffsets = coarseOffsets;
-  parAlmond->coarseCounts = (int*) calloc(size,sizeof(int));
-    for (int r=0;r<size;r++) 
-      parAlmond->coarseCounts[r] = coarseOffsets[r+1]-coarseOffsets[r];
-
-  parAlmond->xCoarse   = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
-  parAlmond->rhsCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
-
-  if (localNNZ) {
-    free(rows);
-    free(cols);
-    free(vals);
-  }
-
-  if (totalNNZ) {
-    free(Arows);
-    free(Acols);
-    free(Avals);
-  }
-
-  if(coarseTotal) {
-    free(coarseA);
-  }
-
-  if((rank==0)&&(parAlmond->options.compareArgs("VERBOSE","TRUE"))) printf("done.\n");
-}
-
-
-void exactCoarseSolve(parAlmond_t *parAlmond, int N, dfloat *rhs, dfloat *x) {
-
-  //gather the full vector
-  MPI_Allgatherv(rhs, N, MPI_DFLOAT, parAlmond->rhsCoarse, parAlmond->coarseCounts, parAlmond->coarseOffsets, MPI_DFLOAT, agmg::comm);
-
-  //multiply by local part of the exact matrix inverse
-  #pragma omp parallel for
-  for (int n=0;n<N;n++) {
-    x[n] = 0.;
-    for (int m=0;m<parAlmond->coarseTotal;m++) {
-      x[n] += parAlmond->invCoarseA[n*parAlmond->coarseTotal+m]*parAlmond->rhsCoarse[m];
-    }
-  }
-}
-
-void device_exactCoarseSolve(parAlmond_t *parAlmond, int N, occa::memory o_rhs, occa::memory o_x) {
-
-  dfloat *rhs = parAlmond->levels[parAlmond->numLevels-1]->rhs;
-  dfloat *x = parAlmond->levels[parAlmond->numLevels-1]->x;
-
-  //use coarse solver
-  o_rhs.copyTo(rhs);
-  //gather the full vector
-  MPI_Allgatherv(rhs, N, MPI_DFLOAT, parAlmond->rhsCoarse, parAlmond->coarseCounts, parAlmond->coarseOffsets, MPI_DFLOAT, agmg::comm);
-
-  //multiply by local part of the exact matrix inverse
-  #pragma omp parallel for
-  for (int n=0;n<N;n++) {
-    x[n] = 0.;
-    for (int m=0;m<parAlmond->coarseTotal;m++) {
-      x[n] += parAlmond->invCoarseA[n*parAlmond->coarseTotal+m]*parAlmond->rhsCoarse[m];
-    }
-  }
-
-  o_x.copyFrom(x);
-}
-
-#if 0
-//set up exact solver using xxt
-void setupExactSolve(parAlmond_t *parAlmond, agmgLevel *level, bool nullSpace, dfloat nullSpacePenalty) {
-
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  int* coarseOffsets = level->globalRowStarts;
-  int coarseTotal = coarseOffsets[size];
-  int coarseOffset = coarseOffsets[rank];
-
-  int *globalNumbering = (int *) calloc(coarseTotal,sizeof(int));
-  for (int n=0;n<coarseTotal;n++)
-    globalNumbering[n] = n;
-
-  csr *A = level->A;
-  int N = level->Nrows;
-
-  int totalNNZ;
-  int *rows;
-  int *cols;
-  dfloat *vals;
-
-  if(!nullSpace) {
-    //if no nullspace, use sparse A
-    totalNNZ = A->diagNNZ+A->offdNNZ;
-    if (totalNNZ) {
-      rows = (int *) calloc(totalNNZ,sizeof(int));
-      cols = (int *) calloc(totalNNZ,sizeof(int));
-      vals = (dfloat *) calloc(totalNNZ,sizeof(dfloat));
-    }
-
-    //populate matrix
-    int cnt = 0;
-    for (int n=0;n<N;n++) {
-      for (int m=A->diagRowStarts[n];m<A->diagRowStarts[n+1];m++) {
-        rows[cnt] = n + coarseOffset;
-        cols[cnt] = A->diagCols[m] + coarseOffset;
-        vals[cnt] = A->diagCoefs[m];
-        cnt++;
-      }
-      for (int m=A->offdRowStarts[n];m<A->offdRowStarts[n+1];m++) {
-        rows[cnt] = n + coarseOffset;
-        cols[cnt] = A->colMap[A->offdCols[m]];
-        vals[cnt] = A->offdCoefs[m];
-        cnt++;
-      }
-    }
-  } else {
-    totalNNZ = A->Nrows*coarseTotal; //A is dense due to nullspace augmentation
-    if (totalNNZ) {
-      rows = (int *) calloc(totalNNZ,sizeof(int));
-      cols = (int *) calloc(totalNNZ,sizeof(int));
-      vals = (dfloat *) calloc(totalNNZ,sizeof(dfloat));
-    }
-
-    //gather null vector
-    dfloat *nullTotal = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
-    int *nullCounts = (int*) calloc(size,sizeof(int));
-    for (int r=0;r<size;r++) 
-      nullCounts[r] = coarseOffsets[r+1]-coarseOffsets[r];
-    
-    MPI_Allgatherv(A->null, A->Nrows, MPI_DFLOAT, nullTotal, nullCounts, coarseOffsets, MPI_DFLOAT, agmg::comm);
-
-    //populate matrix
-    for (int n=0;n<N;n++) {
-      for (int m=0;m<coarseTotal;m++) {    
-        rows[n*coarseTotal+m] = n + coarseOffset;
-        cols[n*coarseTotal+m] = m;
-        vals[n*coarseTotal+m] = nullSpacePenalty*nullTotal[n+coarseOffset]*nullTotal[m];
-      }
-    }
-
-    for (int n=0;n<N;n++) {
-      for (int m=A->diagRowStarts[n];m<A->diagRowStarts[n+1];m++) {
-        int col = A->diagCols[m] + coarseOffset;
-        vals[n*coarseTotal+col] += A->diagCoefs[m];
-      }
-      for (int m=A->offdRowStarts[n];m<A->offdRowStarts[n+1];m++) {
-        int col = A->colMap[A->offdCols[m]];
-        vals[n*coarseTotal+col] += A->offdCoefs[m];
-      }
-    }
-  }
-
-  parAlmond->ExactSolve = xxtSetup(A->Nrows,
-                                globalNumbering,
-                                totalNNZ,
-                                rows,
-                                cols,
-                                vals,
-                                0,
-                                "int",
-                                dfloatString);
-
-  parAlmond->coarseTotal = coarseTotal;
-  parAlmond->coarseOffset = coarseOffset;
-
-  parAlmond->xCoarse   = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
-  parAlmond->rhsCoarse = (dfloat*) calloc(coarseTotal,sizeof(dfloat));
-
-  free(globalNumbering);
-  if (totalNNZ) {
-    free(rows);
-    free(cols);
-    free(vals);
-  }
-
-  printf("Done UberCoarse setup\n");
-}
-
-
-void exactCoarseSolve(parAlmond_t *parAlmond, int N, dfloat *rhs, dfloat *x) {
-
-  //use coarse solver
-  for (int n=0;n<parAlmond->coarseTotal;n++)
-    parAlmond->rhsCoarse[n] =0.;
-
-  for (int n=0;n<N;n++)
-    parAlmond->rhsCoarse[n+parAlmond->coarseOffset] = rhs[n];
-
-  xxtSolve(parAlmond->xCoarse, parAlmond->ExactSolve, parAlmond->rhsCoarse);
-
-  for (int n=0;n<N;n++)
-    x[n] = parAlmond->xCoarse[n+parAlmond->coarseOffset];
-
-}
-
-void device_exactCoarseSolve(parAlmond_t *parAlmond, int N, occa::memory o_rhs, occa::memory o_x) {
-
-  //use coarse solver
-  for (int n=0;n<parAlmond->coarseTotal;n++)
-    parAlmond->rhsCoarse[n] =0.;
-
-  o_rhs.copyTo(parAlmond->rhsCoarse+parAlmond->coarseOffset);
-  xxtSolve(parAlmond->xCoarse, parAlmond->ExactSolve, parAlmond->rhsCoarse);
-  o_x.copyFrom(parAlmond->xCoarse+parAlmond->coarseOffset,N*sizeof(dfloat));
-}
-#endif
diff --git a/solvers/parALMOND/src/agmgMatrices.c b/solvers/parALMOND/src/agmgMatrices.c
deleted file mode 100644
index 373fc52d3..000000000
--- a/solvers/parALMOND/src/agmgMatrices.c
+++ /dev/null
@@ -1,1155 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "agmg.h"
-
-csr * newCSRfromCOO(dlong N, hlong* globalRowStarts,
-                    dlong nnz, hlong *Ai, hlong *Aj, dfloat *Avals){
-
-  int size, rank;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  csr *A = (csr *) calloc(1,sizeof(csr));
-
-  A->Nrows = N;
-  A->Ncols = N;
-
-  A->NlocalCols = N;
-
-  hlong globalOffset = globalRowStarts[rank];
-
-  //first, count number of local, and non-local non-zeros
-  dlong diagNNZ=0;
-  dlong offdNNZ=0;
-  for (dlong n=0;n<nnz;n++) {
-    if ((Aj[n] < globalOffset) || (Aj[n]>globalOffset+N-1)) offdNNZ++;
-    else diagNNZ++;
-  }
-
-  dlong   *diagAi, *diagAj;
-  dlong   *offdAi;
-  hlong   *offdAj;
-  dfloat *diagAvals, *offdAvals;
-
-  if (diagNNZ) {
-    diagAi        = (dlong *)   calloc(diagNNZ, sizeof(dlong));
-    diagAj        = (dlong *)   calloc(diagNNZ, sizeof(dlong));
-    diagAvals     = (dfloat *) calloc(diagNNZ, sizeof(dfloat));
-  }
-  if (offdNNZ) {
-    offdAi        = (dlong *)   calloc(offdNNZ, sizeof(dlong));
-    offdAj        = (hlong *)   calloc(offdNNZ, sizeof(hlong));
-    offdAvals     = (dfloat *) calloc(offdNNZ, sizeof(dfloat));
-  }
-
-  //split into local and non-local COO matrices
-  diagNNZ =0;
-  offdNNZ =0;
-  for (dlong n=0;n<nnz;n++) {
-    if ((Aj[n] < globalOffset) || (Aj[n]>globalOffset+N-1)) {
-      offdAi[offdNNZ] = (dlong) Ai[n] - globalOffset; //local index
-      offdAj[offdNNZ] = Aj[n];                        //global index
-      offdAvals[offdNNZ] = Avals[n];
-      offdNNZ++;
-    } else {
-      diagAi[diagNNZ] = (dlong) Ai[n] - globalOffset; //local index
-      diagAj[diagNNZ] = (dlong) Aj[n] - globalOffset; //local index
-      diagAvals[diagNNZ] = Avals[n];
-      diagNNZ++;
-    }
-  }
-
-  A->diagNNZ   = diagNNZ;
-  A->offdNNZ   = offdNNZ;
-
-  if (N) {
-    A->diagRowStarts = (dlong *)   calloc(N+1,sizeof(dlong));
-    A->offdRowStarts = (dlong *)   calloc(N+1,sizeof(dlong));
-  }
-  if (diagNNZ) {
-    A->diagCols  = (dlong *)   calloc(diagNNZ, sizeof(dlong));
-    A->diagCoefs = (dfloat *) calloc(diagNNZ, sizeof(dfloat));
-  }
-  hlong* offdCols;
-  if (offdNNZ) {
-    offdCols     = (hlong *)   calloc(offdNNZ,sizeof(hlong));
-    A->offdCols  = (dlong *)   calloc(offdNNZ,sizeof(dlong));
-    A->offdCoefs = (dfloat *) calloc(offdNNZ, sizeof(dfloat));
-  }
-
-  // Convert to csr storage, assumes orginal matrix was presorted by rows
-  for(dlong n=0;n<diagNNZ;++n) {
-    dlong row = diagAi[n];
-    A->diagRowStarts[row+1]++;
-  }
-  for(dlong n=0;n<offdNNZ;++n) {
-    dlong row = offdAi[n];
-    A->offdRowStarts[row+1]++;
-  }
-  //cumulative sum
-  for (dlong i=0;i<A->Nrows;i++) {
-    A->diagRowStarts[i+1] += A->diagRowStarts[i];
-    A->offdRowStarts[i+1] += A->offdRowStarts[i];
-  }
-
-  //copy input data into struct
-  if (diagNNZ) {
-    for (dlong i=0; i<N; i++) {
-      dlong start = A->diagRowStarts[i];
-      int cnt = 1;
-      for (dlong j=A->diagRowStarts[i]; j<A->diagRowStarts[i+1]; j++) {
-        if (diagAj[j] == i) { //move diagonal to first entry
-          A->diagCols[start]  = diagAj[j];
-          A->diagCoefs[start] = diagAvals[j];
-        } else {
-          A->diagCols[start+cnt]  = diagAj[j];
-          A->diagCoefs[start+cnt] = diagAvals[j];
-          cnt++;
-        }
-      }
-    }
-  }
-
-  //record global indexing of columns
-  A->colMap = (hlong *)   calloc(A->Ncols, sizeof(hlong));
-  for (dlong i=0;i<A->Ncols;i++)
-    A->colMap[i] = i + globalOffset;
-
-  if (offdNNZ) {
-    for (dlong i=0; i<N; i++) {
-      dlong start = A->offdRowStarts[i];
-      int cnt = 0;
-      for (dlong j=A->offdRowStarts[i]; j<A->offdRowStarts[i+1]; j++) {
-        offdCols[start+cnt]  = offdAj[j];
-        A->offdCoefs[start+cnt] = offdAvals[j];
-        cnt++;
-      }
-    }
-
-    //we now need to reorder the x vector for the halo, and shift the column indices
-    hlong *col = (hlong *) calloc(A->offdNNZ,sizeof(hlong));
-    for (dlong n=0;n<offdNNZ;n++)
-      col[n] = offdCols[n]; //copy non-local column global ids
-
-    //sort by global index
-    std::sort(col,col+offdNNZ);
-
-    //count unique non-local column ids
-    A->NHalo = 0;
-    for (dlong n=1;n<offdNNZ;n++)
-      if (col[n]!=col[n-1])  col[++A->NHalo] = col[n];
-    A->NHalo++; //number of unique columns
-
-    A->Ncols += A->NHalo;
-
-    //save global column ids in colMap
-    A->colMap    = (hlong *) realloc(A->colMap, A->Ncols*sizeof(hlong));
-    for (dlong n=0; n<A->NHalo; n++)
-      A->colMap[n+A->NlocalCols] = col[n];
-    free(col);
-
-    //shift the column indices to local indexing
-    for (dlong n=0;n<offdNNZ;n++) {
-      hlong gcol = offdCols[n];
-      for (dlong m=A->NlocalCols;m<A->Ncols;m++) {
-        if (gcol == A->colMap[m])
-          A->offdCols[n] = m;
-      }
-    }
-  }
-
-  if (diagNNZ) {
-    free(diagAi);
-    free(diagAj);
-    free(diagAvals);
-  }
-  if (offdNNZ) {
-    free(offdAi);
-    free(offdAj);
-    free(offdAvals);
-    free(offdCols);
-  }
-
-  csrHaloSetup(A,globalRowStarts);
-
-  return A;
-}
-
-void freeCSR(csr *A) {
-  if (A->diagNNZ) {
-    free(A->diagRowStarts);
-    free(A->diagCols);
-    free(A->diagCoefs);
-  }
-  if (A->offdNNZ) {
-    free(A->offdRowStarts);
-    free(A->offdCols);
-    free(A->offdCoefs);
-  }
-  if (A->Ncols) {
-    free(A->colMap);
-  }
-  free(A->haloSendRequests);
-  free(A->haloRecvRequests);
-  free(A->NsendPairs);
-  free(A->NrecvPairs);
-  if (A->NsendTotal) {
-    free(A->sendBuffer);
-    free(A->haloElementList);
-  }
-
-  free(A);
-}
-
-//create a device version of a coo matrix
-dcoo *newDCOO(parAlmond_t *parAlmond, csr *B){
-
-  dcoo *A = (dcoo *) calloc(1,sizeof(dcoo));
-
-  A->Nrows  = B->Nrows;
-  A->Ncols  = B->Ncols;
-
-  A->NHalo = B->NHalo;
-  A->NlocalCols = B->NlocalCols;
-
-  A->diagNNZ = B->diagNNZ;
-  A->offdNNZ = B->offdNNZ;
-
-  dlong *diagRows;
-  dlong *offdRows;
-  if (B->diagNNZ)
-    diagRows = (dlong *) calloc(B->diagNNZ,sizeof(dlong));
-  if (B->offdNNZ)
-    offdRows = (dlong *) calloc(B->offdNNZ,sizeof(dlong));
-
-  dlong diagCnt =0;
-  dlong offdCnt =0;
-  for (dlong i=0;i<B->Nrows;i++) {
-    for (dlong j=B->diagRowStarts[i];j<B->diagRowStarts[i+1];j++)
-      diagRows[diagCnt++] = i;
-
-    for (dlong j=B->offdRowStarts[i];j<B->offdRowStarts[i+1];j++)
-      offdRows[offdCnt++] = i;
-  }
-
-  //copy to device
-  if(B->diagNNZ){
-    A->o_diagRows  = parAlmond->device.malloc(A->diagNNZ*sizeof(dlong),   diagRows);
-    A->o_diagCols  = parAlmond->device.malloc(A->diagNNZ*sizeof(dlong),   B->diagCols);
-    A->o_diagCoefs = parAlmond->device.malloc(A->diagNNZ*sizeof(dfloat), B->diagCoefs);
-  }
-  if(B->offdNNZ){
-    A->o_offdRows  = parAlmond->device.malloc(A->offdNNZ*sizeof(dlong), offdRows);
-    A->o_offdCols  = parAlmond->device.malloc(A->offdNNZ*sizeof(dlong),   B->offdCols);
-    A->o_offdCoefs = parAlmond->device.malloc(A->offdNNZ*sizeof(dfloat), B->offdCoefs);
-  }
-
-  A->NrecvTotal = B->NrecvTotal;
-  A->NsendTotal = B->NsendTotal;
-  A->haloElementList = B->haloElementList;
-  if (A->NsendTotal)
-    A->o_haloElementList = parAlmond->device.malloc(A->NsendTotal*sizeof(dlong),A->haloElementList);
-  A->NsendPairs = B->NsendPairs;
-  A->NrecvPairs = B->NrecvPairs;
-  A->NsendMessages = B->NsendMessages;
-  A->NrecvMessages = B->NrecvMessages;
-  
-  if (A->NrecvTotal) A->recvBuffer = (dfloat *) malloc(A->NrecvTotal*sizeof(dfloat));
-  if (A->NsendTotal) {
-#if 0
-    occa::memory o_haloBuffer = parAlmond->device.mappedAlloc(A->NsendTotal*sizeof(dfloat), NULL);
-    A->sendBuffer = (dfloat*) o_haloBuffer.getMappedPointer();
-#endif
-    A->sendBuffer = (dfloat*) occaHostMallocPinned(parAlmond->device, A->NsendTotal*sizeof(dfloat), NULL, A->o_haloBuffer);
-  }
-
-  A->haloSendRequests = B->haloSendRequests;
-  A->haloRecvRequests = B->haloRecvRequests;
-
-  return A;
-}
-
-hyb * newHYB(parAlmond_t *parAlmond, csr *csrA) {
-
-  hyb *A = (hyb *) calloc(1,sizeof(hyb));
-
-  A->Nrows  = csrA->Nrows;
-  A->Ncols  = csrA->Ncols;
-
-  A->NlocalCols = csrA->NlocalCols;
-  A->NHalo = csrA->NHalo;
-
-  int *rowCounters;
-  if (csrA->Nrows)   
-    rowCounters = (int*) calloc(csrA->Nrows, sizeof(int));
-
-  int maxNnzPerRow = 0;
-  int minNnzPerRow = 0;
-  if (csrA->Nrows)
-    minNnzPerRow = (int) csrA->diagRowStarts[1] - csrA->diagRowStarts[0];
-
-  for(dlong i=0; i<csrA->Nrows; i++) {
-    int rowNnz = (int) csrA->diagRowStarts[i+1] - csrA->diagRowStarts[i];
-    rowCounters[i] = rowNnz;
-
-    maxNnzPerRow = (rowNnz > maxNnzPerRow) ? rowNnz : maxNnzPerRow;
-    minNnzPerRow = (rowNnz < minNnzPerRow) ? rowNnz : minNnzPerRow;
-  }
-
-  // create bins
-  int numBins = maxNnzPerRow - minNnzPerRow + 1;
-
-  //zero row check
-  if (numBins<0) numBins =0;
-
-  int *bins;
-  if (numBins)
-    bins = (int *) calloc(numBins, sizeof(int));
-
-  for(dlong i=0; i<csrA->Nrows; i++){
-    bins[rowCounters[i]-minNnzPerRow]++;
-  }
-
-  dfloat threshold = 2.0/3.0;
-  dlong totalNNZ = csrA->diagNNZ+csrA->offdNNZ;
-  int nnzPerRow = 0;
-  dlong nnz = 0;
-
-  //increase the nnz per row in E until it holds threshold*totalnnz nonzeros
-  for(int i=0; i<numBins; i++){
-    nnz += bins[i] * (i+minNnzPerRow);
-    if((nnz > threshold*totalNNZ)||(i==numBins-1)){
-      nnzPerRow = i+minNnzPerRow;
-      break;
-    }
-  }
-
-  A->E = (ell *) calloc(1, sizeof(ell));
-
-  A->E->Nrows = csrA->Nrows;
-  A->E->Ncols = csrA->Ncols;
-  A->E->nnzPerRow = nnzPerRow;
-  A->E->strideLength = csrA->Nrows;
-
-  dlong *Ecols;
-  dfloat *Ecoefs;
-  if(nnzPerRow&&csrA->Nrows){
-    Ecols  = (dlong *) calloc(csrA->Nrows*nnzPerRow, sizeof(dlong));
-    Ecoefs = (dfloat *) calloc(csrA->Nrows*nnzPerRow, sizeof(dfloat));
-  }
-
-  dlong nnzC = 0;
-
-  // count the number of nonzeros to be stored in coo format
-  for(dlong i=0; i<csrA->Nrows; i++) {
-    //excess from row in diag
-    if(rowCounters[i] > nnzPerRow) nnzC += (rowCounters[i] - nnzPerRow);
-
-    //all of offd
-    int offdRowNnz = (int) csrA->offdRowStarts[i+1]-csrA->offdRowStarts[i];
-
-    nnzC += offdRowNnz;
-  }
-
-  A->E->actualNNZ  = totalNNZ - nnzC;
-
-  A->C = (coo *) calloc(1, sizeof(coo));
-
-  A->C->Nrows = csrA->Nrows;
-  A->C->Ncols = csrA->Ncols;
-  A->C->nnz   = nnzC;
-
-  dlong *Coffsets;
-  dlong *Ccols;
-  dfloat *Ccoefs;
-
-  Coffsets = (dlong *) calloc(csrA->Nrows+1, sizeof(dlong));
-  if (nnzC) {
-    Ccols    = (dlong *) calloc(nnzC, sizeof(dlong));
-    Ccoefs   = (dfloat *) calloc(nnzC, sizeof(dfloat));
-  }
-
-  nnzC = 0;
-  for(dlong i=0; i<csrA->Nrows; i++){
-    dlong Jstart = csrA->diagRowStarts[i];
-    dlong Jend   = csrA->diagRowStarts[i+1];
-    int rowNnz = (int)  Jend - Jstart;
-
-    // store only min of nnzPerRow and rowNnz
-    int maxNnz = (nnzPerRow >= rowNnz) ? rowNnz : nnzPerRow;
-
-    for(int c=0; c<maxNnz; c++){
-      Ecols [i+c*A->E->strideLength]  = csrA->diagCols[Jstart+c];
-      Ecoefs[i+c*A->E->strideLength]  = csrA->diagCoefs[Jstart+c];
-    }
-
-    // store the remaining in coo format
-    if(rowNnz > nnzPerRow){
-      for(int c=nnzPerRow; c<rowNnz; c++){
-        Coffsets[i+1]++;
-        Ccols[nnzC]   = csrA->diagCols[Jstart+c];
-        Ccoefs[nnzC]  = csrA->diagCoefs[Jstart+c];
-        nnzC++;
-      }
-    }
-
-    //add the offd non-zeros
-    for (dlong j=csrA->offdRowStarts[i];j<csrA->offdRowStarts[i+1];j++) {
-      Coffsets[i+1]++;
-      Ccols[nnzC]   = csrA->offdCols[j];
-      Ccoefs[nnzC]  = csrA->offdCoefs[j];
-      nnzC++;
-    }
-  }
-
-  //use counts to create offsets
-  for (dlong i=0;i<csrA->Nrows;i++)
-    Coffsets[i+1] += Coffsets[i];
-
-  // copy the data to device memory
-  if(csrA->Nrows) {
-    free(rowCounters); free(bins);
-  }
-
-  //copy null vector if present
-  if(csrA->null&&csrA->Nrows) 
-    A->o_null = parAlmond->device.malloc(csrA->Nrows*sizeof(dfloat), csrA->null);
-
-  if (csrA->diagInv&&csrA->Nrows)
-    A->o_diagInv = parAlmond->device.malloc(csrA->Nrows*sizeof(dfloat), csrA->diagInv);
-
-  if(A->E->nnzPerRow&&csrA->Nrows){
-    A->E->o_cols  = parAlmond->device.malloc(csrA->Nrows*nnzPerRow*sizeof(dlong), Ecols);
-    A->E->o_coefs = parAlmond->device.malloc(csrA->Nrows*nnzPerRow*sizeof(dfloat), Ecoefs);
-    free(Ecols); free(Ecoefs);
-  }
-
-  if(A->C->nnz){
-    A->C->o_offsets = parAlmond->device.malloc((csrA->Nrows+1)*sizeof(dlong), Coffsets);
-    A->C->o_cols    = parAlmond->device.malloc(A->C->nnz*sizeof(dlong), Ccols);
-    A->C->o_coefs   = parAlmond->device.malloc(A->C->nnz*sizeof(dfloat), Ccoefs);
-
-    free(Ccols); free(Ccoefs);
-  }
-
-  free(Coffsets);
-
-  A->NrecvTotal = csrA->NrecvTotal;
-  A->NsendTotal = csrA->NsendTotal;
-  A->haloElementList = csrA->haloElementList;
-  if (A->NsendTotal) A->o_haloElementList = parAlmond->device.malloc(A->NsendTotal*sizeof(dlong),A->haloElementList);
-  A->NsendPairs = csrA->NsendPairs;
-  A->NrecvPairs = csrA->NrecvPairs;
-  A->NsendMessages = csrA->NsendMessages;
-  A->NrecvMessages = csrA->NrecvMessages;
-  A->haloSendRequests = csrA->haloSendRequests;
-  A->haloRecvRequests = csrA->haloRecvRequests;
-
-  if (A->NrecvTotal) A->recvBuffer = (dfloat *) malloc(A->NrecvTotal*sizeof(dfloat));
-  if (A->NsendTotal) {
-#if 0
-    occa::memory o_haloBuffer = parAlmond->device.mappedAlloc(A->NsendTotal*sizeof(dfloat), NULL);
-    A->sendBuffer = (dfloat*) o_haloBuffer.getMappedPointer();
-#endif
-    A->sendBuffer = (dfloat*) occaHostMallocPinned(parAlmond->device, A->NsendTotal*sizeof(dfloat), NULL, A->o_haloBuffer);
-  }
-
-  return A;
-}
-
-
-void axpy(csr *A, dfloat alpha, dfloat *x, dfloat beta, dfloat *y, bool nullSpace, dfloat nullSpacePenalty) {
-
-  dfloat alphaG = 0.;
-
-  if (A->NsendTotal + A->NrecvTotal)
-    csrHaloExchangeStart(A, sizeof(dfloat), x, A->sendBuffer, x+A->NlocalCols);
-
-  // y[i] = beta*y[i] + alpha* (sum_{ij} Aij*x[j])
-  #pragma omp parallel for
-  for(dlong i=0; i<A->Nrows; i++){ //local
-    dfloat result = 0.0;
-    for(dlong jj=A->diagRowStarts[i]; jj<A->diagRowStarts[i+1]; jj++)
-      result += (A->diagCoefs[jj]*x[A->diagCols[jj]]);
-
-    y[i] = alpha*result + beta*y[i];
-  }
-
-  //rank 1 correction if there is a nullspace
-  if (nullSpace) {
-    dfloat alphaL = innerProd(A->Nrows, A->null, x);
-    MPI_Allreduce(&alphaL, &alphaG, 1, MPI_DFLOAT, MPI_SUM, agmg::comm);
-    alphaG *= nullSpacePenalty;
-  }
-
-  if (A->NsendTotal + A->NrecvTotal)
-    csrHaloExchangeFinish(A);
-
-  #pragma omp parallel for
-  for(dlong i=0; i<A->Nrows; i++){ //nonlocal
-    dfloat result = 0.0;
-    for(dlong jj=A->offdRowStarts[i]; jj<A->offdRowStarts[i+1]; jj++)
-      result += (A->offdCoefs[jj]*x[A->offdCols[jj]]);
-
-    y[i] += alpha*result;
-  }
-
-  //add the correction
-  if (nullSpace) 
-    vectorAdd(A->Nrows, alpha*alphaG, A->null, 1., y);
-}
-
-void axpy(parAlmond_t *parAlmond, dcoo *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y) {
-
-  occaTimerTic(parAlmond->device,"dcoo axpy");
-  if (A->NsendTotal) {
-    parAlmond->device.finish();
-    parAlmond->device.setStream(parAlmond->dataStream);
-    parAlmond->haloExtract(A->NsendTotal, 1, A->o_haloElementList, o_x, A->o_haloBuffer);
-
-    //copy from device
-    A->o_haloBuffer.copyTo(A->sendBuffer,"async: true");
-    parAlmond->device.setStream(parAlmond->defaultStream);
-  }
-
-  if (A->NsendTotal + A->NrecvTotal){
-    parAlmond->device.setStream(parAlmond->dataStream);
-    parAlmond->device.finish();
-    dcooHaloExchangeStart(A, sizeof(dfloat), A->sendBuffer, A->recvBuffer);
-    parAlmond->device.setStream(parAlmond->defaultStream);
-  }
-
-  if (A->diagNNZ)
-    parAlmond->agg_interpolateKernel(A->diagNNZ, A->o_diagRows, A->o_diagCols, A->o_diagCoefs, o_x, o_y);
-
-  if (A->NsendTotal + A->NrecvTotal)
-    dcooHaloExchangeFinish(A);
-
-  //copy back to device
-  if(A->NrecvTotal){
-    parAlmond->device.setStream(parAlmond->dataStream);
-    o_x.copyFrom(A->recvBuffer,A->NrecvTotal*sizeof(dfloat),A->NlocalCols*sizeof(dfloat),"async: true");
-    parAlmond->device.finish();
-    parAlmond->device.setStream(parAlmond->defaultStream);
-    parAlmond->device.finish();
-  }
-
-  if (A->offdNNZ)
-    parAlmond->agg_interpolateKernel(A->offdNNZ, A->o_offdRows, A->o_offdCols, A->o_offdCoefs, o_x, o_y);
-
-  occaTimerToc(parAlmond->device,"dcoo axpy");
-}
-
-void axpy(parAlmond_t *parAlmond, hyb *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y, bool nullSpace, dfloat nullSpacePenalty) {
-
-  dfloat alphaG = 0.;
-
-  occaTimerTic(parAlmond->device,"hyb axpy");
-  if (A->NsendTotal) {
-    parAlmond->device.finish();
-    parAlmond->device.setStream(parAlmond->dataStream);
-
-    parAlmond->haloExtract(A->NsendTotal, 1, A->o_haloElementList, o_x, A->o_haloBuffer);
-
-    //copy from device
-    A->o_haloBuffer.copyTo(A->sendBuffer,"async: true");
-
-    parAlmond->device.setStream(parAlmond->defaultStream);
-  }
-
-  // y <-- alpha*E*x+beta*y
-  axpy(parAlmond, A->E, alpha, o_x, beta, o_y);
-
-  if (A->NsendTotal+A->NrecvTotal){
-    parAlmond->device.setStream(parAlmond->dataStream); 
-    parAlmond->device.finish();
-    hybHaloExchangeStart(A, sizeof(dfloat),A->sendBuffer, A->recvBuffer);
-    parAlmond->device.setStream(parAlmond->defaultStream);
-  }
-
-  //rank 1 correction if there is a nullspace
-  if (nullSpace) {
-    dfloat alphaL = innerProd(parAlmond, A->Nrows, A->o_null, o_x);
-    MPI_Allreduce(&alphaL, &alphaG, 1, MPI_DFLOAT, MPI_SUM, agmg::comm);
-    alphaG *= nullSpacePenalty;
-  }
-
-  if (A->NsendTotal+A->NrecvTotal)
-    hybHaloExchangeFinish(A);
-
-  //copy back to device
-  if (A->NrecvTotal){
-    parAlmond->device.setStream(parAlmond->dataStream);
-    o_x.copyFrom(A->recvBuffer,A->NrecvTotal*sizeof(dfloat),A->NlocalCols*sizeof(dfloat),"async: true");
-    parAlmond->device.finish();
-    parAlmond->device.setStream(parAlmond->defaultStream);
-    parAlmond->device.finish();
-  }
-
-  // y <-- alpha*C*x + y
-  if (A->C->nnz)
-    ax(parAlmond, A->C, alpha, o_x, o_y);
-
-  //add the correction
-  if (nullSpace) 
-    vectorAdd(parAlmond, A->Nrows, alpha*alphaG, A->o_null, 1., o_y);
-
-  occaTimerToc(parAlmond->device,"hyb axpy");
-}
-
-void axpy(parAlmond_t *parAlmond, ell *A, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y) {
-
-  if(A->actualNNZ){
-    occaTimerTic(parAlmond->device,"ell axpy");
-    parAlmond->ellAXPYKernel(A->Nrows, A->nnzPerRow, A->strideLength,
-                          alpha, beta, A->o_cols, A->o_coefs, o_x, o_y);
-    occaTimerToc(parAlmond->device,"ell axpy");
-  }
-}
-
-void ax(parAlmond_t *parAlmond, coo *C, dfloat alpha, occa::memory o_x, occa::memory o_y) {
-
-  // do block-wise product
-  if(C->nnz){
-    occaTimerTic(parAlmond->device,"coo ax");
-    parAlmond->cooAXKernel(C->Nrows, alpha, C->o_offsets, C->o_cols, C->o_coefs,o_x, o_y);
-    occaTimerToc(parAlmond->device,"coo ax");
-  }
-}
-
-void smoothJacobi(parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero) {
-
-  // x = x + inv(D)*(b-A*x)
-  if(x_is_zero){
-    #pragma omp parallel for
-    for(dlong i=0; i<A->Nrows; i++){
-      x[i] = A->diagInv[i]*r[i];
-    }
-    return;
-  }
-
-  dfloat *res = level->smootherResidual;
-  #pragma omp parallel for
-  for(dlong i=0; i<A->Nrows; i++){
-    res[i] = r[i];
-  }
-
-  axpy(A, -1.0, x, 1.0, res,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-
-  // update x
-  #pragma omp parallel for
-  for (dlong i=0;i<A->Nrows;i++)
-    x[i] = x[i] + A->diagInv[i]*res[i];
-
-}
-
-
-void smoothDampedJacobi(parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero) {
-
-  // dfloat alphaG = 0.;
-  dfloat alpha = level->smoother_params[0];
-
-  if(x_is_zero){
-  #pragma omp parallel for
-    for(dlong i=0; i<A->Nrows; i++){
-      x[i] = alpha*A->diagInv[i]*r[i];
-    }
-    return;
-  }
-
-  dfloat *res = level->smootherResidual;
-  #pragma omp parallel for
-  for(dlong i=0; i<A->Nrows; i++){
-    res[i] = r[i];
-  }
-
-  axpy(A, -1.0, x, 1.0, res,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-
-  // copy the buffer vector to x
-  #pragma omp parallel for
-  for (dlong i=0;i<A->Nrows;i++)
-    x[i] = x[i] + alpha*A->diagInv[i]*res[i];
-}
-
-void smoothChebyshev(parAlmond_t *parAlmond, agmgLevel *level, csr *A, dfloat *r, dfloat *x, bool x_is_zero) {
-
-  dfloat lambdaN = level->smoother_params[0];
-  dfloat lambda1 = level->smoother_params[1];
-
-  dfloat theta = 0.5*(lambdaN+lambda1);
-  dfloat delta = 0.5*(lambdaN-lambda1);
-  dfloat invTheta = 1.0/theta;
-  dfloat sigma = theta/delta;
-  dfloat rho_n = 1./sigma;
-  dfloat rho_np1;
-
-  dfloat *res = level->smootherResidual;
-  dfloat *Ad  = level->smootherResidual2;
-  dfloat *d   = level->smootherUpdate;
-
-  // dfloat alphaG = 0.;
-
-  if(x_is_zero){ //skip the Ax if x is zero
-    #pragma omp parallel for
-    for(dlong i=0; i<A->Nrows; i++){
-      res[i] = A->diagInv[i]*r[i];
-      x[i] = 0.;
-      d[i] = invTheta*res[i];
-    }
-  } else {
-
-    level->Ax(level->AxArgs,x,res);
-
-    #pragma omp parallel for
-    for(dlong i=0; i<A->Nrows; i++){
-      res[i] = A->diagInv[i]*(r[i]-res[i]);
-      d[i]   = invTheta*res[i];
-    }
-  }
-
-  for (int k=0;k<level->ChebyshevIterations;k++) {
-    //x_k+1 = x_k + d_k
-    vectorAdd(A->Nrows, 1.0, d, 1.0, x);
-
-    //r_k+1 = r_k - D^{-1}Ad_k
-    level->Ax(level->AxArgs,d,Ad);
-    #pragma omp parallel for
-    for(dlong i=0; i<A->Nrows; i++) {
-      res[i] = res[i] - A->diagInv[i]*Ad[i];
-    }
-
-    rho_np1 = 1.0/(2.*sigma-rho_n);
-
-    //d_k+1 = rho_k+1*rho_k*d_k  + 2*rho_k+1*r_k+1/delta
-    vectorAdd(A->Nrows, 2.0*rho_np1/delta, res, rho_np1*rho_n, d);
-    rho_n = rho_np1;
-  }
-  //x_k+1 = x_k + d_k
-  vectorAdd(A->Nrows, 1.0, d, 1.0, x);
-}
-
-void smoothJacobi(parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero) {
-
-  // dfloat alphaG = 0.;
-
-  occaTimerTic(parAlmond->device,"hyb smoothJacobi");
-  if(x_is_zero){
-    if (A->Nrows)
-      dotStar(parAlmond, A->Nrows, 1.0, A->o_diagInv, o_r, 0.0, o_x);
-    occaTimerToc(parAlmond->device,"hyb smoothJacobi");
-    return;
-  }
-
-  occa::memory o_res = level->o_smootherResidual;
-
-  o_res.copyFrom(o_r,A->Nrows*sizeof(dfloat));
-  axpy(parAlmond, A, -1.0, o_x, 1.0, o_res,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-
-  // x = x + inv(D)*(r-A*x)
-  dotStar(parAlmond, A->Nrows, 1.0, A->o_diagInv, o_res, 1.0, o_x);
-  occaTimerToc(parAlmond->device,"hyb smoothJacobi");
-}
-
-void smoothDampedJacobi(parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero){
-
-  // dfloat alphaG = 0.;
-  dfloat alpha = level->smoother_params[0];
-
-  occaTimerTic(parAlmond->device,"hyb smoothDampedJacobi");
-  if(x_is_zero){
-    if (A->Nrows)
-      dotStar(parAlmond, A->Nrows, alpha, A->o_diagInv, o_r, 0.0, o_x);
-    occaTimerToc(parAlmond->device,"hyb smoothDampedJacobi");
-    return;
-  }
-
-  occa::memory o_res = level->o_smootherResidual;
-
-  o_res.copyFrom(o_r,A->Nrows*sizeof(dfloat));
-  axpy(parAlmond, A, -1.0, o_x, 1.0, o_res,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-
-  // x = x + alpha*inv(D)*(r-A*x)
-  dotStar(parAlmond, A->Nrows, alpha, A->o_diagInv, o_res, 1.0, o_x);
-  occaTimerToc(parAlmond->device,"hyb smoothDampedJacobi");
-}
-
-void smoothChebyshev(parAlmond_t *parAlmond, agmgLevel *level, hyb *A, occa::memory o_r, occa::memory o_x, bool x_is_zero) {
-
-  dfloat lambdaN = level->smoother_params[0];
-  dfloat lambda1 = level->smoother_params[1];
-
-  dfloat theta = 0.5*(lambdaN+lambda1);
-  dfloat delta = 0.5*(lambdaN-lambda1);
-  dfloat invTheta = 1.0/theta;
-  dfloat sigma = theta/delta;
-  dfloat rho_n = 1./sigma;
-  dfloat rho_np1;
-
-  occa::memory o_res = level->o_smootherResidual;
-  occa::memory o_Ad  = level->o_smootherResidual2;
-  occa::memory o_d   = level->o_smootherUpdate;
-
-  // dfloat alphaG = 0.;
-
-  occaTimerTic(parAlmond->device,"hyb smoothChebyshev");
-
-  if(x_is_zero){ //skip the Ax if x is zero
-    //res = D^{-1}r
-    dotStar(parAlmond, A->Nrows, 1.0, A->o_diagInv, o_r, 0.0, o_res);
-    setVector(parAlmond, A->Nrows, o_x, 0.0);
-    //d = invTheta*res
-    vectorAdd(parAlmond, A->Nrows, invTheta, o_res, 0.0, o_d);
-
-  } else {
-
-    //res = D^{-1}(r-Ax)
-    level->device_Ax(level->AxArgs,o_x,o_res);
-    vectorAdd(parAlmond, A->Nrows, 1.0, o_r, -1.0, o_res);
-    dotStar(parAlmond, A->Nrows, A->o_diagInv, o_res);
-
-    //d = invTheta*res
-    vectorAdd(parAlmond, A->Nrows, invTheta, o_res, 0.0, o_d);
-  }
-
-  for (int k=0;k<level->ChebyshevIterations;k++) {
-    //x_k+1 = x_k + d_k
-    vectorAdd(parAlmond, A->Nrows, 1.0, o_d, 1.0, o_x);
-
-    //r_k+1 = r_k - D^{-1}Ad_k
-    level->device_Ax(level->AxArgs,o_d,o_Ad);
-    dotStar(parAlmond, A->Nrows, -1.0, A->o_diagInv, o_Ad, 1.0, o_res);
-
-    rho_np1 = 1.0/(2.*sigma-rho_n);
-
-    //d_k+1 = rho_k+1*rho_k*d_k  + 2*rho_k+1*r_k+1/delta
-    vectorAdd(parAlmond, A->Nrows, 2.0*rho_np1/delta, o_res, rho_np1*rho_n, o_d);
-    rho_n = rho_np1;
-  }
-  //x_k+1 = x_k + d_k
-  vectorAdd(parAlmond, A->Nrows, 1.0, o_d, 1.0, o_x);
-
-  occaTimerToc(parAlmond->device,"hyb smoothChebyshev");
-}
-
-
-// set up halo infomation for inter-processor MPI
-// exchange of trace nodes
-void csrHaloSetup(csr *A, hlong *globalColStarts){
-
-  // MPI info
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-  
-  // non-blocking MPI isend/irecv requests (used in meshHaloExchange)
-  A->haloSendRequests = calloc(size, sizeof(MPI_Request));
-  A->haloRecvRequests = calloc(size, sizeof(MPI_Request));
-
-  // count number of halo element nodes to swap
-  A->NrecvTotal = 0;
-  A->NsendPairs = (int*) calloc(size, sizeof(int));
-  A->NrecvPairs = (int*) calloc(size, sizeof(int));
-  for(dlong n=A->NlocalCols;n<A->Ncols;++n){ //for just the halo
-    hlong id = A->colMap[n]; // global index
-    for (int r=0;r<size;r++) { //find owner's rank
-      if (globalColStarts[r]-1<id && id < globalColStarts[r+1]) {
-        A->NrecvTotal++;
-        A->NrecvPairs[r]++;
-      }
-    }
-  }
-
-  MPI_Alltoall(A->NrecvPairs, 1, MPI_INT, A->NsendPairs, 1, MPI_INT, agmg::comm);
-
-  A->NsendTotal = 0;
-  for (int r=0;r<size;r++)
-    A->NsendTotal += A->NsendPairs[r];
-
-  hlong *ghaloElementList;
-  if (A->NsendTotal) {
-    ghaloElementList   = (hlong *) calloc(A->NsendTotal,sizeof(hlong));
-    A->haloElementList = (dlong *) calloc(A->NsendTotal,sizeof(dlong));
-  }
-
-  // count number of MPI messages in halo exchange
-  A->NsendMessages = 0;
-  A->NrecvMessages = 0;
-  for(int r=0;r<size;++r) {
-    if(A->NsendPairs[r])
-      A->NsendMessages++;
-    if(A->NrecvPairs[r])
-      A->NrecvMessages++;
-  }
-
-  //exchange the needed ids
-  int tag = 999;
-  dlong recvOffset = A->NlocalCols;
-  int sendOffset = 0;
-  int sendMessage = 0, recvMessage = 0;
-  for(int r=0;r<size;++r){
-     if(A->NsendPairs[r]) {
-      MPI_Irecv(ghaloElementList+sendOffset, A->NsendPairs[r], MPI_HLONG, r, tag,
-          agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage);
-      sendOffset += A->NsendPairs[r];
-      ++sendMessage;
-    }
-    if(A->NrecvPairs[r]){
-      MPI_Isend(A->colMap+recvOffset, A->NrecvPairs[r], MPI_HLONG, r, tag,
-          agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage);
-      recvOffset += A->NrecvPairs[r];
-      ++recvMessage;
-    }
-  }
-
-  // Wait for all sent messages to have left and received messages to have arrived
-  MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status));
-  MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status));
-
-  MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus);
-  MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus);
-
-  free(recvStatus);
-  free(sendStatus);
-
-  //shift to local ids
-  for (int n=0;n<A->NsendTotal;n++)
-    A->haloElementList[n] = (dlong) ghaloElementList[n] - globalColStarts[rank];
-
-  if (A->NsendTotal)
-    A->sendBuffer = (dfloat *) calloc(A->NsendTotal,sizeof(dfloat));
-
-  A->totalHaloPairs = A->NsendTotal+A->NrecvTotal;
-}
-
-void csrHaloExchange(csr *A,
-                    size_t Nbytes,         // message size per element
-                    void *sourceBuffer,
-                    void *sendBuffer,    // temporary buffer
-                    void *recvBuffer) {
-  // MPI info
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-  
-  int tag = 999;
-
-  // copy data from outgoing elements into temporary send buffer
-  for(int i=0;i<A->NsendTotal;++i){
-    // outgoing element
-    dlong id = A->haloElementList[i];
-
-    memcpy(((char*)sendBuffer)+i*Nbytes, ((char*)sourceBuffer)+id*Nbytes, Nbytes);
-  }
-
-  // initiate immediate send  and receives to each other process as needed
-  int recvOffset = 0;
-  int sendOffset = 0;
-  int sendMessage = 0, recvMessage = 0;
-  for(int r=0;r<size;++r){
-    if (A->NrecvTotal) {
-      if(A->NrecvPairs[r]) {
-        MPI_Irecv(((char*)recvBuffer)+recvOffset, A->NrecvPairs[r]*Nbytes, MPI_CHAR, r, tag,
-            agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage);
-        recvOffset += A->NrecvPairs[r]*Nbytes;
-        ++recvMessage;
-      }
-    }
-    if (A->NsendTotal) {
-      if(A->NsendPairs[r]){
-        MPI_Isend(((char*)sendBuffer)+sendOffset, A->NsendPairs[r]*Nbytes, MPI_CHAR, r, tag,
-            agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage);
-        sendOffset += A->NsendPairs[r]*Nbytes;
-        ++sendMessage;
-      }
-    }
-  }
-
-  // Wait for all sent messages to have left and received messages to have arrived
-  if (A->NsendTotal) {
-    MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus);
-    free(sendStatus);
-  }
-  if (A->NrecvTotal) {
-    MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus);
-    free(recvStatus);
-  }
-}
-
-void csrHaloExchangeStart(csr *A,
-                    size_t Nbytes,         // message size per element
-                    void *sourceBuffer,
-                    void *sendBuffer,    // temporary buffer
-                    void *recvBuffer) {
-  // MPI info
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-  
-  int tag = 999;
-
-  // copy data from outgoing elements into temporary send buffer
-  for(int i=0;i<A->NsendTotal;++i){
-    // outgoing element
-    dlong id = A->haloElementList[i];
-
-    memcpy(((char*)sendBuffer)+i*Nbytes, ((char*)sourceBuffer)+id*Nbytes, Nbytes);
-  }
-
-  // initiate immediate send  and receives to each other process as needed
-  int recvOffset = 0;
-  int sendOffset = 0;
-  int sendMessage = 0, recvMessage = 0;
-  for(int r=0;r<size;++r){
-    if (A->NrecvTotal) {
-      if(A->NrecvPairs[r]) {
-        MPI_Irecv(((char*)recvBuffer)+recvOffset, A->NrecvPairs[r]*Nbytes, MPI_CHAR, r, tag,
-            agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage);
-        recvOffset += A->NrecvPairs[r]*Nbytes;
-        ++recvMessage;
-      }
-    }
-    if (A->NsendTotal) {
-      if(A->NsendPairs[r]){
-        MPI_Isend(((char*)sendBuffer)+sendOffset, A->NsendPairs[r]*Nbytes, MPI_CHAR, r, tag,
-            agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage);
-        sendOffset += A->NsendPairs[r]*Nbytes;
-        ++sendMessage;
-      }
-    }
-  }
-}
-
-void csrHaloExchangeFinish(csr *A) {
-  // Wait for all sent messages to have left and received messages to have arrived
-  if (A->NsendTotal) {
-    MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus);
-    free(sendStatus);
-  }
-  if (A->NrecvTotal) {
-    MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus);
-    free(recvStatus);
-  }
-}
-
-void dcooHaloExchangeStart(dcoo *A, size_t Nbytes, void *sendBuffer, void *recvBuffer) {
-  // MPI info
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  // count outgoing and incoming meshes
-  int tag = 999;
-
-  // initiate immediate send  and receives to each other process as needed
-  int recvOffset = 0;
-  int sendOffset = 0;
-  int sendMessage = 0, recvMessage = 0;
-  for(int r=0;r<size;++r){
-    if (A->NrecvTotal) {
-      if(A->NrecvPairs[r]) {
-        MPI_Irecv(((char*)A->recvBuffer)+recvOffset, A->NrecvPairs[r]*Nbytes, MPI_CHAR, r, tag,
-            agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage);
-        recvOffset += A->NrecvPairs[r]*Nbytes;
-        ++recvMessage;
-      }
-    }
-    if (A->NsendTotal) {
-      if(A->NsendPairs[r]){
-        MPI_Isend(((char*)A->sendBuffer)+sendOffset, A->NsendPairs[r]*Nbytes, MPI_CHAR, r, tag,
-            agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage);
-        sendOffset += A->NsendPairs[r]*Nbytes;
-        ++sendMessage;
-      }
-    }
-  }
-}
-
-void dcooHaloExchangeFinish(dcoo *A) {
-  // Wait for all sent messages to have left and received messages to have arrived
-  if (A->NsendTotal) {
-    MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus);
-    free(sendStatus);
-  }
-  if (A->NrecvTotal) {
-    MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus);
-    free(recvStatus);
-  }
-}
-
-void hybHaloExchangeStart(hyb *A, size_t Nbytes, void *sendBuffer, void *recvBuffer) {
-  // MPI info
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  // count outgoing and incoming meshes
-  int tag = 999;
-
-  // initiate immediate send  and receives to each other process as needed
-  int recvOffset = 0;
-  int sendOffset = 0;
-  int sendMessage = 0, recvMessage = 0;
-  for(int r=0;r<size;++r){
-    if (A->NrecvTotal) {
-      if(A->NrecvPairs[r]) {
-        MPI_Irecv(((char*)recvBuffer)+recvOffset, A->NrecvPairs[r]*Nbytes, MPI_CHAR, r, tag,
-            agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage);
-        recvOffset += A->NrecvPairs[r]*Nbytes;
-        ++recvMessage;
-      }
-    }
-    if (A->NsendTotal) {
-      if(A->NsendPairs[r]){
-        MPI_Isend(((char*)sendBuffer)+sendOffset, A->NsendPairs[r]*Nbytes, MPI_CHAR, r, tag,
-            agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage);
-        sendOffset += A->NsendPairs[r]*Nbytes;
-        ++sendMessage;
-      }
-    }
-  }
-}
-
-void hybHaloExchangeFinish(hyb *A) {
-  // Wait for all sent messages to have left and received messages to have arrived
-  if (A->NsendTotal) {
-    MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus);
-    free(sendStatus);
-  }
-  if (A->NrecvTotal) {
-    MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus);
-    free(recvStatus);
-  }
-}
-
diff --git a/solvers/parALMOND/src/agmgSetup.c b/solvers/parALMOND/src/agmgSetup.c
deleted file mode 100644
index 8464ab11a..000000000
--- a/solvers/parALMOND/src/agmgSetup.c
+++ /dev/null
@@ -1,1757 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "agmg.h"
-
-csr *strong_graph(csr *A, dfloat threshold);
-bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong i);
-hlong *form_aggregates(agmgLevel *level, csr *C);
-void find_aggregate_owners(agmgLevel *level, hlong* FineToCoarse, setupAide options);
-csr *construct_interpolator(agmgLevel *level, hlong *FineToCoarse, dfloat **nullCoarseA);
-csr *transpose(agmgLevel* level, csr *A, hlong *globalRowStarts, hlong *globalColStarts);
-csr *galerkinProd(agmgLevel *level, csr *R, csr *A, csr *P);
-void coarsenAgmgLevel(agmgLevel *level, csr **coarseA, csr **P, csr **R, dfloat **nullCoarseA, setupAide options);
-
-
-void agmgSetup(parAlmond_t *parAlmond, csr *A, dfloat *nullA, hlong *globalRowStarts, setupAide options){
-
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  // approximate Nrows at coarsest level
-  int gCoarseSize = 1000;
-
-  double seed = (double) rank;
-  srand48(seed);
-
-  agmgLevel **levels = parAlmond->levels;
-
-  int lev = parAlmond->numLevels; //add this level to the end of the chain
-
-  levels[lev] = (agmgLevel *) calloc(1,sizeof(agmgLevel));
-  levels[lev]->gatherLevel = false;
-  levels[lev]->weightedInnerProds = false;
-  parAlmond->numLevels++;
-
-  //copy A matrix and null vector
-  levels[lev]->A = A;
-  levels[lev]->A->null = nullA;
-
-  levels[lev]->Nrows = A->Nrows;
-  levels[lev]->Ncols = A->Ncols;
-
-  
-  SmoothType smoothType;
-  int ChebyshevIterations=2; //default to degree 2
-  if (options.compareArgs("PARALMOND SMOOTHER", "CHEBYSHEV")) {
-    smoothType = CHEBYSHEV;
-    options.getArgs("PARALMOND CHEBYSHEV DEGREE", ChebyshevIterations);
-  } else { //default to DAMPED_JACOBI
-    smoothType = DAMPED_JACOBI;
-  }
-  levels[lev]->ChebyshevIterations = ChebyshevIterations;
-
-  setupSmoother(parAlmond, levels[lev], smoothType);
-
-  levels[lev]->deviceA = newHYB(parAlmond, levels[lev]->A);
-
-  //set operator callback
-  void **args = (void **) calloc(2,sizeof(void*));
-  args[0] = (void *) parAlmond;
-  args[1] = (void *) levels[lev];
-
-  levels[lev]->AxArgs = args;
-  levels[lev]->smoothArgs = args;
-  levels[lev]->Ax = agmgAx;
-  levels[lev]->smooth = agmgSmooth;
-  levels[lev]->device_Ax = device_agmgAx;
-  levels[lev]->device_smooth = device_agmgSmooth;
-
-  //copy global partiton
-  levels[lev]->globalRowStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  for (int r=0;r<size+1;r++)
-      levels[lev]->globalRowStarts[r] = globalRowStarts[r];
-
-  hlong localSize = (hlong) levels[lev]->A->Nrows;
-  hlong globalSize = 0;
-  MPI_Allreduce(&localSize, &globalSize, 1, MPI_HLONG, MPI_SUM, agmg::comm);
-
-  //if the system if already small, dont create MG levels
-  bool done = false;
-  if(globalSize <= gCoarseSize){
-    setupExactSolve(parAlmond, levels[lev],parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-    //setupSmoother(parAlmond, levels[lev], smoothType);
-    done = true;
-  }
-  while(!done){
-    // create coarse MG level
-    levels[lev+1] = (agmgLevel *) calloc(1,sizeof(agmgLevel));
-    dfloat *nullCoarseA;
-
-    //printf("Setting up coarse level %d\n", lev+1);
-
-    coarsenAgmgLevel(levels[lev], &(levels[lev+1]->A), &(levels[lev+1]->P),
-                                  &(levels[lev+1]->R), &nullCoarseA, parAlmond->options);
-
-    //set dimensions of the fine level (max among the A,R ops)
-    levels[lev]->Ncols = mymax(levels[lev]->Ncols, levels[lev+1]->R->Ncols);
-
-    parAlmond->numLevels++;
-
-    levels[lev+1]->A->null = nullCoarseA;
-    levels[lev+1]->Nrows = levels[lev+1]->A->Nrows;
-    levels[lev+1]->Ncols = mymax(levels[lev+1]->A->Ncols, levels[lev+1]->P->Ncols);
-    levels[lev+1]->globalRowStarts = levels[lev]->globalAggStarts;
-    
-    levels[lev+1]->ChebyshevIterations = ChebyshevIterations;
-
-    setupSmoother(parAlmond, levels[lev+1], smoothType);
-
-    levels[lev+1]->deviceA = newHYB (parAlmond, levels[lev+1]->A);
-    levels[lev+1]->deviceR = newHYB (parAlmond, levels[lev+1]->R);
-    levels[lev+1]->dcsrP   = newDCOO(parAlmond, levels[lev+1]->P);
-
-    //set operator callback
-    void **args = (void **) calloc(2,sizeof(void*));
-    args[0] = (void *) parAlmond;
-    args[1] = (void *) levels[lev+1];
-
-    levels[lev+1]->AxArgs = args;
-    levels[lev+1]->coarsenArgs = args;
-    levels[lev+1]->prolongateArgs = args;
-    levels[lev+1]->smoothArgs = args;
-
-    levels[lev+1]->Ax = agmgAx;
-    levels[lev+1]->coarsen = agmgCoarsen;
-    levels[lev+1]->prolongate = agmgProlongate;
-    levels[lev+1]->smooth = agmgSmooth;
-
-    levels[lev+1]->device_Ax = device_agmgAx;
-    levels[lev+1]->device_coarsen = device_agmgCoarsen;
-    levels[lev+1]->device_prolongate = device_agmgProlongate;
-    levels[lev+1]->device_smooth = device_agmgSmooth;
-
-    const hlong localCoarseDim = (hlong) levels[lev+1]->A->Nrows;
-    hlong globalCoarseSize;
-    MPI_Allreduce(&localCoarseDim, &globalCoarseSize, 1, MPI_HLONG, MPI_SUM, agmg::comm);
-
-    if(globalCoarseSize <= gCoarseSize || globalSize < 2*globalCoarseSize){
-      setupExactSolve(parAlmond, levels[lev+1],parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-      //setupSmoother(parAlmond, levels[lev+1], smoothType);
-      break;
-    }
-
-    globalSize = globalCoarseSize;
-    lev++;
-  } 
-  
-  //allocate vectors required
-  occa::device device = parAlmond->device;
-  for (int n=0;n<parAlmond->numLevels;n++) {
-    dlong N = levels[n]->Nrows;
-    dlong M = levels[n]->Ncols;
-
-    if ((n>0)&&(n<parAlmond->numLevels)) { //kcycle vectors
-      if (M) levels[n]->ckp1 = (dfloat *) calloc(M,sizeof(dfloat));
-      if (N) levels[n]->vkp1 = (dfloat *) calloc(N,sizeof(dfloat));
-      if (N) levels[n]->wkp1 = (dfloat *) calloc(N,sizeof(dfloat));
-
-      if (M) levels[n]->o_ckp1 = device.malloc(M*sizeof(dfloat),levels[n]->ckp1);
-      if (N) levels[n]->o_vkp1 = device.malloc(N*sizeof(dfloat),levels[n]->vkp1);
-      if (N) levels[n]->o_wkp1 = device.malloc(N*sizeof(dfloat),levels[n]->wkp1);
-    }
-    if (M) levels[n]->x    = (dfloat *) calloc(M,sizeof(dfloat));
-    if (M) levels[n]->res  = (dfloat *) calloc(M,sizeof(dfloat));
-    if (N) levels[n]->rhs  = (dfloat *) calloc(N,sizeof(dfloat));
-
-    if (M) levels[n]->o_x   = device.malloc(M*sizeof(dfloat),levels[n]->x);
-    if (M) levels[n]->o_res = device.malloc(M*sizeof(dfloat),levels[n]->res);
-    if (N) levels[n]->o_rhs = device.malloc(N*sizeof(dfloat),levels[n]->rhs);
-  }
-  //buffer for innerproducts in kcycle
-  dlong numBlocks = ((levels[0]->Nrows+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD;
-  parAlmond->rho  = (dfloat*) calloc(3*numBlocks,sizeof(dfloat));
-  parAlmond->o_rho  = device.malloc(3*numBlocks*sizeof(dfloat), parAlmond->rho); 
-}
-
-void parAlmondReport(parAlmond_t *parAlmond) {
-
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  if(rank==0) {
-    printf("------------------ParAlmond Report-----------------------------------\n");
-    printf("---------------------------------------------------------------------\n");
-    printf("level| active ranks |   dimension   |  nnzs         |  nnz/row      |\n");
-    printf("     |              | (min,max,avg) | (min,max,avg) | (min,max,avg) |\n");
-    printf("---------------------------------------------------------------------\n");
-  }
-
-  for(int lev=0; lev<parAlmond->numLevels; lev++){
-
-    dlong Nrows = parAlmond->levels[lev]->Nrows;
-    hlong hNrows = (hlong) parAlmond->levels[lev]->Nrows;
-
-    int active = (Nrows>0) ? 1:0;
-    int totalActive=0;
-    MPI_Allreduce(&active, &totalActive, 1, MPI_INT, MPI_SUM, agmg::comm);
-
-    dlong minNrows=0, maxNrows=0;
-    hlong totalNrows=0;
-    dfloat avgNrows;
-    MPI_Allreduce(&Nrows, &maxNrows, 1, MPI_DLONG, MPI_MAX, agmg::comm);
-    MPI_Allreduce(&hNrows, &totalNrows, 1, MPI_HLONG, MPI_SUM, agmg::comm);
-    avgNrows = (dfloat) totalNrows/totalActive;
-
-    if (Nrows==0) Nrows=maxNrows; //set this so it's ignored for the global min
-    MPI_Allreduce(&Nrows, &minNrows, 1, MPI_DLONG, MPI_MIN, agmg::comm);
-
-
-    long long int nnz;
-    if (parAlmond->levels[lev]->A)
-      nnz = parAlmond->levels[lev]->A->diagNNZ+parAlmond->levels[lev]->A->offdNNZ;
-    else
-      nnz =0;
-    long long int minNnz=0, maxNnz=0, totalNnz=0;
-    dfloat avgNnz;
-    MPI_Allreduce(&nnz, &maxNnz, 1, MPI_LONG_LONG_INT, MPI_MAX, agmg::comm);
-    MPI_Allreduce(&nnz, &totalNnz, 1, MPI_LONG_LONG_INT, MPI_SUM, agmg::comm);
-    avgNnz = (dfloat) totalNnz/totalActive;
-
-    if (nnz==0) nnz = maxNnz; //set this so it's ignored for the global min
-    MPI_Allreduce(&nnz, &minNnz, 1, MPI_LONG_LONG_INT, MPI_MIN, agmg::comm);
-
-    Nrows = parAlmond->levels[lev]->Nrows;
-    dfloat nnzPerRow = (Nrows==0) ? 0 : (dfloat) nnz/Nrows;
-    dfloat minNnzPerRow=0, maxNnzPerRow=0, avgNnzPerRow=0;
-    MPI_Allreduce(&nnzPerRow, &maxNnzPerRow, 1, MPI_DFLOAT, MPI_MAX, agmg::comm);
-    MPI_Allreduce(&nnzPerRow, &avgNnzPerRow, 1, MPI_DFLOAT, MPI_SUM, agmg::comm);
-    avgNnzPerRow /= totalActive;
-
-    if (Nrows==0) nnzPerRow = maxNnzPerRow;
-    MPI_Allreduce(&nnzPerRow, &minNnzPerRow, 1, MPI_DFLOAT, MPI_MIN, agmg::comm);
-
-    if (rank==0){
-      printf(" %3d |        %4d  |   %10.2f  |   %10.2f  |   %10.2f  |\n",
-        lev, totalActive, (dfloat)minNrows, (dfloat)minNnz, minNnzPerRow);
-      printf("     |              |   %10.2f  |   %10.2f  |   %10.2f  |\n",
-        (dfloat)maxNrows, (dfloat)maxNnz, maxNnzPerRow);
-      printf("     |              |   %10.2f  |   %10.2f  |   %10.2f  |\n",
-        avgNrows, avgNnz, avgNnzPerRow);
-    }
-  }
-  if(rank==0)
-    printf("---------------------------------------------------------------------\n");
-}
-
-
-//create coarsened problem
-void coarsenAgmgLevel(agmgLevel *level, csr **coarseA, csr **P, csr **R, dfloat **nullCoarseA, setupAide options){
-
-  // establish the graph of strong connections
-  level->threshold = 0.5;
-
-  csr *C = strong_graph(level->A, level->threshold);
-
-  hlong *FineToCoarse = form_aggregates(level, C);
-
-  find_aggregate_owners(level,FineToCoarse,options);
-
-  *P = construct_interpolator(level, FineToCoarse, nullCoarseA);
-  *R = transpose(level, *P, level->globalRowStarts, level->globalAggStarts);
-  *coarseA = galerkinProd(level, *R, level->A, *P);
-}
-
-csr * strong_graph(csr *A, dfloat threshold){
-
-  const dlong N = A->Nrows;
-  const dlong M = A->Ncols;
-
-  csr *C = (csr *) calloc(1, sizeof(csr));
-
-  C->Nrows = N;
-  C->Ncols = M;
-
-  C->diagRowStarts = (dlong *) calloc(N+1,sizeof(dlong));
-  C->offdRowStarts = (dlong *) calloc(N+1,sizeof(dlong));
-
-  dfloat *maxOD;
-  if (N) maxOD = (dfloat *) calloc(N,sizeof(dfloat));
-
-  //store the diagonal of A for all needed columns
-  dfloat *diagA = (dfloat *) calloc(M,sizeof(dfloat));
-  for (dlong i=0;i<N;i++)
-    diagA[i] = A->diagCoefs[A->diagRowStarts[i]];
-  csrHaloExchange(A, sizeof(dfloat), diagA, A->sendBuffer, diagA+A->NlocalCols);
-
-  #pragma omp parallel for
-  for(dlong i=0; i<N; i++){
-    dfloat sign = (diagA[i] >= 0) ? 1:-1;
-    dfloat Aii = fabs(diagA[i]);
-
-    //find maxOD
-    //local entries
-    dlong Jstart = A->diagRowStarts[i], Jend = A->diagRowStarts[i+1];
-    for(dlong jj= Jstart+1; jj<Jend; jj++){
-      dlong col = A->diagCols[jj];
-      dfloat Ajj = fabs(diagA[col]);
-      dfloat OD = -sign*A->diagCoefs[jj]/(sqrt(Aii)*sqrt(Ajj));
-      if(OD > maxOD[i]) maxOD[i] = OD;
-    }
-    //non-local entries
-    Jstart = A->offdRowStarts[i], Jend = A->offdRowStarts[i+1];
-    for(dlong jj= Jstart; jj<Jend; jj++){
-      dlong col = A->offdCols[jj];
-      dfloat Ajj = fabs(diagA[col]);
-      dfloat OD = -sign*A->offdCoefs[jj]/(sqrt(Aii)*sqrt(Ajj));
-      if(OD > maxOD[i]) maxOD[i] = OD;
-    }
-
-    int diag_strong_per_row = 1; // diagonal entry
-    //local entries
-    Jstart = A->diagRowStarts[i], Jend = A->diagRowStarts[i+1];
-    for(dlong jj = Jstart+1; jj<Jend; jj++){
-      dlong col = A->diagCols[jj];
-      dfloat Ajj = fabs(diagA[col]);
-      dfloat OD = -sign*A->diagCoefs[jj]/(sqrt(Aii)*sqrt(Ajj));
-      if(OD > threshold*maxOD[i]) diag_strong_per_row++;
-    }
-    int offd_strong_per_row = 0;
-    //non-local entries
-    Jstart = A->offdRowStarts[i], Jend = A->offdRowStarts[i+1];
-    for(dlong jj= Jstart; jj<Jend; jj++){
-      dlong col = A->offdCols[jj];
-      dfloat Ajj = fabs(diagA[col]);
-      dfloat OD = -sign*A->offdCoefs[jj]/(sqrt(Aii)*sqrt(Ajj));
-      if(OD > threshold*maxOD[i]) offd_strong_per_row++;
-    }
-
-    C->diagRowStarts[i+1] = diag_strong_per_row;
-    C->offdRowStarts[i+1] = offd_strong_per_row;
-  }
-
-  // cumulative sum
-  for(dlong i=1; i<N+1 ; i++) {
-    C->diagRowStarts[i] += C->diagRowStarts[i-1];
-    C->offdRowStarts[i] += C->offdRowStarts[i-1];
-  }
-
-  C->diagNNZ = C->diagRowStarts[N];
-  C->offdNNZ = C->offdRowStarts[N];
-
-  if (C->diagNNZ) C->diagCols = (dlong *) calloc(C->diagNNZ, sizeof(dlong));
-  if (C->offdNNZ) C->offdCols = (dlong *) calloc(C->offdNNZ, sizeof(dlong));
-
-  // fill in the columns for strong connections
-  #pragma omp parallel for
-  for(dlong i=0; i<N; i++){
-    dfloat sign = (diagA[i] >= 0) ? 1:-1;
-    dfloat Aii = fabs(diagA[i]);
-
-    dlong diagCounter = C->diagRowStarts[i];
-    dlong offdCounter = C->offdRowStarts[i];
-
-    //local entries
-    C->diagCols[diagCounter++] = i;// diag entry
-    dlong Jstart = A->diagRowStarts[i], Jend = A->diagRowStarts[i+1];
-    for(dlong jj = Jstart+1; jj<Jend; jj++){
-      dlong col = A->diagCols[jj];
-      dfloat Ajj = fabs(diagA[col]);
-      dfloat OD = -sign*A->diagCoefs[jj]/(sqrt(Aii)*sqrt(Ajj));
-      if(OD > threshold*maxOD[i])
-        C->diagCols[diagCounter++] = A->diagCols[jj];
-    }
-    Jstart = A->offdRowStarts[i], Jend = A->offdRowStarts[i+1];
-    for(dlong jj = Jstart; jj<Jend; jj++){
-      dlong col = A->offdCols[jj];
-      dfloat Ajj = fabs(diagA[col]);
-      dfloat OD = -sign*A->offdCoefs[jj]/(sqrt(Aii)*sqrt(Ajj));
-      if(OD > threshold*maxOD[i])
-        C->offdCols[offdCounter++] = A->offdCols[jj];
-    }
-  }
-  if(N) free(maxOD);
-
-  return C;
-}
-
-bool customLess(int smax, dfloat rmax, hlong imax, int s, dfloat r, hlong i){
-
-  if(s > smax) return true;
-  if(smax > s) return false;
-
-  if(r > rmax) return true;
-  if(rmax > r) return false;
-
-  if(i > imax) return true;
-  if(i < imax) return false;
-
-  return false;
-}
-
-hlong * form_aggregates(agmgLevel *level, csr *C){
-
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  const dlong N   = C->Nrows;
-  const dlong M   = C->Ncols;
-  const dlong diagNNZ = C->diagNNZ;
-  const dlong offdNNZ = C->offdNNZ;
-
-  hlong *FineToCoarse = (hlong *) calloc(M, sizeof(hlong));
-  for (dlong i =0;i<M;i++) FineToCoarse[i] = -1;
-
-  dfloat *rands  = (dfloat *) calloc(M, sizeof(dfloat));
-  int   *states = (int *)   calloc(M, sizeof(int));
-
-  dfloat *Tr = (dfloat *) calloc(M, sizeof(dfloat));
-  int    *Ts = (int *)    calloc(M, sizeof(int));
-  hlong  *Ti = (hlong *)  calloc(M, sizeof(hlong));
-  hlong  *Tc = (hlong *)  calloc(M, sizeof(hlong));
-
-  csr *A = level->A;
-  hlong *globalRowStarts = level->globalRowStarts;
-
-  int    *intSendBuffer;
-  hlong  *hlongSendBuffer;
-  dfloat *dfloatSendBuffer;
-  if (level->A->NsendTotal) {
-    intSendBuffer = (int *) calloc(A->NsendTotal,sizeof(int));
-    hlongSendBuffer = (hlong *) calloc(A->NsendTotal,sizeof(hlong));
-    dfloatSendBuffer = (dfloat *) calloc(A->NsendTotal,sizeof(dfloat));
-  }
-
-  for(dlong i=0; i<N; i++)
-    rands[i] = (dfloat) drand48();
-
-  for(dlong i=0; i<N; i++)
-    states[i] = 0;
-
-  // add the number of non-zeros in each column
-  //local non-zeros
-  for(dlong i=0; i<diagNNZ; i++)
-    rands[C->diagCols[i]] += 1.;
-
-  int *nnzCnt, *recvNnzCnt;
-  if (A->NHalo) nnzCnt = (int *) calloc(A->NHalo,sizeof(int));
-  if (A->NsendTotal) recvNnzCnt = (int *) calloc(A->NsendTotal,sizeof(int));
-
-  //count the non-local non-zeros
-  for (dlong i=0;i<offdNNZ;i++)
-    nnzCnt[C->offdCols[i]-A->NlocalCols]++;
-
-  //do a reverse halo exchange
-  int tag = 999;
-
-  // initiate immediate send  and receives to each other process as needed
-  dlong recvOffset = 0;
-  dlong sendOffset = 0;
-  int sendMessage = 0, recvMessage = 0;
-  for(int r=0;r<size;++r){
-    if (A->NsendTotal) {
-      if(A->NsendPairs[r]) {
-        MPI_Irecv(recvNnzCnt+sendOffset, A->NsendPairs[r], MPI_INT, r, tag,
-            agmg::comm, (MPI_Request*)A->haloSendRequests+sendMessage);
-        sendOffset += A->NsendPairs[r];
-        ++sendMessage;
-      }
-    }
-    if (A->NrecvTotal) {
-      if(A->NrecvPairs[r]){
-        MPI_Isend(nnzCnt+recvOffset, A->NrecvPairs[r], MPI_INT, r, tag,
-            agmg::comm, (MPI_Request*)A->haloRecvRequests+recvMessage);
-        recvOffset += A->NrecvPairs[r];
-        ++recvMessage;
-      }
-    }
-  }
-
-  // Wait for all sent messages to have left and received messages to have arrived
-  if (A->NrecvTotal) {
-    MPI_Status *sendStatus = (MPI_Status*) calloc(A->NsendMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NsendMessages, (MPI_Request*)A->haloSendRequests, sendStatus);
-    free(sendStatus);
-  }
-  if (A->NsendTotal) {
-    MPI_Status *recvStatus = (MPI_Status*) calloc(A->NrecvMessages, sizeof(MPI_Status));
-    MPI_Waitall(A->NrecvMessages, (MPI_Request*)A->haloRecvRequests, recvStatus);
-    free(recvStatus);
-  }
-
-  for(int i=0;i<A->NsendTotal;++i){
-    // local index of outgoing element in halo exchange
-    dlong id = A->haloElementList[i];
-
-    rands[id] += recvNnzCnt[i];
-  }
-
-  if (A->NHalo) free(nnzCnt);
-  if (A->NsendTotal) free(recvNnzCnt);
-
-  //share randomizer values
-  csrHaloExchange(A, sizeof(dfloat), rands, dfloatSendBuffer, rands+A->NlocalCols);
-
-
-
-  hlong done = 0;
-  while(!done){
-    // first neighbours
-    #pragma omp parallel for
-    for(dlong i=0; i<N; i++){
-
-      int smax = states[i];
-      dfloat rmax = rands[i];
-      hlong imax = i + globalRowStarts[rank];
-
-      if(smax != 1){
-        //local entries
-        for(dlong jj=C->diagRowStarts[i]+1;jj<C->diagRowStarts[i+1];jj++){
-          const dlong col = C->diagCols[jj];
-          if(customLess(smax, rmax, imax, states[col], rands[col], col + globalRowStarts[rank])){
-            smax = states[col];
-            rmax = rands[col];
-            imax = col + globalRowStarts[rank];
-          }
-        }
-        //nonlocal entries
-        for(dlong jj=C->offdRowStarts[i];jj<C->offdRowStarts[i+1];jj++){
-          const dlong col = C->offdCols[jj];
-          if(customLess(smax, rmax, imax, states[col], rands[col], A->colMap[col])) {
-            smax = states[col];
-            rmax = rands[col];
-            imax = A->colMap[col];
-          }
-        }
-      }
-      Ts[i] = smax;
-      Tr[i] = rmax;
-      Ti[i] = imax;
-    }
-
-    //share results
-    csrHaloExchange(A, sizeof(dfloat), Tr, dfloatSendBuffer, Tr+A->NlocalCols);
-    csrHaloExchange(A, sizeof(int), Ts, intSendBuffer, Ts+A->NlocalCols);
-    csrHaloExchange(A, sizeof(hlong), Ti, hlongSendBuffer, Ti+A->NlocalCols);
-
-    // second neighbours
-    #pragma omp parallel for
-    for(dlong i=0; i<N; i++){
-      int    smax = Ts[i];
-      dfloat rmax = Tr[i];
-      hlong  imax = Ti[i];
-
-      //local entries
-      for(dlong jj=C->diagRowStarts[i]+1;jj<C->diagRowStarts[i+1];jj++){
-        const dlong col = C->diagCols[jj];
-        if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
-          smax = Ts[col];
-          rmax = Tr[col];
-          imax = Ti[col];
-        }
-      }
-      //nonlocal entries
-      for(dlong jj=C->offdRowStarts[i];jj<C->offdRowStarts[i+1];jj++){
-        const dlong col = C->offdCols[jj];
-        if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
-          smax = Ts[col];
-          rmax = Tr[col];
-          imax = Ti[col];
-        }
-      }
-
-      // if I am the strongest among all the 1 and 2 ring neighbours
-      // I am an MIS node
-      if((states[i] == 0) && (imax == (i + globalRowStarts[rank])))
-        states[i] = 1;
-
-      // if there is an MIS node within distance 2, I am removed
-      if((states[i] == 0) && (smax == 1))
-        states[i] = -1;
-    }
-
-    csrHaloExchange(A, sizeof(int), states, intSendBuffer, states+A->NlocalCols);
-
-    // if number of undecided nodes = 0, algorithm terminates
-    hlong cnt = std::count(states, states+N, 0);
-    MPI_Allreduce(&cnt,&done,1,MPI_HLONG, MPI_SUM,agmg::comm);
-    done = (done == 0) ? 1 : 0;
-  }
-
-  dlong numAggs = 0;
-  dlong *gNumAggs = (dlong *) calloc(size,sizeof(dlong));
-  level->globalAggStarts = (hlong *) calloc(size+1,sizeof(hlong));
-  // count the coarse nodes/aggregates
-  for(dlong i=0; i<N; i++)
-    if(states[i] == 1) numAggs++;
-
-  MPI_Allgather(&numAggs,1,MPI_DLONG,gNumAggs,1,MPI_DLONG,agmg::comm);
-
-  level->globalAggStarts[0] = 0;
-  for (int r=0;r<size;r++)
-    level->globalAggStarts[r+1] = level->globalAggStarts[r] + gNumAggs[r];
-
-  numAggs = 0;
-  // enumerate the coarse nodes/aggregates
-  for(dlong i=0; i<N; i++)
-    if(states[i] == 1)
-      FineToCoarse[i] = level->globalAggStarts[rank] + numAggs++;
-
-  //share the initial aggregate flags
-  csrHaloExchange(A, sizeof(hlong), FineToCoarse, hlongSendBuffer, FineToCoarse+A->NlocalCols);
-
-  // form the aggregates
-  #pragma omp parallel for
-  for(dlong i=0; i<N; i++){
-    int   smax = states[i];
-    dfloat rmax = rands[i];
-    hlong  imax = i + globalRowStarts[rank];
-    hlong  cmax = FineToCoarse[i];
-
-    if(smax != 1){
-      //local entries
-      for(dlong jj=C->diagRowStarts[i]+1;jj<C->diagRowStarts[i+1];jj++){
-        const dlong col = C->diagCols[jj];
-        if(customLess(smax, rmax, imax, states[col], rands[col], col + globalRowStarts[rank])){
-          smax = states[col];
-          rmax = rands[col];
-          imax = col + globalRowStarts[rank];
-          cmax = FineToCoarse[col];
-        }
-      }
-      //nonlocal entries
-      for(dlong jj=C->offdRowStarts[i];jj<C->offdRowStarts[i+1];jj++){
-        const dlong col = C->offdCols[jj];
-        if(customLess(smax, rmax, imax, states[col], rands[col], A->colMap[col])){
-          smax = states[col];
-          rmax = rands[col];
-          imax = A->colMap[col];
-          cmax = FineToCoarse[col];
-        }
-      }
-    }
-    Ts[i] = smax;
-    Tr[i] = rmax;
-    Ti[i] = imax;
-    Tc[i] = cmax;
-
-    if((states[i] == -1) && (smax == 1) && (cmax > -1))
-      FineToCoarse[i] = cmax;
-  }
-
-  csrHaloExchange(A, sizeof(hlong), FineToCoarse, hlongSendBuffer, FineToCoarse+A->NlocalCols);
-  csrHaloExchange(A, sizeof(dfloat), Tr, dfloatSendBuffer, Tr+A->NlocalCols);
-  csrHaloExchange(A, sizeof(int), Ts, intSendBuffer, Ts+A->NlocalCols);
-  csrHaloExchange(A, sizeof(hlong), Ti, hlongSendBuffer, Ti+A->NlocalCols);
-  csrHaloExchange(A, sizeof(hlong), Tc, hlongSendBuffer, Tc+A->NlocalCols);
-
-  // second neighbours
-  #pragma omp parallel for
-  for(dlong i=0; i<N; i++){
-    int    smax = Ts[i];
-    dfloat rmax = Tr[i];
-    hlong  imax = Ti[i];
-    hlong  cmax = Tc[i];
-
-    //local entries
-    for(dlong jj=C->diagRowStarts[i]+1;jj<C->diagRowStarts[i+1];jj++){
-      const dlong col = C->diagCols[jj];
-      if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
-        smax = Ts[col];
-        rmax = Tr[col];
-        imax = Ti[col];
-        cmax = Tc[col];
-      }
-    }
-    //nonlocal entries
-    for(dlong jj=C->offdRowStarts[i];jj<C->offdRowStarts[i+1];jj++){
-      const dlong col = C->offdCols[jj];
-      if(customLess(smax, rmax, imax, Ts[col], Tr[col], Ti[col])){
-        smax = Ts[col];
-        rmax = Tr[col];
-        imax = Ti[col];
-        cmax = Tc[col];
-      }
-    }
-
-    if((states[i] == -1) && (smax == 1) && (cmax > -1))
-      FineToCoarse[i] = cmax;
-  }
-
-  csrHaloExchange(A, sizeof(hlong), FineToCoarse, hlongSendBuffer, FineToCoarse+A->NlocalCols);
-
-  free(rands);
-  free(states);
-  free(Tr);
-  free(Ts);
-  free(Ti);
-  free(Tc);
-  if (level->A->NsendTotal) {
-    free(intSendBuffer);
-    free(hlongSendBuffer);
-    free(dfloatSendBuffer);
-  }
-
-  //TODO maybe free C here?
-
-  return FineToCoarse;
-}
-
-// Record describing one fine node's aggregate membership. Arrays of these are
-// exchanged between ranks by find_aggregate_owners (via an MPI struct datatype
-// built from the five fields below) while aggregate ownership is decided.
-typedef struct {
-
-  dlong fineId;      // local index of the fine node on its origin rank
-  hlong coarseId;    // aggregate id as read from FineToCoarse
-  hlong newCoarseId; // aggregate id in the renumbering assigned by the owner
-
-  int originRank;    // rank that created the record (holds the fine node)
-  int ownerRank;     // rank the record is routed to / that owns the aggregate
-
-} parallelAggregate_t;
-
-// qsort comparator: ascending order of parallelAggregate_t::ownerRank.
-int compareOwner(const void *a, const void *b){
-  const int ownerA = ((const parallelAggregate_t *) a)->ownerRank;
-  const int ownerB = ((const parallelAggregate_t *) b)->ownerRank;
-
-  // evaluates to -1, 0 or +1
-  return (ownerA > ownerB) - (ownerA < ownerB);
-}
-
-// qsort comparator: ascending coarseId, with originRank as the secondary key.
-int compareAgg(const void *a, const void *b){
-  const parallelAggregate_t *lhs = (const parallelAggregate_t *) a;
-  const parallelAggregate_t *rhs = (const parallelAggregate_t *) b;
-
-  // primary key: aggregate id
-  if (lhs->coarseId != rhs->coarseId)
-    return (lhs->coarseId < rhs->coarseId) ? -1 : +1;
-
-  // secondary key: rank the record came from
-  if (lhs->originRank != rhs->originRank)
-    return (lhs->originRank < rhs->originRank) ? -1 : +1;
-
-  return 0;
-}
-
-// qsort comparator: ascending order of parallelAggregate_t::originRank.
-int compareOrigin(const void *a, const void *b){
-  const int originA = ((const parallelAggregate_t *) a)->originRank;
-  const int originB = ((const parallelAggregate_t *) b)->originRank;
-
-  // evaluates to -1, 0 or +1
-  return (originA > originB) - (originA < originB);
-}
-
-// Decide which rank owns each aggregate and renumber FineToCoarse to match.
-//
-//   level        - level->A->Nrows is the number N of local fine nodes;
-//                  level->globalAggStarts (size+1 entries) is read for the
-//                  current total aggregate count and then overwritten with
-//                  the per-rank offsets of the new numbering
-//   FineToCoarse - in:  aggregate id of each of the N local nodes
-//                  out: aggregate id in the new, owner-based numbering
-//   options      - "PARALMOND PARTITION" selects the ownership policy:
-//                    STRONGNODES - nothing is changed (early return)
-//                    DISTRIBUTED - the rank contributing the most nodes to an
-//                                  aggregate owns it, ties broken randomly
-//                    otherwise   - the lowest contributing rank owns it
-//
-// Unless the STRONGNODES early return is taken, this performs collective
-// operations on agmg::comm and must be called by every rank.
-void find_aggregate_owners(agmgLevel *level, hlong* FineToCoarse, setupAide options) {
-  // MPI info
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  dlong N = level->A->Nrows;
-
-  //Need to establish 'ownership' of aggregates
-
-  //Keep the current partitioning for STRONGNODES.
-  // The rank that had the strong node for each aggregate owns the aggregate
-  if (options.compareArgs("PARALMOND PARTITION", "STRONGNODES")) return;
-
-  //populate aggregate array
-  hlong gNumAggs = level->globalAggStarts[size]; //total number of aggregates
-
-  // at least one entry is allocated so the MPI calls never get a null buffer
-  parallelAggregate_t *sendAggs =
-    (parallelAggregate_t *) calloc(N ? N : 1,sizeof(parallelAggregate_t));
-
-  for (dlong i=0;i<N;i++) {
-    sendAggs[i].fineId = i;
-    sendAggs[i].originRank = rank;
-
-    sendAggs[i].coarseId = FineToCoarse[i];
-
-    //set a temporary owner. Evenly distribute aggregates amongst ranks.
-    // The division is carried out in hlong and only the quotient is narrowed;
-    // casting the product to int first truncates it for large aggregate ids.
-    sendAggs[i].ownerRank = (int) ((FineToCoarse[i]*size)/gNumAggs);
-  }
-
-  // Make the MPI_PARALLEL_AGGREGATE data type
-  MPI_Datatype MPI_PARALLEL_AGGREGATE;
-  MPI_Datatype dtype[5] = {MPI_DLONG, MPI_HLONG, MPI_HLONG, MPI_INT, MPI_INT};
-  int blength[5] = {1, 1, 1, 1, 1};
-  MPI_Aint addr[5], displ[5];
-  MPI_Get_address ( &(sendAggs[0]            ), addr+0);
-  MPI_Get_address ( &(sendAggs[0].coarseId   ), addr+1);
-  MPI_Get_address ( &(sendAggs[0].newCoarseId), addr+2);
-  MPI_Get_address ( &(sendAggs[0].originRank ), addr+3);
-  MPI_Get_address ( &(sendAggs[0].ownerRank  ), addr+4);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  displ[4] = addr[4] - addr[0];
-  MPI_Type_create_struct (5, blength, displ, dtype, &MPI_PARALLEL_AGGREGATE);
-  MPI_Type_commit (&MPI_PARALLEL_AGGREGATE);
-
-  //sort by temporary owner so each destination's records are contiguous
-  qsort(sendAggs, N, sizeof(parallelAggregate_t), compareOwner);
-
-  int *sendCounts = (int *) calloc(size,sizeof(int));
-  int *recvCounts = (int *) calloc(size,sizeof(int));
-  int *sendOffsets = (int *) calloc(size+1,sizeof(int));
-  int *recvOffsets = (int *) calloc(size+1,sizeof(int));
-
-  for(dlong i=0;i<N;++i)
-    sendCounts[sendAggs[i].ownerRank]++;
-
-  // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(sendCounts, 1, MPI_INT, recvCounts, 1, MPI_INT, agmg::comm);
-
-  // find send and recv offsets for gather
-  dlong recvNtotal = 0;
-  for(int r=0;r<size;++r){
-    sendOffsets[r+1] = sendOffsets[r] + sendCounts[r];
-    recvOffsets[r+1] = recvOffsets[r] + recvCounts[r];
-    recvNtotal += recvCounts[r];
-  }
-  parallelAggregate_t *recvAggs =
-    (parallelAggregate_t *) calloc(recvNtotal ? recvNtotal : 1,sizeof(parallelAggregate_t));
-
-  MPI_Alltoallv(sendAggs, sendCounts, sendOffsets, MPI_PARALLEL_AGGREGATE,
-                recvAggs, recvCounts, recvOffsets, MPI_PARALLEL_AGGREGATE,
-                agmg::comm);
-
-  //sort by coarse aggregate number, and then by original rank
-  qsort(recvAggs, recvNtotal, sizeof(parallelAggregate_t), compareAgg);
-
-  //count the number of unique aggregates here
-  dlong NumUniqueAggs =0;
-  if (recvNtotal) NumUniqueAggs++;
-  for (dlong i=1;i<recvNtotal;i++)
-    if(recvAggs[i].coarseId!=recvAggs[i-1].coarseId) NumUniqueAggs++;
-
-  //get their locations in the array: aggregate n occupies
-  // recvAggs[aggStarts[n]] .. recvAggs[aggStarts[n+1]-1].
-  // Allocated unconditionally because the terminating entry is written, and
-  // the array freed, even when this rank received nothing.
-  dlong *aggStarts = (dlong *) calloc(NumUniqueAggs+1,sizeof(dlong));
-  dlong cnt = 1;
-  for (dlong i=1;i<recvNtotal;i++)
-    if(recvAggs[i].coarseId!=recvAggs[i-1].coarseId) aggStarts[cnt++] = i;
-  aggStarts[NumUniqueAggs] = recvNtotal;
-
-
-  if (options.compareArgs("PARALMOND PARTITION", "DISTRIBUTED")) { //rank that contributes most to the aggregate owns it
-    //use a random dfloat for each rank to break ties.
-    dfloat tieBreak = (dfloat) drand48();
-    dfloat *gRands = (dfloat *) calloc(size,sizeof(dfloat));
-    MPI_Allgather(&tieBreak, 1, MPI_DFLOAT, gRands, 1, MPI_DFLOAT, agmg::comm);
-
-    //determine the aggregates majority owner.
-    // The tallies are dfloat so the fractional tie-breaker is kept; an int
-    // tally would truncate every drand48() value (which is < 1) to zero.
-    dfloat *rankCounts = (dfloat *) calloc(size,sizeof(dfloat));
-    for (dlong n=0;n<NumUniqueAggs;n++) {
-      //populate randomizer
-      for (int r=0;r<size;r++)
-        rankCounts[r] = gRands[r];
-
-      //count the number of contributions to the aggregate from the separate ranks
-      for (dlong i=aggStarts[n];i<aggStarts[n+1];i++)
-        rankCounts[recvAggs[i].originRank] += 1;
-
-      //find which rank is contributing the most to this aggregate
-      int ownerRank = 0;
-      dfloat maxEntries = rankCounts[0];
-      for (int r=1;r<size;r++) {
-        if (rankCounts[r]>maxEntries) {
-          ownerRank = r;
-          maxEntries = rankCounts[r];
-        }
-      }
-
-      //set this aggregate's owner
-      for (dlong i=aggStarts[n];i<aggStarts[n+1];i++)
-        recvAggs[i].ownerRank = ownerRank;
-    }
-    free(gRands); free(rankCounts);
-  } else { //default SATURATE: always choose the lowest rank to own the aggregate
-    for (dlong n=0;n<NumUniqueAggs;n++) {
-
-      int minrank = size;
-
-      //find the lowest rank contributing to the aggregate
-      for (dlong i=aggStarts[n];i<aggStarts[n+1];i++){
-
-        minrank = (recvAggs[i].originRank<minrank) ? recvAggs[i].originRank : minrank;
-      }
-
-      //set this aggregate's owner
-      for (dlong i=aggStarts[n];i<aggStarts[n+1];i++)
-        recvAggs[i].ownerRank = minrank;
-    }
-  }
-  free(aggStarts);
-
-  //sort by owning rank
-  qsort(recvAggs, recvNtotal, sizeof(parallelAggregate_t), compareOwner);
-
-  int *newSendCounts = (int *) calloc(size,sizeof(int));
-  int *newRecvCounts = (int *) calloc(size,sizeof(int));
-  int *newSendOffsets = (int *) calloc(size+1,sizeof(int));
-  int *newRecvOffsets = (int *) calloc(size+1,sizeof(int));
-
-  for(dlong i=0;i<recvNtotal;++i)
-    newSendCounts[recvAggs[i].ownerRank]++;
-
-  // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(newSendCounts, 1, MPI_INT, newRecvCounts, 1, MPI_INT, agmg::comm);
-
-  // find send and recv offsets for gather
-  dlong newRecvNtotal = 0;
-  for(int r=0;r<size;++r){
-    newSendOffsets[r+1] = newSendOffsets[r] + newSendCounts[r];
-    newRecvOffsets[r+1] = newRecvOffsets[r] + newRecvCounts[r];
-    newRecvNtotal += newRecvCounts[r];
-  }
-  parallelAggregate_t *newRecvAggs =
-    (parallelAggregate_t *) calloc(newRecvNtotal ? newRecvNtotal : 1,sizeof(parallelAggregate_t));
-
-  MPI_Alltoallv(   recvAggs, newSendCounts, newSendOffsets, MPI_PARALLEL_AGGREGATE,
-                newRecvAggs, newRecvCounts, newRecvOffsets, MPI_PARALLEL_AGGREGATE,
-                agmg::comm);
-
-  //sort by coarse aggregate number, and then by original rank
-  qsort(newRecvAggs, newRecvNtotal, sizeof(parallelAggregate_t), compareAgg);
-
-  //count the number of unique aggregates this rank owns
-  dlong numAggs = 0;
-  if (newRecvNtotal) numAggs++;
-  for (dlong i=1;i<newRecvNtotal;i++)
-    if(newRecvAggs[i].coarseId!=newRecvAggs[i-1].coarseId) numAggs++;
-
-  //determine a global numbering of the aggregates
-  // (send and receive datatypes must match: both sides hold dlong values)
-  dlong *lNumAggs = (dlong*) calloc(size,sizeof(dlong));
-  MPI_Allgather(&numAggs, 1, MPI_DLONG, lNumAggs, 1, MPI_DLONG, agmg::comm);
-
-  level->globalAggStarts[0] = 0;
-  for (int r=0;r<size;r++)
-    level->globalAggStarts[r+1] = level->globalAggStarts[r] + lNumAggs[r];
-  free(lNumAggs);
-
-  //set the new global coarse index.
-  // Tracked in hlong: it starts from a global offset, which need not fit the
-  // local index type.
-  hlong newId = level->globalAggStarts[rank];
-  if (newRecvNtotal) newRecvAggs[0].newCoarseId = newId;
-  for (dlong i=1;i<newRecvNtotal;i++) {
-    if(newRecvAggs[i].coarseId!=newRecvAggs[i-1].coarseId) newId++;
-
-    newRecvAggs[i].newCoarseId = newId;
-  }
-
-  //sort by origin rank so the records can be returned to where they came from
-  qsort(newRecvAggs, newRecvNtotal, sizeof(parallelAggregate_t), compareOrigin);
-
-  for(int r=0;r<size;r++) sendCounts[r] = 0;
-  for(int r=0;r<=size;r++) {
-    sendOffsets[r] = 0;
-    recvOffsets[r] = 0;
-  }
-
-  for(dlong i=0;i<newRecvNtotal;++i)
-    sendCounts[newRecvAggs[i].originRank]++;
-
-  // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(sendCounts, 1, MPI_INT, recvCounts, 1, MPI_INT, agmg::comm);
-
-  // find send and recv offsets for gather
-  recvNtotal = 0;
-  for(int r=0;r<size;++r){
-    sendOffsets[r+1] = sendOffsets[r] + sendCounts[r];
-    recvOffsets[r+1] = recvOffsets[r] + recvCounts[r];
-    recvNtotal += recvCounts[r];
-  }
-
-  //send the aggregate data back; sendAggs is reused as the receive buffer
-  MPI_Alltoallv(newRecvAggs, sendCounts, sendOffsets, MPI_PARALLEL_AGGREGATE,
-                   sendAggs, recvCounts, recvOffsets, MPI_PARALLEL_AGGREGATE,
-                agmg::comm);
-
-  //clean up
-  MPI_Barrier(agmg::comm);
-  MPI_Type_free(&MPI_PARALLEL_AGGREGATE);
-
-  free(recvAggs);
-  free(sendCounts);  free(recvCounts);
-  free(sendOffsets); free(recvOffsets);
-  free(newRecvAggs);
-  free(newSendCounts);  free(newRecvCounts);
-  free(newSendOffsets); free(newRecvOffsets);
-
-  //record the new FineToCoarse map
-  for (dlong i=0;i<N;i++)
-    FineToCoarse[sendAggs[i].fineId] = sendAggs[i].newCoarseId;
-
-  free(sendAggs);
-}
-
-
-// Build the prolongation operator P from the aggregation map.
-//
-// Row i of P holds a single nonzero, level->A->null[i], in column
-// FineToCoarse[i]. The entry goes in the diag part when this rank owns that
-// aggregate (according to level->globalAggStarts) and in the offd part
-// otherwise. Every column is then divided by its 2-norm.
-//
-//   level        - supplies A->Nrows, A->null and globalAggStarts
-//   FineToCoarse - global aggregate id of each of the N local rows
-//   nullCoarseA  - out: newly calloc'ed array of P->Ncols column norms
-//                  (locally owned columns first, halo columns after)
-//
-// Returns the newly allocated P. Performs collective operations on agmg::comm.
-csr *construct_interpolator(agmgLevel *level, hlong *FineToCoarse, dfloat **nullCoarseA){
-  // MPI info
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  const dlong N = level->A->Nrows;
-  // const dlong M = level->A->Ncols;
-
-  hlong *globalAggStarts = level->globalAggStarts;
-
-  const hlong globalAggOffset = level->globalAggStarts[rank];
-  const dlong NCoarse = (dlong) (globalAggStarts[rank+1]-globalAggStarts[rank]); //local num agg
-
-  csr* P = (csr *) calloc(1, sizeof(csr));
-
-  P->Nrows = N;
-  P->Ncols = NCoarse;
-
-  P->NlocalCols = NCoarse;
-  P->NHalo = 0;
-
-  P->diagRowStarts = (dlong *) calloc(N+1, sizeof(dlong));
-  P->offdRowStarts = (dlong *) calloc(N+1, sizeof(dlong));
-
-  // each row has exactly one nonzero per row
-  P->diagNNZ =0;
-  P->offdNNZ =0;
-  for(dlong i=0; i<N; i++) {
-    hlong col = FineToCoarse[i];
-    if ((col>globalAggOffset-1)&&(col<globalAggOffset+NCoarse)) {
-      P->diagNNZ++;
-      P->diagRowStarts[i+1]++;
-    } else {
-      P->offdNNZ++;
-      P->offdRowStarts[i+1]++;
-    }
-  }
-  for(dlong i=0; i<N; i++) {
-    P->diagRowStarts[i+1] += P->diagRowStarts[i];
-    P->offdRowStarts[i+1] += P->offdRowStarts[i];
-  }
-
-  if (P->diagNNZ) {
-    P->diagCols  = (dlong *)  calloc(P->diagNNZ, sizeof(dlong));
-    P->diagCoefs = (dfloat *) calloc(P->diagNNZ, sizeof(dfloat));
-  }
-  hlong *offdCols = NULL; // global ids of the offd entries, in row order
-  if (P->offdNNZ) {
-    offdCols  = (hlong *)  calloc(P->offdNNZ, sizeof(hlong));
-    P->offdCols  = (dlong *)  calloc(P->offdNNZ, sizeof(dlong));
-    P->offdCoefs = (dfloat *) calloc(P->offdNNZ, sizeof(dfloat));
-  }
-
-  dlong diagCnt = 0;
-  dlong offdCnt = 0;
-  for(dlong i=0; i<N; i++) {
-    hlong col = FineToCoarse[i];
-    if ((col>globalAggStarts[rank]-1)&&(col<globalAggStarts[rank+1])) {
-      P->diagCols[diagCnt] = (dlong) (col - globalAggOffset); //local index
-      P->diagCoefs[diagCnt++] = level->A->null[i];
-    } else {
-      offdCols[offdCnt] = col;
-      P->offdCoefs[offdCnt++] = level->A->null[i];
-    }
-  }
-
-  //record global indexing of columns
-  P->colMap = (hlong *)   calloc(P->Ncols, sizeof(hlong));
-  for (dlong i=0;i<P->Ncols;i++)
-    P->colMap[i] = i + globalAggOffset;
-
-  if (P->offdNNZ) {
-    //we now need to reorder the x vector for the halo, and shift the column indices
-    hlong *col = (hlong *) calloc(P->offdNNZ,sizeof(hlong));
-    for (dlong i=0;i<P->offdNNZ;i++)
-      col[i] = offdCols[i]; //copy non-local column global ids
-
-    //sort by global index
-    std::sort(col,col+P->offdNNZ);
-
-    //compact the sorted ids in place, keeping one copy of each
-    P->NHalo = 0;
-    for (dlong i=1;i<P->offdNNZ;i++)
-      if (col[i]!=col[i-1])  col[++P->NHalo] = col[i];
-    P->NHalo++; //number of unique columns
-
-    P->Ncols += P->NHalo;
-
-    //save global column ids in colMap
-    P->colMap = (hlong *) realloc(P->colMap, P->Ncols*sizeof(hlong));
-    for (dlong i=0; i<P->NHalo; i++)
-      P->colMap[i+P->NlocalCols] = col[i];
-    free(col);
-
-    //shift the column indices to local indexing.
-    // The halo section of colMap is sorted and unique and contains every
-    // entry of offdCols, so a binary search lands on the exact match.
-    const hlong *haloIds = P->colMap + P->NlocalCols;
-    for (dlong i=0;i<P->offdNNZ;i++) {
-      const hlong *hit = std::lower_bound(haloIds, haloIds+P->NHalo, offdCols[i]);
-      P->offdCols[i] = P->NlocalCols + (dlong) (hit - haloIds);
-    }
-    free(offdCols);
-  }
-
-  csrHaloSetup(P,globalAggStarts);
-
-  // normalize the columns of P
-  *nullCoarseA = (dfloat *) calloc(P->Ncols,sizeof(dfloat));
-
-  //add local nonzeros
-  for(dlong i=0; i<P->diagNNZ; i++)
-    (*nullCoarseA)[P->diagCols[i]] += P->diagCoefs[i] * P->diagCoefs[i];
-
-  dfloat *nnzSum = NULL, *recvNnzSum = NULL;
-  if (P->NHalo) nnzSum = (dfloat *) calloc(P->NHalo,sizeof(dfloat));
-  if (P->NsendTotal) recvNnzSum = (dfloat *) calloc(P->NsendTotal,sizeof(dfloat));
-
-  //add the non-local non-zeros
-  for (dlong i=0;i<P->offdNNZ;i++)
-    nnzSum[P->offdCols[i]-P->NlocalCols] += P->offdCoefs[i] * P->offdCoefs[i];
-
-  //do a reverse halo exchange: halo contributions travel back to the ranks
-  // that own the columns, so the usual send side receives and vice versa
-  int tag = 999;
-
-  // initiate immediate send  and receives to each other process as needed
-  dlong recvOffset = 0;
-  dlong sendOffset = 0;
-  int sendMessage = 0, recvMessage = 0;
-  for(int r=0;r<size;++r){
-    if (P->NsendTotal) {
-      if(P->NsendPairs[r]) {
-        MPI_Irecv(recvNnzSum+sendOffset, P->NsendPairs[r], MPI_DFLOAT, r, tag,
-            agmg::comm, (MPI_Request*)P->haloSendRequests+sendMessage);
-        sendOffset += P->NsendPairs[r];
-        ++sendMessage;
-      }
-    }
-    if (P->NrecvTotal) {
-      if(P->NrecvPairs[r]){
-        MPI_Isend(nnzSum+recvOffset, P->NrecvPairs[r], MPI_DFLOAT, r, tag,
-            agmg::comm, (MPI_Request*)P->haloRecvRequests+recvMessage);
-        recvOffset += P->NrecvPairs[r];
-        ++recvMessage;
-      }
-    }
-  }
-
-  // Wait on exactly the requests posted above. The receives were stored in
-  // haloSendRequests and the sends in haloRecvRequests, so each wait is keyed
-  // on its own counter; recvNnzSum must not be read before the receives finish.
-  if (sendMessage)
-    MPI_Waitall(sendMessage, (MPI_Request*)P->haloSendRequests, MPI_STATUSES_IGNORE);
-  if (recvMessage)
-    MPI_Waitall(recvMessage, (MPI_Request*)P->haloRecvRequests, MPI_STATUSES_IGNORE);
-
-  for(dlong i=0;i<P->NsendTotal;++i){
-    // local index of outgoing element in halo exchange
-    dlong id = P->haloElementList[i];
-
-    (*nullCoarseA)[id] += recvNnzSum[i];
-  }
-
-  free(nnzSum);
-  free(recvNnzSum);
-
-  // sums of squares -> 2-norms for the locally owned columns
-  for(dlong i=0; i<NCoarse; i++)
-    (*nullCoarseA)[i] = sqrt((*nullCoarseA)[i]);
-
-  // fetch the norms of the halo columns from their owners
-  csrHaloExchange(P, sizeof(dfloat), *nullCoarseA, P->sendBuffer, *nullCoarseA+P->NlocalCols);
-
-  for(dlong i=0; i<P->diagNNZ; i++)
-    P->diagCoefs[i] /= (*nullCoarseA)[P->diagCols[i]];
-  for(dlong i=0; i<P->offdNNZ; i++)
-    P->offdCoefs[i] /= (*nullCoarseA)[P->offdCols[i]];
-
-  MPI_Barrier(agmg::comm);
-
-  return P;
-}
-
-// One matrix entry in global indexing, used by transpose() to ship off-rank
-// nonzeros to the rank that owns them in the transposed matrix.
-typedef struct {
-
-  hlong row;   // global row of the entry in the transposed matrix
-  hlong col;   // global column of the entry in the transposed matrix
-  dfloat val;  // coefficient
-  int owner;   // destination rank; primary sort key in compareNonZero
-
-} nonzero_t;
-
-// qsort comparator for nonzero_t: ascending owner, then row, then col.
-int compareNonZero(const void *a, const void *b){
-  const nonzero_t *lhs = (const nonzero_t *) a;
-  const nonzero_t *rhs = (const nonzero_t *) b;
-
-  // destination rank first
-  if (lhs->owner != rhs->owner)
-    return (lhs->owner < rhs->owner) ? -1 : +1;
-
-  // then row-major order within a rank
-  if (lhs->row != rhs->row)
-    return (lhs->row < rhs->row) ? -1 : +1;
-
-  if (lhs->col != rhs->col)
-    return (lhs->col < rhs->col) ? -1 : +1;
-
-  return 0;
-}
-
-// Build the distributed transpose At of A.
-//
-// The diag part is transposed locally. Each offd entry is converted to global
-// indices, sent to the rank whose globalColStarts range contains its column,
-// and assembled there into At's offd part with a freshly built halo colMap.
-//
-//   level           - not used by this function
-//   A               - matrix to transpose
-//   globalRowStarts - per-rank offsets of A's global row numbering (size+1)
-//   globalColStarts - per-rank offsets of A's global column numbering (size+1)
-//
-// Returns the newly allocated At. Performs collective operations on agmg::comm.
-// NOTE(review): A->haloSendRequests / A->haloRecvRequests are borrowed as
-// scratch request arrays; this assumes they have room for one request per
-// communicating rank - confirm against csrHaloSetup.
-csr * transpose(agmgLevel* level, csr *A,
-                hlong *globalRowStarts, hlong *globalColStarts){
-
-  // MPI info
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  csr *At = (csr *) calloc(1,sizeof(csr));
-
-  At->Nrows = A->Ncols-A->NHalo;
-  At->Ncols = A->Nrows;
-  At->diagNNZ   = A->diagNNZ; //local entries remain local
-
-  At->NlocalCols = At->Ncols;
-
-  At->diagRowStarts = (dlong *)   calloc(At->Nrows+1, sizeof(dlong));
-  At->offdRowStarts = (dlong *)   calloc(At->Nrows+1, sizeof(dlong));
-
-  //start with local entries
-  if (A->diagNNZ) {
-    At->diagCols      = (dlong *)  calloc(At->diagNNZ, sizeof(dlong));
-    At->diagCoefs     = (dfloat *) calloc(At->diagNNZ, sizeof(dfloat));
-  }
-
-  // count the num of nonzeros per row for transpose
-  for(dlong i=0; i<A->diagNNZ; i++){
-    dlong row = A->diagCols[i];
-    At->diagRowStarts[row+1]++;
-  }
-
-  // cumulative sum for rows
-  for(dlong i=1; i<=At->Nrows; i++)
-    At->diagRowStarts[i] += At->diagRowStarts[i-1];
-
-  // counter[row] is the next free slot of row `row` in At's diag arrays;
-  // it holds dlong offsets, so it is stored as dlong
-  dlong *counter = (dlong *) calloc(At->Nrows+1,sizeof(dlong));
-  for (dlong i=0; i<At->Nrows+1; i++)
-    counter[i] = At->diagRowStarts[i];
-
-  for(dlong i=0; i<A->Nrows; i++){
-    const dlong Jstart = A->diagRowStarts[i], Jend = A->diagRowStarts[i+1];
-
-    for(dlong jj=Jstart; jj<Jend; jj++){
-      dlong row = A->diagCols[jj];
-      At->diagCols[counter[row]]  = i;
-      At->diagCoefs[counter[row]] = A->diagCoefs[jj];
-
-      counter[row]++;
-    }
-  }
-  free(counter);
-
-  //record global indexing of columns
-  At->colMap = (hlong *)   calloc(At->Ncols, sizeof(hlong));
-  for (dlong i=0;i<At->Ncols;i++)
-    At->colMap[i] = i + globalRowStarts[rank];
-
-  //now the nonlocal entries. Need to reverse the halo exchange to send the nonzeros
-  int tag = 999;
-
-  nonzero_t *sendNonZeros = NULL;
-  if (A->offdNNZ)
-    sendNonZeros = (nonzero_t *) calloc(A->offdNNZ,sizeof(nonzero_t));
-
-  // per-rank message sizes (in entries); calloc leaves them zeroed
-  int *Nsend = (int*) calloc(size, sizeof(int));
-  int *Nrecv = (int*) calloc(size, sizeof(int));
-
-  // copy data from nonlocal entries into send buffer
-  for(dlong i=0;i<A->Nrows;++i){
-    for (dlong j=A->offdRowStarts[i];j<A->offdRowStarts[i+1];j++) {
-      hlong col =  A->colMap[A->offdCols[j]]; //global ids
-      for (int r=0;r<size;r++) { //find owner's rank
-        if ((globalColStarts[r]-1<col) && (col < globalColStarts[r+1])) {
-          Nsend[r]++;
-          sendNonZeros[j].owner = r;
-          break; // the rank ranges are disjoint, so there is nothing more to find
-        }
-      }
-      sendNonZeros[j].row = col;
-      sendNonZeros[j].col = i + globalRowStarts[rank];     //global ids
-      sendNonZeros[j].val = A->offdCoefs[j];
-    }
-  }
-
-  //sort outgoing nonzeros by owner, then row and col
-  if (A->offdNNZ)
-    qsort(sendNonZeros, A->offdNNZ, sizeof(nonzero_t), compareNonZero);
-
-  MPI_Alltoall(Nsend, 1, MPI_INT, Nrecv, 1, MPI_INT, agmg::comm);
-
-  //count incoming nonzeros
-  At->offdNNZ = 0;
-  for (int r=0;r<size;r++)
-    At->offdNNZ += Nrecv[r];
-
-  nonzero_t *recvNonZeros = NULL;
-  if (At->offdNNZ)
-    recvNonZeros = (nonzero_t *) calloc(At->offdNNZ,sizeof(nonzero_t));
-
-  // initiate immediate send and receives to each other process as needed
-  // (entries travel as raw bytes, so the offsets below are byte offsets)
-  int recvOffset = 0;
-  int sendOffset = 0;
-  int sendMessage = 0, recvMessage = 0;
-  for(int r=0;r<size;++r){
-    if (At->offdNNZ) {
-      if(Nrecv[r]) {
-        MPI_Irecv(((char*)recvNonZeros)+recvOffset, Nrecv[r]*sizeof(nonzero_t),
-                      MPI_CHAR, r, tag, agmg::comm,
-                      (MPI_Request*)A->haloSendRequests+recvMessage);
-        recvOffset += Nrecv[r]*sizeof(nonzero_t);
-        ++recvMessage;
-      }
-    }
-    if (A->offdNNZ) {
-      if(Nsend[r]){
-        MPI_Isend(((char*)sendNonZeros)+sendOffset, Nsend[r]*sizeof(nonzero_t),
-                      MPI_CHAR, r, tag, agmg::comm,
-                      (MPI_Request*)A->haloRecvRequests+sendMessage);
-        sendOffset += Nsend[r]*sizeof(nonzero_t);
-        ++sendMessage;
-      }
-    }
-  }
-
-  // Wait for all sent messages to have left and received messages to have arrived
-  if (sendMessage)
-    MPI_Waitall(sendMessage, (MPI_Request*)A->haloRecvRequests, MPI_STATUSES_IGNORE);
-  if (recvMessage)
-    MPI_Waitall(recvMessage, (MPI_Request*)A->haloSendRequests, MPI_STATUSES_IGNORE);
-
-  free(sendNonZeros);
-  free(Nsend);
-  free(Nrecv);
-
-  if (At->offdNNZ) {
-    //sort received nonzeros by row and col
-    qsort(recvNonZeros, At->offdNNZ, sizeof(nonzero_t), compareNonZero);
-
-    hlong *offdCols  = (hlong *)   calloc(At->offdNNZ,sizeof(hlong));
-    At->offdCols  = (dlong *)   calloc(At->offdNNZ,sizeof(dlong));
-    At->offdCoefs = (dfloat *) calloc(At->offdNNZ, sizeof(dfloat));
-
-    //find row starts
-    for(dlong n=0;n<At->offdNNZ;++n) {
-      dlong row = (dlong) (recvNonZeros[n].row - globalColStarts[rank]);
-      At->offdRowStarts[row+1]++;
-    }
-    //cumulative sum
-    for (dlong i=0;i<At->Nrows;i++)
-      At->offdRowStarts[i+1] += At->offdRowStarts[i];
-
-    //fill cols and coefs; the sorted receive buffer is already in row order
-    for (dlong n=0; n<At->offdNNZ; n++) {
-      offdCols[n]      = recvNonZeros[n].col;
-      At->offdCoefs[n] = recvNonZeros[n].val;
-    }
-    free(recvNonZeros);
-
-    //we now need to reorder the x vector for the halo, and shift the column indices
-    hlong *col = (hlong *) calloc(At->offdNNZ,sizeof(hlong));
-    for (dlong n=0;n<At->offdNNZ;n++)
-      col[n] = offdCols[n]; //copy non-local column global ids
-
-    //sort by global index
-    std::sort(col,col+At->offdNNZ);
-
-    //compact the sorted ids in place, keeping one copy of each
-    At->NHalo = 0;
-    for (dlong n=1;n<At->offdNNZ;n++)
-      if (col[n]!=col[n-1])  col[++At->NHalo] = col[n];
-    At->NHalo++; //number of unique columns
-
-    At->Ncols += At->NHalo;
-
-    //save global column ids in colMap
-    At->colMap = (hlong *) realloc(At->colMap,At->Ncols*sizeof(hlong));
-    for (dlong n=0; n<At->NHalo; n++)
-      At->colMap[n+At->NlocalCols] = col[n];
-    free(col);
-
-    //shift the column indices to local indexing.
-    // The halo section of colMap is sorted and unique and contains every
-    // entry of offdCols, so a binary search lands on the exact match.
-    const hlong *haloIds = At->colMap + At->NlocalCols;
-    for (dlong n=0;n<At->offdNNZ;n++) {
-      const hlong *hit = std::lower_bound(haloIds, haloIds+At->NHalo, offdCols[n]);
-      At->offdCols[n] = At->NlocalCols + (dlong) (hit - haloIds);
-    }
-    free(offdCols);
-  }
-
-  csrHaloSetup(At,globalRowStarts);
-
-  return At;
-}
-
-// The single nonzero of one row of P, as gathered and halo-exchanged by
-// galerkinProd.
-typedef struct {
-
-  hlong coarseId; // global aggregate (column) id of the entry
-  dfloat coef;    // coefficient of the entry
-
-} pEntry_t;
-
-// One contribution to the coarse operator assembled in galerkinProd:
-// coef = P_iI * A_ij * P_jJ, destined for coarse entry (I,J).
-typedef struct {
-
-  hlong I;     // global coarse row
-  hlong J;     // global coarse column
-  dfloat coef; // value of the contribution
-
-} rapEntry_t;
-
-// qsort comparator for rapEntry_t: ascending coarse row I, then column J.
-int compareRAPEntries(const void *a, const void *b){
-  const rapEntry_t *lhs = (const rapEntry_t *) a;
-  const rapEntry_t *rhs = (const rapEntry_t *) b;
-
-  // row is the primary key
-  if (lhs->I != rhs->I)
-    return (lhs->I < rhs->I) ? -1 : +1;
-
-  // column orders entries within a row
-  if (lhs->J != rhs->J)
-    return (lhs->J < rhs->J) ? -1 : +1;
-
-  return 0;
-}
-
-csr *galerkinProd(agmgLevel *level, csr *R, csr *A, csr *P){
-
-  // MPI info
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-
-  hlong *globalAggStarts = level->globalAggStarts;
-  // hlong *globalRowStarts = level->globalRowStarts;
-
-  hlong globalAggOffset = globalAggStarts[rank];
-
-  //The galerkin product can be computed as
-  // (RAP)_IJ = sum_{i in Agg_I} sum_{j in Agg_j} P_iI A_ij P_jJ
-  // Since each row of P has only one entry, we can share the ncessary
-  // P entries, form the products, and send them to their destination rank
-
-  dlong N = A->Nrows;
-  dlong M = A->Ncols;
-
-  //printf("Level has %d rows, and is making %d aggregates\n", N, globalAggStarts[rank+1]-globalAggStarts[rank]);
-
-  pEntry_t *PEntries;
-  if (M) 
-    PEntries = (pEntry_t *) calloc(M,sizeof(pEntry_t));
-  else 
-    PEntries = (pEntry_t *) calloc(1,sizeof(pEntry_t));
-
-  //record the entries of P that this rank has
-  dlong cnt =0;
-  for (dlong i=0;i<N;i++) {
-    for (dlong j=P->diagRowStarts[i];j<P->diagRowStarts[i+1];j++) {
-      PEntries[cnt].coarseId = P->diagCols[j] + globalAggOffset; //global ID
-      PEntries[cnt].coef = P->diagCoefs[j];
-      cnt++;
-    }
-    for (dlong j=P->offdRowStarts[i];j<P->offdRowStarts[i+1];j++) {
-      PEntries[cnt].coarseId = P->colMap[P->offdCols[j]]; //global ID
-      PEntries[cnt].coef = P->offdCoefs[j];
-      cnt++;
-    }
-  }
-
-  pEntry_t *entrySendBuffer;
-  if (A->NsendTotal)
-    entrySendBuffer = (pEntry_t *) calloc(A->NsendTotal,sizeof(pEntry_t));
-
-  //fill in the entires of P needed in the halo
-  csrHaloExchange(A, sizeof(pEntry_t), PEntries, entrySendBuffer, PEntries+A->NlocalCols);
-  if (A->NsendTotal) free(entrySendBuffer);
-
-  rapEntry_t *RAPEntries;
-  dlong totalNNZ = A->diagNNZ+A->offdNNZ;
-  if (totalNNZ) 
-    RAPEntries = (rapEntry_t *) calloc(totalNNZ,sizeof(rapEntry_t));
-  else 
-    RAPEntries = (rapEntry_t *) calloc(1,sizeof(rapEntry_t)); //MPI_AlltoAll doesnt like null pointers
-  
-  // Make the MPI_RAPENTRY_T data type
-  MPI_Datatype MPI_RAPENTRY_T;
-  MPI_Datatype dtype[3] = {MPI_HLONG, MPI_HLONG, MPI_DFLOAT};
-  int blength[3] = {1, 1, 1};
-  MPI_Aint addr[3], displ[3];
-  MPI_Get_address ( &(RAPEntries[0]     ), addr+0);
-  MPI_Get_address ( &(RAPEntries[0].J   ), addr+1);
-  MPI_Get_address ( &(RAPEntries[0].coef), addr+2);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  MPI_Type_create_struct (3, blength, displ, dtype, &MPI_RAPENTRY_T);
-  MPI_Type_commit (&MPI_RAPENTRY_T);
-
-  //for the RAP products
-  cnt =0;
-  for (dlong i=0;i<N;i++) {
-    for (dlong j=A->diagRowStarts[i];j<A->diagRowStarts[i+1];j++) {
-      dlong col  = A->diagCols[j];
-      dfloat coef = A->diagCoefs[j];
-
-      RAPEntries[cnt].I = PEntries[i].coarseId;
-      RAPEntries[cnt].J = PEntries[col].coarseId;
-      RAPEntries[cnt].coef = coef*PEntries[i].coef*PEntries[col].coef;
-      cnt++;
-    }
-  }
-  for (dlong i=0;i<N;i++) {
-    for (dlong j=A->offdRowStarts[i];j<A->offdRowStarts[i+1];j++) {
-      dlong col  = A->offdCols[j];
-      dfloat coef = A->offdCoefs[j];
-
-      RAPEntries[cnt].I = PEntries[i].coarseId;
-      RAPEntries[cnt].J = PEntries[col].coarseId;
-      RAPEntries[cnt].coef = PEntries[i].coef*coef*PEntries[col].coef;
-      cnt++;
-    }
-  }
-
-  //sort entries by the coarse row and col
-  if (totalNNZ) qsort(RAPEntries, totalNNZ, sizeof(rapEntry_t), compareRAPEntries);
-
-  int *sendCounts = (int *) calloc(size,sizeof(int));
-  int *recvCounts = (int *) calloc(size,sizeof(int));
-  int *sendOffsets = (int *) calloc(size+1,sizeof(int));
-  int *recvOffsets = (int *) calloc(size+1,sizeof(int));
-
-  for(dlong i=0;i<totalNNZ;++i) {
-    hlong id = RAPEntries[i].I;
-    for (int r=0;r<size;r++) {
-      if (globalAggStarts[r]-1<id && id < globalAggStarts[r+1])
-        sendCounts[r]++;
-    }
-  }
-
-  // find how many nodes to expect (should use sparse version)
-  MPI_Alltoall(sendCounts, 1, MPI_INT, recvCounts, 1, MPI_INT, agmg::comm);
-
-  // find send and recv offsets for gather
-  dlong recvNtotal = 0;
-  for(int r=0;r<size;++r){
-    sendOffsets[r+1] = sendOffsets[r] + sendCounts[r];
-    recvOffsets[r+1] = recvOffsets[r] + recvCounts[r];
-    recvNtotal += recvCounts[r];
-  }
-  rapEntry_t *recvRAPEntries;
-  if (recvNtotal) 
-    recvRAPEntries = (rapEntry_t *) calloc(recvNtotal,sizeof(rapEntry_t));
-  else 
-    recvRAPEntries = (rapEntry_t *) calloc(1,sizeof(rapEntry_t));//MPI_AlltoAll doesnt like null pointers
-  
-  MPI_Alltoallv(    RAPEntries, sendCounts, sendOffsets, MPI_RAPENTRY_T,
-                recvRAPEntries, recvCounts, recvOffsets, MPI_RAPENTRY_T,
-                agmg::comm);
-
-  //sort entries by the coarse row and col
-  if (recvNtotal) qsort(recvRAPEntries, recvNtotal, sizeof(rapEntry_t), compareRAPEntries);
-
-  //count total number of nonzeros;
-  dlong nnz =0;
-  if (recvNtotal) nnz++;
-  for (dlong i=1;i<recvNtotal;i++)
-    if ((recvRAPEntries[i].I!=recvRAPEntries[i-1].I)||
-          (recvRAPEntries[i].J!=recvRAPEntries[i-1].J)) nnz++;
-
-  rapEntry_t *newRAPEntries;
-  if (nnz)
-    newRAPEntries = (rapEntry_t *) calloc(nnz,sizeof(rapEntry_t));
-  else 
-    newRAPEntries = (rapEntry_t *) calloc(1,sizeof(rapEntry_t));
-  
-  //compress nonzeros
-  nnz = 0;
-  if (recvNtotal) newRAPEntries[nnz++] = recvRAPEntries[0];
-  for (dlong i=1;i<recvNtotal;i++) {
-    if ((recvRAPEntries[i].I!=recvRAPEntries[i-1].I)||
-          (recvRAPEntries[i].J!=recvRAPEntries[i-1].J)) {
-      newRAPEntries[nnz++] = recvRAPEntries[i];
-    } else {
-      newRAPEntries[nnz-1].coef += recvRAPEntries[i].coef;
-    }
-  }
-
-  dlong numAggs = (dlong) (globalAggStarts[rank+1]-globalAggStarts[rank]); //local number of aggregates
-
-  csr *RAP = (csr*) calloc(1,sizeof(csr));
-
-  RAP->Nrows = numAggs;
-  RAP->Ncols = numAggs;
-
-  RAP->NlocalCols = numAggs;
-
-  RAP->diagRowStarts = (dlong *) calloc(numAggs+1, sizeof(dlong));
-  RAP->offdRowStarts = (dlong *) calloc(numAggs+1, sizeof(dlong));
-
-  for (dlong n=0;n<nnz;n++) {
-    dlong row = (dlong) (newRAPEntries[n].I - globalAggOffset);
-    if ((newRAPEntries[n].J > globalAggStarts[rank]-1)&&
-          (newRAPEntries[n].J < globalAggStarts[rank+1])) {
-      RAP->diagRowStarts[row+1]++;
-    } else {
-      RAP->offdRowStarts[row+1]++;
-    }
-  }
-
-  // cumulative sum
-  for(dlong i=0; i<numAggs; i++) {
-    RAP->diagRowStarts[i+1] += RAP->diagRowStarts[i];
-    RAP->offdRowStarts[i+1] += RAP->offdRowStarts[i];
-  }
-  RAP->diagNNZ = RAP->diagRowStarts[numAggs];
-  RAP->offdNNZ = RAP->offdRowStarts[numAggs];
-
-  dlong *diagCols;
-  dfloat *diagCoefs;
-  if (RAP->diagNNZ) {
-    RAP->diagCols  = (dlong *)   calloc(RAP->diagNNZ, sizeof(dlong));
-    RAP->diagCoefs = (dfloat *) calloc(RAP->diagNNZ, sizeof(dfloat));
-    diagCols  = (dlong *)   calloc(RAP->diagNNZ, sizeof(dlong));
-    diagCoefs = (dfloat *) calloc(RAP->diagNNZ, sizeof(dfloat));
-  }
-  hlong *offdCols;
-  if (RAP->offdNNZ) {
-    offdCols  = (hlong *)   calloc(RAP->offdNNZ,sizeof(hlong));
-    RAP->offdCols  = (dlong *)   calloc(RAP->offdNNZ,sizeof(dlong));
-    RAP->offdCoefs = (dfloat *) calloc(RAP->offdNNZ, sizeof(dfloat));
-  }
-
-  dlong diagCnt =0;
-  dlong offdCnt =0;
-  for (dlong n=0;n<nnz;n++) {
-    if ((newRAPEntries[n].J > globalAggStarts[rank]-1)&&
-          (newRAPEntries[n].J < globalAggStarts[rank+1])) {
-      diagCols[diagCnt]  = (dlong) (newRAPEntries[n].J - globalAggOffset);
-      diagCoefs[diagCnt] = newRAPEntries[n].coef;
-      diagCnt++;
-    } else {
-      offdCols[offdCnt]  = newRAPEntries[n].J;
-      RAP->offdCoefs[offdCnt] = newRAPEntries[n].coef;
-      offdCnt++;
-    }
-  }
-
-  //move diagonal entries first
-  for (dlong i=0;i<RAP->Nrows;i++) {
-    dlong start = RAP->diagRowStarts[i];
-    int cnt = 1;
-    for (dlong j=RAP->diagRowStarts[i]; j<RAP->diagRowStarts[i+1]; j++) {
-      if (diagCols[j] == i) { //move diagonal to first entry
-        RAP->diagCols[start] = diagCols[j];
-        RAP->diagCoefs[start] = diagCoefs[j];
-      } else {
-        RAP->diagCols[start+cnt] = diagCols[j];
-        RAP->diagCoefs[start+cnt] = diagCoefs[j];
-        cnt++;
-      }
-    }
-  }
-
-  //record global indexing of columns
-  RAP->colMap = (hlong *)   calloc(RAP->Ncols, sizeof(hlong));
-  for (dlong i=0;i<RAP->Ncols;i++)
-    RAP->colMap[i] = i + globalAggOffset;
-
-  if (RAP->offdNNZ) {
-    //we now need to reorder the x vector for the halo, and shift the column indices
-    hlong *col = (hlong *) calloc(RAP->offdNNZ,sizeof(hlong));
-    for (dlong n=0;n<RAP->offdNNZ;n++)
-      col[n] = offdCols[n]; //copy non-local column global ids
-
-    //sort by global index
-    std::sort(col,col+RAP->offdNNZ);
-
-    //count unique non-local column ids
-    RAP->NHalo = 0;
-    for (dlong n=1;n<RAP->offdNNZ;n++)
-      if (col[n]!=col[n-1])  col[++RAP->NHalo] = col[n];
-    RAP->NHalo++; //number of unique columns
-
-    RAP->Ncols += RAP->NHalo;
-
-    //save global column ids in colMap
-    RAP->colMap = (hlong *) realloc(RAP->colMap,RAP->Ncols*sizeof(hlong));
-    for (dlong n=0; n<RAP->NHalo; n++)
-      RAP->colMap[n+RAP->NlocalCols] = col[n];
-
-    //shift the column indices to local indexing
-    for (dlong n=0;n<RAP->offdNNZ;n++) {
-      hlong gcol = offdCols[n];
-      for (dlong m=RAP->NlocalCols;m<RAP->Ncols;m++) {
-        if (gcol == RAP->colMap[m])
-          RAP->offdCols[n] = m;
-      }
-    }
-    free(col);
-    free(offdCols);
-  }
-  csrHaloSetup(RAP,globalAggStarts);
-
-  //clean up
-  MPI_Barrier(agmg::comm);
-  MPI_Type_free(&MPI_RAPENTRY_T);
-
-  free(PEntries);
-  free(sendCounts); free(recvCounts);
-  free(sendOffsets); free(recvOffsets);
-  if (RAP->diagNNZ) {
-    free(diagCols);
-    free(diagCoefs);
-  }
-  free(RAPEntries);
-  free(newRAPEntries);
-  free(recvRAPEntries);
-
-  return RAP;
-}
-
diff --git a/solvers/parALMOND/src/almondKernels.c b/solvers/parALMOND/src/almondKernels.c
deleted file mode 100644
index adcb7a944..000000000
--- a/solvers/parALMOND/src/almondKernels.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "agmg.h"
-
-void buildAlmondKernels(parAlmond_t *parAlmond){
-
-  int rank, size;
-  rank = agmg::rank;
-  size = agmg::size;
-  
-  occa::properties kernelInfo;
- kernelInfo["defines"].asObject();
- kernelInfo["includes"].asArray();
- kernelInfo["header"].asArray();
- kernelInfo["flags"].asObject();
-
-
-  kernelInfo["defines/" "bdim"]= AGMGBDIM;
-  kernelInfo["defines/" "simd"]= SIMDWIDTH;
-
-  if(sizeof(dlong)==4){
-    kernelInfo["defines/" "dlong"]="int";
-  }
-  if(sizeof(dlong)==8){
-    kernelInfo["defines/" "dlong"]="long long int";
-  }
-
-  if(sizeof(dfloat) == sizeof(double)){
-    kernelInfo["defines/" "dfloat"]= "double";
-    kernelInfo["defines/" "dfloat4"]= "double4";
-  }
-  else if(sizeof(dfloat) == sizeof(float)){
-    kernelInfo["defines/" "dfloat"]= "float";
-    kernelInfo["defines/" "dfloat4"]= "float4";
-  }
-
-  kernelInfo["defines/" "p_RDIMX"]= RDIMX;
-  kernelInfo["defines/" "p_RDIMY"]= RDIMY;
-
-  kernelInfo["includes"] += DPWD "/okl/twoPhaseReduction.h";
-
-  if(parAlmond->device.mode()=="OpenCL"){
-    //    parAlmond->device.setCompilerFlags("-cl-opt-disable");
-    kernelInfo["compiler_flags"] += "-cl-opt-disable";
-  }
-
-  if(parAlmond->device.mode()=="CUDA"){ // add backend compiler optimization for CUDA
-    kernelInfo["compiler_flags"] += "--ftz=true";
-    kernelInfo["compiler_flags"] += "--prec-div=false";
-    kernelInfo["compiler_flags"] += "--prec-sqrt=false";
-    kernelInfo["compiler_flags"] += "--use_fast_math";
-    kernelInfo["compiler_flags"] += "--fmad=true"; // compiler option for cuda
-  }
-
-  if (rank==0) printf("Compiling parALMOND Kernels \n");
-
-  for (int r=0;r<size;r++) {
-    if (r==rank) {
-      parAlmond->ellAXPYKernel = parAlmond->device.buildKernel(DPWD "/okl/ellAXPY.okl",
-           "ellAXPY", kernelInfo);
-
-      parAlmond->ellZeqAXPYKernel = parAlmond->device.buildKernel(DPWD "/okl/ellAXPY.okl",
-              "ellZeqAXPY", kernelInfo);
-
-      parAlmond->ellJacobiKernel = parAlmond->device.buildKernel(DPWD "/okl/ellAXPY.okl",
-              "ellJacobi", kernelInfo);
-
-      parAlmond->cooAXKernel = parAlmond->device.buildKernel(DPWD "/okl/cooAX.okl",
-             "cooAXKernel", kernelInfo);
-
-      parAlmond->scaleVectorKernel = parAlmond->device.buildKernel(DPWD "/okl/scaleVector.okl",
-             "scaleVectorKernel", kernelInfo);
-
-      parAlmond->sumVectorKernel = parAlmond->device.buildKernel(DPWD "/okl/sumVector.okl",
-             "sumVectorKernel", kernelInfo);
-
-      parAlmond->addScalarKernel = parAlmond->device.buildKernel(DPWD "/okl/addScalar.okl",
-             "addScalarKernel", kernelInfo);
-
-      parAlmond->vectorAddKernel = parAlmond->device.buildKernel(DPWD "/okl/vectorAdd.okl",
-             "vectorAddKernel", kernelInfo);
-
-      parAlmond->vectorAddKernel2 = parAlmond->device.buildKernel(DPWD "/okl/vectorAdd.okl",
-              "vectorAddKernel2", kernelInfo);
-
-      parAlmond->setVectorKernel = parAlmond->device.buildKernel(DPWD "/okl/setVector.okl",
-              "setVectorKernel", kernelInfo);
-
-      parAlmond->dotStarKernel = parAlmond->device.buildKernel(DPWD "/okl/dotStar.okl",
-               "dotStarKernel", kernelInfo);
-
-      parAlmond->simpleDotStarKernel = parAlmond->device.buildKernel(DPWD "/okl/dotStar.okl",
-               "simpleDotStarKernel", kernelInfo);
-
-      parAlmond->haloExtract = parAlmond->device.buildKernel(DPWD "/okl/haloExtract.okl",
-                "haloExtract", kernelInfo);
-
-      parAlmond->agg_interpolateKernel = parAlmond->device.buildKernel(DPWD "/okl/agg_interpolate.okl",
-                 "agg_interpolate", kernelInfo);
-
-      parAlmond->innerProdKernel = parAlmond->device.buildKernel(DPWD "/okl/innerProduct.okl",
-                 "innerProductKernel", kernelInfo);
-
-      parAlmond->vectorAddInnerProdKernel = parAlmond->device.buildKernel(DPWD "/okl/vectorAddInnerProduct.okl",
-                 "vectorAddInnerProductKernel", kernelInfo);
-
-      parAlmond->kcycleCombinedOp1Kernel = parAlmond->device.buildKernel(DPWD "/okl/kcycleCombinedOp.okl",
-                 "kcycleCombinedOp1Kernel", kernelInfo);
-
-      parAlmond->kcycleCombinedOp2Kernel = parAlmond->device.buildKernel(DPWD "/okl/kcycleCombinedOp.okl",
-                 "kcycleCombinedOp2Kernel", kernelInfo);
-
-      parAlmond->vectorAddWeightedInnerProdKernel = parAlmond->device.buildKernel(DPWD "/okl/vectorAddInnerProduct.okl",
-                 "vectorAddWeightedInnerProductKernel", kernelInfo);
-
-      parAlmond->kcycleWeightedCombinedOp1Kernel = parAlmond->device.buildKernel(DPWD "/okl/kcycleCombinedOp.okl",
-                 "kcycleWeightedCombinedOp1Kernel", kernelInfo);
-
-      parAlmond->kcycleWeightedCombinedOp2Kernel = parAlmond->device.buildKernel(DPWD "/okl/kcycleCombinedOp.okl",
-                 "kcycleWeightedCombinedOp2Kernel", kernelInfo);
-    }
-    MPI_Barrier(agmg::comm);
-  }
-}
diff --git a/solvers/parALMOND/src/parAlmond.c b/solvers/parALMOND/src/parAlmond.c
deleted file mode 100644
index 5e3cca31d..000000000
--- a/solvers/parALMOND/src/parAlmond.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "agmg.h"
-
-void parAlmondPrecon(parAlmond_t *parAlmond, occa::memory o_x, occa::memory o_rhs) {
-
-  agmgLevel *baseLevel = parAlmond->levels[0];
-  setupAide options = parAlmond->options;
-
-  if (baseLevel->gatherLevel==true) {// gather rhs
-    baseLevel->device_gather(baseLevel->gatherArgs, o_rhs, baseLevel->o_rhs);
-  } else {
-    baseLevel->o_rhs.copyFrom(o_rhs);
-  }
-
-  if (options.compareArgs("PARALMOND CYCLE", "HOST")) {
-    //host versions
-    baseLevel->o_rhs.copyTo(baseLevel->rhs);
-    if(options.compareArgs("PARALMOND CYCLE", "EXACT")) {
-      if(parAlmond->ktype == PCG) {
-        pcg(parAlmond,1000,1e-8);
-      } else if(parAlmond->ktype == GMRES) {
-        pgmres(parAlmond,1000,1e-8);
-      }
-    } else if(options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-      kcycle(parAlmond, 0);
-    } else if(options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-      vcycle(parAlmond, 0);
-    }
-    baseLevel->o_x.copyFrom(baseLevel->x);
-  } else {
-    if(options.compareArgs("PARALMOND CYCLE", "EXACT")){
-      if(parAlmond->ktype == PCG) {
-        device_pcg(parAlmond,1000,1e-8);
-      } else if(parAlmond->ktype == GMRES) {
-        device_pgmres(parAlmond,1000,1e-8);
-      }
-    } else if(options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-      device_kcycle(parAlmond, 0);
-    } else if(options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-      device_vcycle(parAlmond, 0);
-    }
-  }
-
-  if (baseLevel->gatherLevel==true) {// scatter solution
-    baseLevel->device_scatter(baseLevel->scatterArgs, baseLevel->o_x, o_x);
-  } else {
-    baseLevel->o_x.copyTo(o_x,baseLevel->Nrows*sizeof(dfloat));
-  }
-}
-
-parAlmond_t *parAlmondInit(mesh_t *mesh, setupAide options) {
-
-  parAlmond_t *parAlmond = (parAlmond_t *) calloc(1,sizeof(parAlmond_t));
-
-  parAlmond->device = mesh->device;
-  parAlmond->defaultStream = mesh->defaultStream;
-  parAlmond->dataStream = mesh->dataStream;
-  parAlmond->options = options;
-
-  parAlmond->levels = (agmgLevel **) calloc(MAX_LEVELS,sizeof(agmgLevel *));
-  parAlmond->numLevels = 0;
-  
-  if (options.compareArgs("PARALMOND CYCLE", "NONSYM")) {
-    parAlmond->ktype = GMRES;  
-  } else {
-    parAlmond->ktype = PCG;
-  }
-
-  agmg::rank = mesh->rank;
-  agmg::size = mesh->size;
-  MPI_Comm_dup(mesh->comm, &(agmg::comm));
-  
-  buildAlmondKernels(parAlmond);
-
-  return parAlmond;
-}
-
-void parAlmondAgmgSetup(parAlmond_t *parAlmond,
-                         hlong* globalRowStarts,       //global partition
-                         dlong nnz,                    //--
-                         hlong* Ai,                    //-- Local A matrix data (globally indexed, COO storage, row sorted)
-                         hlong* Aj,                    //--
-                         dfloat* Avals,                //--
-                         bool nullSpace,
-                         dfloat nullSpacePenalty){   
-
-  int size, rank;
-  size = agmg::size;
-  rank = agmg::rank;
-
-  hlong TotalRows = globalRowStarts[size];
-  dlong numLocalRows = (dlong) (globalRowStarts[rank+1]-globalRowStarts[rank]);
-
-  if(rank==0) printf("Setting up AMG...");fflush(stdout);
-
-  csr *A = newCSRfromCOO(numLocalRows,globalRowStarts,nnz, Ai, Aj, Avals);
-
-  //record if there is null space
-  parAlmond->nullSpace = nullSpace;
-  parAlmond->nullSpacePenalty = nullSpacePenalty;
-
-  //populate null space vector
-  dfloat *nullA = (dfloat *) calloc(numLocalRows, sizeof(dfloat));
-  for (dlong i=0;i<numLocalRows;i++) nullA[i] = 1/sqrt(TotalRows);
-
-  agmgSetup(parAlmond, A, nullA, globalRowStarts, parAlmond->options);
-  
-  if(rank==0) printf("done.\n");
-
-  if (parAlmond->options.compareArgs("VERBOSE","TRUE"))
-    parAlmondReport(parAlmond);
-}
-
-//TODO code this
-int parAlmondFree(void* A) {
-  return 0;
-}
-
diff --git a/solvers/parALMOND/src/pcg.c b/solvers/parALMOND/src/pcg.c
deleted file mode 100644
index 921e46bcf..000000000
--- a/solvers/parALMOND/src/pcg.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "agmg.h"
-
-
-void pcg(parAlmond_t *parAlmond,
-         int maxIt,
-         dfloat tol){
-
-  csr *A = parAlmond->levels[0]->A;
-
-  const dlong m = A->Nrows;
-  // const dlong n = A->Ncols;
-
-  parAlmond->ktype = PCG;
-
-  // use parAlmond's buffers
-  dfloat *r = parAlmond->levels[0]->rhs;
-  dfloat *z = parAlmond->levels[0]->x;
-
-  // initial residual
-  dfloat rdotr0Local = innerProd(m, r, r);
-  dfloat rdotr0 = 0;
-  MPI_Allreduce(&rdotr0Local,&rdotr0,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-  dfloat *x, *p, *Ap;
-
-  x  = (dfloat *) calloc(m,sizeof(dfloat));
-  Ap = (dfloat *) calloc(m,sizeof(dfloat));
-  p  = (dfloat *) calloc(m,sizeof(dfloat));
-
-  //    x = 0;
-  setVector(m, x, 0.0);
-
-  //sanity check
-  if (rdotr0<=(tol*tol)) {
-    for (dlong i=0;i<m;i++)
-      parAlmond->levels[0]->x[i] = x[i];
-
-    free(x); free(p); free(Ap);
-    return;
-  }
-
-  // Precondition, z = M^{-1}*r
-  if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-    kcycle(parAlmond, 0);
-  } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-    vcycle(parAlmond, 0);
-  }
-  for (dlong i=0;i<m;i++)
-    p[i] = z[i];
-
-  dfloat rdotz0Local = innerProd(m, r, z);
-  dfloat rdotz0 = 0;
-  MPI_Allreduce(&rdotz0Local,&rdotz0,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-  dfloat rdotr1 = 0;
-  dfloat rdotz1 = 0;
-  dfloat alpha, beta, pAp;
-
-  int Niter = 0;
-  while(rdotr0>(tol*tol)){
-    //   Ap = A*p;
-    axpy(A, 1.0, p, 0.0, Ap,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-
-    dfloat pApLocal = innerProd(m, p, Ap);
-    pAp = 0;
-    MPI_Allreduce(&pApLocal,&pAp,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-    alpha = rdotz0/pAp;
-
-    // update solution
-    //    x = x + alpha * p;
-    vectorAdd(m, alpha, p, 1.0, x);
-
-    // update residual
-    // r = r - alpha * Ap;
-    vectorAdd(m, -alpha, Ap, 1.0, r);
-
-
-    dfloat rdotr1Local = innerProd(m, r, r);
-    rdotr1 = 0;
-    MPI_Allreduce(&rdotr1Local,&rdotr1,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-    if(rdotr1 < tol*tol) {
-      rdotr0 = rdotr1;
-      break;
-    }
-
-    // Precondition, z = M^{-1}*r
-    if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-      kcycle(parAlmond, 0);
-    } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-      vcycle(parAlmond, 0);
-    }
-
-    dfloat rdotz1Local = innerProd(m, r, z);
-    rdotz1 = 0;
-    MPI_Allreduce(&rdotz1Local,&rdotz1,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-  #if 1
-    // flexible pcg beta = (z.(-alpha*Ap))/zdotz0
-    dfloat zdotApLocal = innerProd(m, z, Ap);
-    dfloat zdotAp = 0;
-    MPI_Allreduce(&zdotApLocal,&zdotAp,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-    beta = -alpha*zdotAp/rdotz0;
-  #else
-    beta = rdotz1/rdotz0;
-  #endif
-
-    // p = z + beta*p
-    vectorAdd(m, 1.0, z, beta, p);
-
-    // switch rdotz0 <= rdotz1
-    rdotz0 = rdotz1;
-
-    // switch rdotz0,rdotr0 <= rdotz1,rdotr1
-    rdotr0 = rdotr1;
-
-    Niter++;
-
-    printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0));
-
-    if(Niter==maxIt) break;
-  }
-
-  //copy result back to parAlmond's x storage
-  for (dlong i=0;i<m;i++)
-    parAlmond->levels[0]->x[i] = x[i];
-
-  free(x); free(p); free(Ap);
-}
-
-void device_pcg(parAlmond_t *parAlmond, int maxIt, dfloat tol){
-
-  hyb* A = parAlmond->levels[0]->deviceA;
-
-  const dlong m = A->Nrows;
-  const dlong n = A->Ncols;
-
-  parAlmond->ktype = PCG;
-
-  // use parAlmond's buffers
-  occa::memory &o_r = parAlmond->levels[0]->o_rhs;
-  occa::memory &o_z = parAlmond->levels[0]->o_x;
-
-  // initial residual
-  dfloat rdotr0Local = innerProd(parAlmond, m, o_r, o_r);
-  dfloat rdotr0 = 0;
-  MPI_Allreduce(&rdotr0Local,&rdotr0,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-  occa::memory o_x, o_p, o_Ap;
-
-  o_x  = parAlmond->device.malloc(n*sizeof(dfloat),parAlmond->levels[0]->x);
-  o_Ap = parAlmond->device.malloc(n*sizeof(dfloat),parAlmond->levels[0]->x);
-  o_p  = parAlmond->device.malloc(n*sizeof(dfloat),parAlmond->levels[0]->x);
-
-  //    x = 0;
-  setVector(parAlmond, m, o_x, 0.0);
-
-  //sanity check
-  if (rdotr0<=(tol*tol)) {
-    parAlmond->levels[0]->o_x.copyFrom(o_x);
-    printf("Almond PCG iter %d, res = %g\n", 0, sqrt(rdotr0));
-    o_x.free(); o_p.free(); o_Ap.free();
-    return;
-  }
-
-  // Precondition, z = M^{-1}*r
-  if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-    device_kcycle(parAlmond, 0);
-  } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-    device_vcycle(parAlmond, 0);
-  }
-  o_p.copyFrom(o_z);
-
-  dfloat rdotz0Local = innerProd(parAlmond, m, o_r, o_z);
-  dfloat rdotz0 = 0;
-  MPI_Allreduce(&rdotz0Local,&rdotz0,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-  dfloat rdotr1 = 0;
-  dfloat rdotz1 = 0;
-  dfloat alpha, beta, pAp;
-
-  int Niter = 0;
-  while(rdotr0>(tol*tol)){
-    //   Ap = A*p;
-    axpy(parAlmond, A, 1.0, o_p, 0.0, o_Ap,parAlmond->nullSpace,parAlmond->nullSpacePenalty);
-
-    dfloat pApLocal = innerProd(parAlmond, m, o_p, o_Ap);
-    pAp = 0;
-    MPI_Allreduce(&pApLocal,&pAp,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-    alpha = rdotz0/pAp;
-
-    // update solution
-    //    x = x + alpha * p;
-    vectorAdd(parAlmond, m, alpha, o_p, 1.0, o_x);
-
-    // update residual
-    // r = r - alpha * Ap;
-    vectorAdd(parAlmond, m, -alpha, o_Ap, 1.0, o_r);
-
-
-    dfloat rdotr1Local = innerProd(parAlmond, m, o_r, o_r);
-    rdotr1 = 0.;
-    MPI_Allreduce(&rdotr1Local,&rdotr1,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-    if(rdotr1 < tol*tol) {
-      rdotr0 = rdotr1;
-      break;
-    }
-
-    // Precondition, z = M^{-1}*r
-    if(parAlmond->options.compareArgs("PARALMOND CYCLE", "KCYCLE")) {
-      device_kcycle(parAlmond, 0);
-    } else if(parAlmond->options.compareArgs("PARALMOND CYCLE", "VCYCLE")) {
-      device_vcycle(parAlmond, 0);
-    }
-
-    dfloat rdotz1Local = innerProd(parAlmond, m, o_r, o_z);
-    rdotz1 = 0;
-    MPI_Allreduce(&rdotz1Local,&rdotz1,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-
-  #if 1
-    // flexible pcg beta = (z.(-alpha*Ap))/zdotz0
-    dfloat zdotApLocal = innerProd(parAlmond, m, o_z, o_Ap);
-    dfloat zdotAp = 0;
-    MPI_Allreduce(&zdotApLocal,&zdotAp,1,MPI_DFLOAT,MPI_SUM,agmg::comm);
-    beta = -alpha*zdotAp/rdotz0;
-  #else
-    beta = rdotz1/rdotz0;
-  #endif
-
-    // p = z + beta*p
-    vectorAdd(parAlmond, m, 1.0, o_z, beta, o_p);
-
-    // switch rdotz0 <= rdotz1
-    rdotz0 = rdotz1;
-
-    // switch rdotz0,rdotr0 <= rdotz1,rdotr1
-    rdotr0 = rdotr1;
-
-    Niter++;
-
-    //printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0));
-
-    if(Niter==maxIt) break;
-  }
-
-  //copy result back to parAlmond's x storage
-  parAlmond->levels[0]->o_x.copyFrom(o_x);
-
-  printf("Almond PCG iter %d, res = %g\n", Niter, sqrt(rdotr0));
-
-  o_x.free(); o_p.free(); o_Ap.free();
-}
-
-
diff --git a/solvers/parALMOND/src/vectorPrimitives.c b/solvers/parALMOND/src/vectorPrimitives.c
deleted file mode 100644
index 7d2f6d036..000000000
--- a/solvers/parALMOND/src/vectorPrimitives.c
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus, Rajesh Gandham
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#include "agmg.h"
-
-dfloat norm(dlong n, dfloat *a){
-  dfloat result = 0.;
-  #pragma omp parallel for reduction(+:result)
-  for(dlong i=0; i<n; i++){
-    result += a[i]*a[i];
-  }
-  return sqrt(result);
-}
-
-dfloat innerProd(dlong n, dfloat *a, dfloat *b){
-  dfloat result = 0.;
-  #pragma omp parallel for reduction(+:result)
-  for(dlong i=0; i<n; i++)
-    result += a[i]*b[i];
-  return result;
-}
-
-void doubleInnerProd(dlong n, dfloat *aDotbc, dfloat *a, dfloat *b, dfloat *c) {
-  dfloat aDotb = 0.;
-  dfloat aDotc = 0.;
-  #pragma omp parallel for reduction(+:aDotb) reduction(+:aDotc)
-  for(dlong i=0; i<n; i++) {
-    aDotb += a[i]*b[i];
-    aDotc += a[i]*c[i];
-  }
-  aDotbc[0] = aDotb;
-  aDotbc[1] = aDotc;
-}
-
-// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b,
-void kcycleCombinedOp1(dlong n, dfloat *aDotbc, dfloat *a, 
-                      dfloat *b, dfloat *c, dfloat* w, bool weighted) {
-  dfloat aDotb = 0.;
-  dfloat aDotc = 0.;
-  dfloat bDotb = 0.;
-  if (weighted) {
-    #pragma omp parallel for reduction(+:aDotb) reduction(+:aDotc) reduction(+:bDotb)
-    for(dlong i=0; i<n; i++) {
-      aDotb += w[i]*a[i]*b[i];
-      aDotc += w[i]*a[i]*c[i];
-      bDotb += w[i]*b[i]*b[i];
-    }
-  } else {
-    #pragma omp parallel for reduction(+:aDotb) reduction(+:aDotc) reduction(+:bDotb)
-    for(dlong i=0; i<n; i++) {
-      aDotb += a[i]*b[i];
-      aDotc += a[i]*c[i];
-      bDotb += b[i]*b[i];
-    }
-  }
-  aDotbc[0] = aDotb;
-  aDotbc[1] = aDotc;
-  aDotbc[2] = bDotb;
-}
-
-// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d,
-void kcycleCombinedOp2(dlong n, dfloat *aDotbcd, dfloat *a, dfloat *b, 
-                        dfloat *c, dfloat* d, dfloat *w, bool weighted) {
-  dfloat aDotb = 0.;
-  dfloat aDotc = 0.;
-  dfloat aDotd = 0.;
-  if (weighted) {
-    #pragma omp parallel for reduction(+:aDotb) reduction(+:aDotc) reduction(+:aDotd)
-    for(dlong i=0; i<n; i++) {
-      aDotb += w[i]*a[i]*b[i];
-      aDotc += w[i]*a[i]*c[i];
-      aDotd += w[i]*a[i]*d[i];
-    }  
-  } else {
-    #pragma omp parallel for reduction(+:aDotb) reduction(+:aDotc) reduction(+:aDotd)
-    for(dlong i=0; i<n; i++) {
-      aDotb += a[i]*b[i];
-      aDotc += a[i]*c[i];
-      aDotd += a[i]*d[i];
-    }  
-  }
-  
-  aDotbcd[0] = aDotb;
-  aDotbcd[1] = aDotc;
-  aDotbcd[2] = aDotd;
-}
-
-// y = beta*y + alpha*x
-void vectorAdd(dlong n, dfloat alpha, dfloat *x, dfloat beta, dfloat *y){
-  #pragma omp parallel for
-  for(dlong i=0; i<n; i++)
-    y[i] = beta*y[i] + alpha*x[i];
-}
-
-// y = beta*y + alpha*x, and return y\dot y
-dfloat vectorAddInnerProd(dlong n, dfloat alpha, dfloat *x, dfloat beta, dfloat *y,
-                          dfloat *w, bool weighted){
-  dfloat result = 0.;
-  if (weighted) {
-    #pragma omp parallel for reduction(+:result)
-    for(dlong i=0; i<n; i++) {
-      y[i] = beta*y[i] + alpha*x[i];
-      result += w[i]*y[i]*y[i];
-    }  
-  } else {
-    #pragma omp parallel for reduction(+:result)
-    for(dlong i=0; i<n; i++) {
-      y[i] = beta*y[i] + alpha*x[i];
-      result += y[i]*y[i];
-    }  
-  }
-  return result;
-}
-
-void dotStar(dlong m, dfloat *a, dfloat *b){
-  #pragma omp parallel for
-  for(dlong i=0; i<m; i++)
-    b[i] *= a[i];
-}
-
-void scaleVector(dlong m, dfloat *a, dfloat alpha){
-  #pragma omp parallel for
-  for(dlong i=0; i<m; i++)
-    a[i] *= alpha;
-}
-
-void setVector(dlong m, dfloat *a, dfloat alpha){
-  #pragma omp parallel for
-  for(dlong i=0; i<m; i++)
-    a[i] = alpha;
-}
-
-void addScalar(dlong m, dfloat alpha, dfloat *a){
-  #pragma omp parallel for
-  for(dlong i=0; i<m; i++)
-    a[i] += alpha;
-}
-
-dfloat sumVector(dlong m, dfloat *a){
-  dfloat alpha = 0.;
-
-  #pragma omp parallel for reduction(+:alpha)
-  for (dlong i=0; i<m; i++) {
-    alpha += a[i];
-  }
-  return alpha;
-}
-    
-void randomize(dlong m, dfloat *a){
-  for(dlong i=0; i<m; i++)
-    a[i] = (dfloat) drand48();
-}
-
-dfloat maxEntry(dlong n, dfloat *a){
-  if(n == 0)
-    return 0;
-
-  dfloat maxVal = 0.;
-  //  #pragma omp parallel for reduction(max:maxVal)
-  for(dlong i=0; i<n; i++){
-    dfloat a2 = (a[i] < 0) ? -a[i] : a[i];
-    if(maxVal < a2){
-      maxVal = a2;
-    }
-  }
-  return maxVal;
-}
-
-
-
-
-void scaleVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a, dfloat alpha){
-  if (N) parAlmond->scaleVectorKernel(N, alpha, o_a);
-}
-
-void setVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a, dfloat alpha){
-  if (N) parAlmond->setVectorKernel(N, alpha, o_a);
-}
-
-dfloat sumVector(parAlmond_t *parAlmond, dlong N, occa::memory o_a){
-  dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD;
-  if(!numBlocks) numBlocks = 1;
-
-  if (N) parAlmond->sumVectorKernel(numBlocks,N,o_a,parAlmond->o_rho);
-  parAlmond->o_rho.copyTo(parAlmond->rho,numBlocks*sizeof(dfloat),0);
-  
-  dfloat alpha =0.;
-  #pragma omp parallel for reduction(+:alpha)
-  for (dlong i=0; i<numBlocks; i++) {
-    alpha += parAlmond->rho[i];
-  }
-
-  return alpha;
-}
-
-void addScalar(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_a){
-  if (N) parAlmond->addScalarKernel(N, alpha, o_a);
-}
-
-void dotStar(parAlmond_t *parAlmond, dlong N, occa::memory o_a, occa::memory o_b){
-  if (N) parAlmond->simpleDotStarKernel(N, o_a, o_b);
-}
-
-void dotStar(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_a,
-	           occa::memory o_b, dfloat beta, occa::memory o_c){
-  if (N) parAlmond->dotStarKernel(N, alpha, beta, o_a, o_b, o_c);
-}
-
-dfloat innerProd(parAlmond_t *parAlmond, dlong N,
-                  occa::memory o_x, occa::memory o_y){
-
-  dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD;
-  if(!numBlocks) numBlocks = 1;
-
-  parAlmond->innerProdKernel(numBlocks,N,o_x,o_y,parAlmond->o_rho);
-  parAlmond->o_rho.copyTo(parAlmond->rho,numBlocks*sizeof(dfloat),0);
-  
-  dfloat result =0.;
-  #pragma omp parallel for reduction(+:result)
-  for (dlong i=0; i<numBlocks; i++) {
-    result += parAlmond->rho[i];
-  }
-
-  return result;
-}
-
-// returns aDotbc[0] = a\dot b, aDotbc[1] = a\dot c, aDotbc[2] = b\dot b,
-void kcycleCombinedOp1(parAlmond_t *parAlmond, dlong N, dfloat *aDotbc, 
-                        occa::memory o_a, occa::memory o_b, 
-                        occa::memory o_c, occa::memory o_w, bool weighted) {
-  dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD;
-  if(!numBlocks) numBlocks = 1;
-
-  if (weighted) {
-    parAlmond->kcycleWeightedCombinedOp1Kernel(numBlocks,N,o_a,o_b,o_c,o_w,parAlmond->o_rho);
-  } else {
-    parAlmond->kcycleCombinedOp1Kernel(numBlocks,N,o_a,o_b,o_c,parAlmond->o_rho);
-  }
-  parAlmond->o_rho.copyTo(parAlmond->rho,3*numBlocks*sizeof(dfloat),0);
-  
-  dfloat aDotb = 0., aDotc = 0., bDotb = 0.;
-  #pragma omp parallel for reduction(+:aDotb) reduction(+:aDotc) reduction(+:bDotb)
-  for(dlong i=0; i<numBlocks; i++) {
-    aDotb += parAlmond->rho[3*i+0];
-    aDotc += parAlmond->rho[3*i+1];
-    bDotb += parAlmond->rho[3*i+2];
-  }  
-
-  aDotbc[0] = aDotb;
-  aDotbc[1] = aDotc;
-  aDotbc[2] = bDotb;
-}
-
-// returns aDotbcd[0] = a\dot b, aDotbcd[1] = a\dot c, aDotbcd[2] = a\dot d,
-void kcycleCombinedOp2(parAlmond_t *parAlmond, dlong N, dfloat *aDotbcd, 
-                        occa::memory o_a, occa::memory o_b, 
-                        occa::memory o_c, occa::memory o_d,
-                        occa::memory o_w, bool weighted) {
-
-  dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD;
-  if(!numBlocks) numBlocks = 1;
-
-  if (weighted) {
-    parAlmond->kcycleWeightedCombinedOp2Kernel(numBlocks,N,o_a,o_b,o_c,o_d,o_w,parAlmond->o_rho);
-  } else {
-    parAlmond->kcycleCombinedOp2Kernel(numBlocks,N,o_a,o_b,o_c,o_d,parAlmond->o_rho);
-  }
-  parAlmond->o_rho.copyTo(parAlmond->rho,3*numBlocks*sizeof(dfloat),0);
-  
-  dfloat aDotb = 0., aDotc = 0., aDotd = 0.;
-  #pragma omp parallel for reduction(+:aDotb) reduction(+:aDotc) reduction(+:aDotd)
-  for(dlong i=0; i<numBlocks; i++) {
-    aDotb += parAlmond->rho[3*i+0];
-    aDotc += parAlmond->rho[3*i+1];
-    aDotd += parAlmond->rho[3*i+2];
-  }  
-
-  aDotbcd[0] = aDotb;
-  aDotbcd[1] = aDotc;
-  aDotbcd[2] = aDotd;
-}
-
-// y = beta*y + alpha*x, and return y\dot y
-dfloat vectorAddInnerProd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x,
-                                                          dfloat beta, occa::memory o_y,
-                                                          occa::memory o_w, bool weighted){
-  dlong numBlocks = ((N+RDIMX*RDIMY-1)/(RDIMX*RDIMY))/RLOAD;
-  if(!numBlocks) numBlocks = 1;
-
-  if (weighted) {
-    parAlmond->vectorAddWeightedInnerProdKernel(numBlocks,N,alpha,beta,o_x,o_y,o_w,parAlmond->o_rho);
-  } else {
-    parAlmond->vectorAddInnerProdKernel(numBlocks,N,alpha,beta,o_x,o_y,parAlmond->o_rho);
-  }
-  parAlmond->o_rho.copyTo(parAlmond->rho,numBlocks*sizeof(dfloat),0);
-  
-  dfloat result =0.;
-  #pragma omp parallel for reduction(+:result)
-  for (dlong i=0; i<numBlocks; i++) {
-    result += parAlmond->rho[i];
-  }
-
-  return result;
-}
-
-
-void vectorAdd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x, dfloat beta, occa::memory o_y){
-  parAlmond->vectorAddKernel(N, alpha, beta, o_x, o_y);
-}
-
-void vectorAdd(parAlmond_t *parAlmond, dlong N, dfloat alpha, occa::memory o_x,
-	 dfloat beta, occa::memory o_y, occa::memory o_z){
-  parAlmond->vectorAddKernel2(N, alpha, beta, o_x, o_y, o_z);
-}