From c938e003ccc136e79dccd93f548ff295f2c69068 Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:09:13 -0600
Subject: [PATCH 01/12] New comm_t object

---
 include/comm.hpp | 567 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 567 insertions(+)
 create mode 100644 include/comm.hpp
diff --git a/include/comm.hpp b/include/comm.hpp
new file mode 100644
index 0000000..7781da0
--- /dev/null
+++ b/include/comm.hpp
@@ -0,0 +1,567 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef LIBP_COMM_HPP
+#define LIBP_COMM_HPP
+
+#include <mpi.h>
+#include "core.hpp"
+
+namespace libp {
+
+#define MAX_PROCESSOR_NAME MPI_MAX_PROCESSOR_NAME
+
+/*Generic data type*/
+template<typename T>
+struct mpiType {
+  static MPI_Datatype getMpiType() {
+    MPI_Datatype type;
+    MPI_Type_contiguous(sizeof(T), MPI_CHAR, &type);
+    MPI_Type_commit(&type);
+    return type;
+  }
+  static void freeMpiType(MPI_Datatype type) {
+    MPI_Type_free(&type);
+  }
+  static constexpr bool isMpiType() { return false; }
+};
+
+/*Pre-defined MPI datatypes*/
+#define TYPE(T, MPI_T)                               \
+template<> struct mpiType<T> {                       \
+  static MPI_Datatype getMpiType() { return MPI_T; } \
+  static void freeMpiType(MPI_Datatype type) { }     \
+  static constexpr bool isMpiType() { return true; } \
+}
+
+TYPE(char,   MPI_CHAR);
+TYPE(int,    MPI_INT);
+TYPE(long long int, MPI_LONG_LONG_INT);
+TYPE(float,  MPI_FLOAT);
+TYPE(double, MPI_DOUBLE);
+#undef TYPE
+
+/*Communicator class*/
+class comm_t {
+
+ private:
+  std::shared_ptr<MPI_Comm> comm_ptr;
+  int _rank=0;
+  int _size=0;
+
+ public:
+  comm_t() = default;
+  comm_t(const comm_t &c) = default;
+  comm_t& operator = (const comm_t &c)=default;
+
+  /*Static MPI_Init and MPI_Finalize*/
+  static void Init(int &argc, char** &argv);
+  static void Finalize();
+
+  /*Static handle to MPI_COMM_WORLD*/
+  static comm_t world();
+
+  /*MPI_Comm_dup and MPI_Comm_delete*/
+  comm_t Dup() const;
+  comm_t Split(const int color, const int key) const;
+  void Free();
+
+  /*Rank and size getters*/
+  const int rank() const;
+  const int size() const;
+
+  /*MPI_Comm getter*/
+  MPI_Comm comm() const;
+
+  using request_t = MPI_Request;
+
+  /*Predefined ops*/
+  using op_t = MPI_Op;
+  static constexpr op_t Max  = MPI_MAX;
+  static constexpr op_t Min  = MPI_MIN;
+  static constexpr op_t Sum  = MPI_SUM;
+  static constexpr op_t Prod = MPI_PROD;
+  static constexpr op_t And  = MPI_LAND;
+  static constexpr op_t Or   = MPI_LOR;
+  static constexpr op_t Xor  = MPI_LXOR;
+
+  /*libp::memory send*/
+  template <template<typename> class mem, typename T>
+  void Send(mem<T> m,
+            const int dest,
+            const int count=-1,
+            const int tag=0) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Send(m.ptr(), cnt, type, dest, tag, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory recv*/
+  template <template<typename> class mem, typename T>
+  void Recv(mem<T> m,
+            const int source,
+            const int count=-1,
+            const int tag=0) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Recv(m.ptr(), cnt, type, source, tag, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar send*/
+  template <typename T>
+  void Send(T& val,
+            const int dest,
+            const int tag=0) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Send(&val, 1, type, dest, tag, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar recv*/
+  template <typename T>
+  void Recv(T& val,
+            const int source,
+            const int tag=0) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Recv(&val, 1, type, source, tag, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory non-blocking send*/
+  template <template<typename> class mem, typename T>
+  void Isend(mem<T> m,
+             const int dest,
+             const int count,
+             const int tag,
+             request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Isend(m.ptr(), count, type, dest, tag, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory non-blocking recv*/
+  template <template<typename> class mem, typename T>
+  void Irecv(mem<T> m,
+             const int source,
+             const int count,
+             const int tag,
+             request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Irecv(m.ptr(), count, type, source, tag, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar non-blocking send*/
+  template <typename T>
+  void Isend(T& val,
+             const int dest,
+             const int tag,
+             request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Isend(&val, 1, type, dest, tag, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar non-blocking recv*/
+  template <typename T>
+  void Irecv(T& val,
+             const int source,
+             const int tag,
+             request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Irecv(&val, 1, type, source, tag, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory broadcast*/
+  template <template<typename> class mem, typename T>
+  void Bcast(mem<T> m,
+             const int root,
+             const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Bcast(m.ptr(), cnt, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar broadcast*/
+  template <typename T>
+  void Bcast(T& val,
+             const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Bcast(&val, 1, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory reduce*/
+  template <template<typename> class mem, typename T>
+  void Reduce(const mem<T> snd,
+                    mem<T> rcv,
+              const int root,
+              const op_t op = Sum,
+              const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(snd.length()) : count;
+    MPI_Reduce(snd.ptr(), rcv.ptr(), cnt, type, op, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory in-place reduce*/
+  template <template<typename> class mem, typename T>
+  void Reduce(mem<T> m,
+              const int root,
+              const op_t op = Sum,
+              const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    if (_rank==root) {
+      MPI_Reduce(MPI_IN_PLACE, m.ptr(), cnt, type, op, root, comm());
+    } else {
+      MPI_Reduce(m.ptr(), nullptr, cnt, type, op, root, comm());
+    }
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar reduce*/
+  template <typename T>
+  void Reduce(const T& snd,
+                    T& rcv,
+              const int root,
+              const op_t op = Sum) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Reduce(&snd, &rcv, 1, type, op, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+  template <typename T>
+  void Reduce(T& val,
+              const int root,
+              const op_t op = Sum) const {
+    T rcv=val;
+    Reduce(val, rcv, root, op);
+    if (rank()==root) val=rcv;
+  }
+
+  /*libp::memory allreduce*/
+  template <template<typename> class mem, typename T>
+  void Allreduce(const mem<T> snd,
+                       mem<T> rcv,
+                 const op_t op = Sum,
+                 const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(snd.length()) : count;
+    MPI_Allreduce(snd.ptr(), rcv.ptr(), cnt, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory in-place allreduce*/
+  template <template<typename> class mem, typename T>
+  void Allreduce(mem<T> m,
+                 const op_t op = Sum,
+                 const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Allreduce(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar allreduce*/
+  template <typename T>
+  void Allreduce(const T& snd,
+                       T& rcv,
+                 const op_t op = Sum) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Allreduce(&snd, &rcv, 1, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+  template <typename T>
+  void Allreduce(T& val,
+                 const op_t op = Sum) const {
+    T rcv=val;
+    Allreduce(val, rcv, op);
+    val = rcv;
+  }
+
+  /*libp::memory non-blocking allreduce*/
+  template <template<typename> class mem, typename T>
+  void Iallreduce(const mem<T> snd,
+                        mem<T> rcv,
+                  const op_t op,
+                  const int count,
+                  request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Iallreduce(snd.ptr(), rcv.ptr(), count, type, op, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory non-blocking in-place allreduce*/
+  template <template<typename> class mem, typename T>
+  void Iallreduce(mem<T> m,
+                  const int root,
+                  const op_t op,
+                  const int count,
+                  request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Iallreduce(MPI_IN_PLACE, m.ptr(), count, type, op, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar non-blocking allreduce*/
+  template <template<typename> class mem, typename T>
+  void Iallreduce(const T& snd,
+                        T& rcv,
+                  const op_t op,
+                  request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Iallreduce(&snd, &rcv, 1, type, op, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+  /*scalar non-blocking in-place allreduce*/
+  template <template<typename> class mem, typename T>
+  void Iallreduce(T& val,
+                  const op_t op,
+                  request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Iallreduce(MPI_IN_PLACE, &val, 1, type, op, comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory scan*/
+  template <template<typename> class mem, typename T>
+  void Scan(const mem<T> snd,
+                  mem<T> rcv,
+            const op_t op = Sum,
+            const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(snd.length()) : count;
+    MPI_Scan(snd.ptr(), rcv.ptr(), cnt, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory in-place scan*/
+  template <template<typename> class mem, typename T>
+  void Scan(mem<T> m,
+            const op_t op = Sum,
+            const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(m.length()) : count;
+    MPI_Scan(MPI_IN_PLACE, m.ptr(), cnt, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar scan*/
+  template <typename T>
+  void Scan(const T& snd,
+                  T& rcv,
+            const op_t op = Sum) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Scan(&snd, &rcv, 1, type, op, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+  template <typename T>
+  void Scan(T& snd,
+            const op_t op = Sum) const {
+    T rcv=snd;
+    Scan(snd, rcv, op);
+    snd = rcv;
+  }
+
+  /*libp::memory gather*/
+  template <template<typename> class mem, typename T>
+  void Gather(const mem<T> snd,
+                    mem<T> rcv,
+              const int root,
+              const int sendCount=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (sendCount==-1) ? static_cast<int>(snd.length()) : sendCount;
+    MPI_Gather(snd.ptr(), cnt, type,
+               rcv.ptr(), cnt, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory gatherv*/
+  template <template<typename> class mem, typename T>
+  void Gatherv(const mem<T> snd,
+               const int sendcount,
+                     mem<T> rcv,
+               const memory<int> recvCounts,
+               const memory<int> recvOffsets,
+               const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Gatherv(snd.ptr(), sendcount, type,
+                rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
+                root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar gather*/
+  template <template<typename> class mem, typename T>
+  void Gather(const T& snd,
+                    mem<T> rcv,
+              const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Gather(&snd,      1, type,
+               rcv.ptr(), 1, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory scatter*/
+  template <template<typename> class mem, typename T>
+  void Scatter(const mem<T> snd,
+                     mem<T> rcv,
+               const int root,
+               const int count=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (count==-1) ? static_cast<int>(rcv.length()) : count;
+    MPI_Scatter(snd.ptr(), cnt, type,
+                rcv.ptr(), cnt, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory scatterv*/
+  template <template<typename> class mem, typename T>
+  void Scatterv(const mem<T> snd,
+                const memory<int> sendCounts,
+                const memory<int> sendOffsets,
+                      mem<T> rcv,
+                const int recvcount,
+                const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Scatterv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type,
+                 rcv.ptr(), recvcount, type,
+                 root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar scatter*/
+  template <template<typename> class mem, typename T>
+  void Scatter(T& rcv,
+               const mem<T> snd,
+               const int root) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Scatter(snd.ptr,   1, type,
+                &rcv,      1, type, root, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory allgather*/
+  template <template<typename> class mem, typename T>
+  void Allgather(const mem<T> snd,
+                       mem<T> rcv,
+                 const int sendCount=-1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    const int cnt = (sendCount==-1) ? static_cast<int>(snd.length()) : sendCount;
+    MPI_Allgather(snd.ptr(), cnt, type,
+                  rcv.ptr(), cnt, type, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+  template <template<typename> class mem, typename T>
+  void Allgather(mem<T> m,
+                 const int cnt) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Allgather(MPI_IN_PLACE, cnt, type,
+                  m.ptr(),      cnt, type, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory allgatherv*/
+  template <template<typename> class mem, typename T>
+  void Allgatherv(const mem<T> snd,
+                  const int sendcount,
+                        mem<T> rcv,
+                  const memory<int> recvCounts,
+                  const memory<int> recvOffsets) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Allgatherv(snd.ptr(), sendcount, type,
+                   rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
+                   comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*scalar allgather*/
+  template <template<typename> class mem, typename T>
+  void Allgather(const T& snd,
+                       mem<T> rcv) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Allgather(&snd,      1, type,
+                  rcv.ptr(), 1, type, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory alltoall*/
+  template <template<typename> class mem, typename T>
+  void Alltoall(const mem<T> snd,
+                      mem<T> rcv,
+                const int cnt=1) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Alltoall(snd.ptr(), cnt, type,
+                 rcv.ptr(), cnt, type, comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  /*libp::memory alltoallv*/
+  template <template<typename> class mem, typename T>
+  void Alltoallv(const mem<T> snd,
+                 const memory<int> sendCounts,
+                 const memory<int> sendOffsets,
+                       mem<T> rcv,
+                 const memory<int> recvCounts,
+                 const memory<int> recvOffsets) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Alltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type,
+                  rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
+                  comm());
+    mpiType<T>::freeMpiType(type);
+  }
+
+  template <template<typename> class mem, typename T>
+  void Ialltoallv(const mem<T> snd,
+                  const memory<int> sendCounts,
+                  const memory<int> sendOffsets,
+                        mem<T> rcv,
+                  const memory<int> recvCounts,
+                  const memory<int> recvOffsets,
+                  request_t &request) const {
+    MPI_Datatype type = mpiType<T>::getMpiType();
+    MPI_Ialltoallv(snd.ptr(), sendCounts.ptr(), sendOffsets.ptr(), type,
+                  rcv.ptr(), recvCounts.ptr(), recvOffsets.ptr(), type,
+                  comm(), &request);
+    mpiType<T>::freeMpiType(type);
+  }
+
+  void Wait(request_t &request) const;
+  void Waitall(const int count, memory<request_t> &requests) const;
+  void Barrier() const;
+
+  static void GetProcessorName(char* name, int &namelen) {
+    MPI_Get_processor_name(name,&namelen);
+  }
+};
+
+} //namespace libp
+
+#endif

From 395bb934a4a048f596bdb2d67c6b6755640f4097 Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:09:36 -0600
Subject: [PATCH 02/12] New comm_t object

---
 libs/core/comm.cpp | 110 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 libs/core/comm.cpp

diff --git a/libs/core/comm.cpp b/libs/core/comm.cpp
new file mode 100644
index 0000000..02b33dc
--- /dev/null
+++ b/libs/core/comm.cpp
@@ -0,0 +1,110 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "comm.hpp"
+
+namespace libp {
+
+/*Static MPI_Init and MPI_Finalize*/
+void comm_t::Init(int &argc, char** &argv) { MPI_Init(&argc, &argv); }
+void comm_t::Finalize() { MPI_Finalize(); }
+
+/*Static handle to MPI_COMM_WORLD*/
+comm_t comm_t::world() {
+  comm_t c;
+  c.comm_ptr = std::make_shared<MPI_Comm>();
+  *(c.comm_ptr) = MPI_COMM_WORLD;
+  MPI_Comm_rank(c.comm(), &(c._rank));
+  MPI_Comm_size(c.comm(), &(c._size));
+  return c;
+}
+
+/*MPI_Comm_dup and free*/
+comm_t comm_t::Dup() const {
+  comm_t c;
+  /*Make a new comm shared_ptr, which will call MPI_Comm_free when destroyed*/
+  c.comm_ptr = std::shared_ptr<MPI_Comm>(new MPI_Comm,
+                                        [](MPI_Comm *comm) {
+                                          if (*comm != MPI_COMM_NULL)
+                                            MPI_Comm_free(comm);
+                                          delete comm;
+                                        });
+  MPI_Comm_dup(comm(), c.comm_ptr.get());
+  MPI_Comm_rank(c.comm(), &(c._rank));
+  MPI_Comm_size(c.comm(), &(c._size));
+  return c;
+}
+void comm_t::Free() {
+  comm_ptr = nullptr;
+  _rank=0;
+  _size=0;
+}
+/*Split*/
+comm_t comm_t::Split(const int color, const int key) const {
+  comm_t c;
+  /*Make a new comm shared_ptr, which will call MPI_Comm_free when destroyed*/
+  c.comm_ptr = std::shared_ptr<MPI_Comm>(new MPI_Comm,
+                                        [](MPI_Comm *comm) {
+                                          if (*comm != MPI_COMM_NULL)
+                                            MPI_Comm_free(comm);
+                                          delete comm;
+                                        });
+
+  MPI_Comm_split(comm(), color, key, c.comm_ptr.get());
+  MPI_Comm_rank(c.comm(), &(c._rank));
+  MPI_Comm_size(c.comm(), &(c._size));
+  return c;
+}
+
+/*Rank and size getters*/
+const int comm_t::rank() const {
+  return _rank;
+}
+const int comm_t::size() const {
+  return _size;
+}
+
+MPI_Comm comm_t::comm() const {
+  if (comm_ptr == nullptr) {
+    return MPI_COMM_NULL;
+  } else {
+    return *comm_ptr;
+  }
+}
+
+void comm_t::Wait(request_t &request) const {
+  MPI_Wait(&request, MPI_STATUS_IGNORE);
+}
+
+void comm_t::Waitall(const int count, memory<request_t> &requests) const {
+  MPI_Waitall(count, requests.ptr(), MPI_STATUSES_IGNORE);
+}
+
+void comm_t::Barrier() const {
+  MPI_Barrier(comm());
+}
+
+} //namespace libp

From 79783f9f534a761ed0098fbdf3bafcfabcb6f05a Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:09:58 -0600
Subject: [PATCH 03/12] New timer wrapper

---
 include/timer.hpp   | 56 ++++++++++++++++++++++++++++++++++++++++++
 libs/core/timer.cpp | 60 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 include/timer.hpp
 create mode 100644 libs/core/timer.cpp

diff --git a/include/timer.hpp b/include/timer.hpp
new file mode 100644
index 0000000..15a7067
--- /dev/null
+++ b/include/timer.hpp
@@ -0,0 +1,56 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#ifndef LIBP_TIMER_HPP
+#define LIBP_TIMER_HPP
+
+#include "core.hpp"
+#include "comm.hpp"
+#include "platform.hpp"
+#include <chrono>
+
+namespace libp {
+
+using timePoint_t = std::chrono::time_point<std::chrono::high_resolution_clock>;
+
+/* Host time*/
+timePoint_t Time();
+
+/* Host time after global sync*/
+timePoint_t GlobalTime(comm_t comm);
+
+/* Host time after platform sync*/
+timePoint_t PlatformTime(platform_t &platform);
+
+/* Host time after platform sync*/
+timePoint_t GlobalPlatformTime(platform_t &platform);
+
+/*Time between time points, in seconds*/
+double ElapsedTime(const timePoint_t start, const timePoint_t end);
+
+} //namespace libp
+
+#endif
diff --git a/libs/core/timer.cpp b/libs/core/timer.cpp
new file mode 100644
index 0000000..cb9cfaf
--- /dev/null
+++ b/libs/core/timer.cpp
@@ -0,0 +1,60 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "timer.hpp"
+
+namespace libp {
+
+/* Host time*/
+timePoint_t Time() {
+  return std::chrono::high_resolution_clock::now();
+}
+
+/* Host time after global sync*/
+timePoint_t GlobalTime(comm_t comm) {
+  comm.Barrier();
+  return Time();
+}
+
+/* Host time after platform sync*/
+timePoint_t PlatformTime(platform_t &platform) {
+  platform.finish();
+  return Time();
+}
+
+/* Host time after platform sync*/
+timePoint_t GlobalPlatformTime(platform_t &platform) {
+  platform.finish();
+  platform.comm.Barrier();
+  return Time();
+}
+
+/*Time between time points, in seconds*/
+double ElapsedTime(const timePoint_t start, const timePoint_t end) {
+  return std::chrono::duration_cast<std::chrono::microseconds>(end-start).count()/(1.0e6);
+}
+
+} //namespace libp

From b7af4f78c49bddfa144cca84faed6fc1c42ee1f0 Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:10:23 -0600
Subject: [PATCH 04/12] Better exception handling

---
 include/utils.hpp       | 104 +++++++++++++++++++++++++++-------------
 libs/core/exception.cpp |  90 ++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+), 32 deletions(-)
 create mode 100644 libs/core/exception.cpp

diff --git a/include/utils.hpp b/include/utils.hpp
index bd4c645..4fdc85b 100644
--- a/include/utils.hpp
+++ b/include/utils.hpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -27,48 +27,88 @@ SOFTWARE.
 #ifndef UTILS_HPP
 #define UTILS_HPP
 
-#include <occa.hpp>
-#include <mpi.h>
 #include <string>
+#include <cstring>
+#include <ostream>
+#include <iostream>
+#include <cstddef>
+#include <memory>
+#include <algorithm>
+#include <typeinfo>
+#include <cmath>
+#include <occa.hpp>
 #include "types.h"
 
 namespace libp {
 
+using properties_t = occa::json;
+using device_t = occa::device;
+using kernel_t = occa::kernel;
+using stream_t = occa::stream;
+
 //error codes
-#define HIPBONE_SUCCESS 0
-#define HIPBONE_ERROR -1
+#define LIBP_SUCCESS 0
+#define LIBP_ERROR -1
 
 #ifndef __PRETTY_FUNCTION__
 #  define __PRETTY_FUNCTION__ __FUNCTION__
 #endif
 
-#define HIPBONE_ABORT2(filename, function, line, message)              \
-  {                                                                 \
-    std::string banner = "---[ Error ]";                            \
-    std::cerr << '\n'                                               \
-       << std::string(74, '=') << '\n'                              \
-       << banner << std::string(74 - banner.size(), '-') << '\n'    \
-       << "    File     : " << filename << '\n'                     \
-       << "    Line     : " << line     << '\n'                     \
-       << "    Function : " << function << '\n'                     \
-       << "    Message  : " << message  << '\n'                     \
-       << std::string(74, '=') << '\n';                             \
-    MPI_Abort(MPI_COMM_WORLD,HIPBONE_ERROR);                           \
-  }
-#define HIPBONE_ABORT(message) HIPBONE_ABORT2(__FILE__, __PRETTY_FUNCTION__, __LINE__, message)
-
-#define HIPBONE_WARNING(message)                                       \
-  {                                                                 \
-    std::string banner = "---[ Warning ]";                          \
-    std::cerr << '\n'                                               \
-       << std::string(74, '=') << '\n'                              \
-       << banner << std::string(74 - banner.size(), '-') << '\n'    \
-       << "     " << message  << '\n'                               \
-       << std::string(74, '=') << '\n';                             \
-  }
-
-#define mymax(a,b) (((a)>(b))?(a):(b))
-#define mymin(a,b) (((a)<(b))?(a):(b))
+#define LIBP_TEMPLATE_CHECK(checkFunction, expr, filename, function, line, message) \
+  do {                                                                  \
+    const bool isErr = (bool) (expr);                                   \
+    if (isErr) {                                                        \
+      std::stringstream _check_ss;                                      \
+      _check_ss << message;                                             \
+      checkFunction(filename, function, line, _check_ss.str());         \
+    }                                                                   \
+  } while (false)
+
+#define LIBP_ABORT3(expr, filename, function, line, message) LIBP_TEMPLATE_CHECK(libp::abort, expr, filename, function, line, message)
+#define LIBP_ABORT2(expr, filename, function, line, message) LIBP_ABORT3(expr, filename, function, line, message)
+#define LIBP_ABORT(message, expr)                            LIBP_ABORT2(expr, __FILE__, __PRETTY_FUNCTION__, __LINE__, message)
+
+#define LIBP_WARNING3(expr, filename, function, line, message) LIBP_TEMPLATE_CHECK(libp::warn, expr, filename, function, line, message)
+#define LIBP_WARNING2(expr, filename, function, line, message) LIBP_WARNING3(expr, filename, function, line, message)
+#define LIBP_WARNING(message, expr)                            LIBP_WARNING2(expr, __FILE__, __PRETTY_FUNCTION__, __LINE__, message)
+
+#define LIBP_FORCE_ABORT(message)   LIBP_ABORT(message, true)
+#define LIBP_FORCE_WARNING(message) LIBP_WARNING(message, true)
+
+class exception : public std::exception {
+ public:
+  const std::string header;
+  const std::string filename;
+  const std::string function;
+  const std::string message;
+  const int line;
+
+  std::string exceptionMessage;
+
+  exception(const std::string &header_,
+            const std::string &filename_,
+            const std::string &function_,
+            const int line_,
+            const std::string &message_);
+  ~exception() throw();
+
+  const char* what() const throw();
+  std::string toString() const;
+  std::string location() const;
+};
+
+std::ostream& operator << (std::ostream& out,
+                           const exception &exc);
+
+void abort(const std::string &filename,
+           const std::string &function,
+           const int line,
+           const std::string &message);
+
+void warn(const std::string &filename,
+          const std::string &function,
+          const int line,
+          const std::string &message);
 
 } //namespace libp
 
diff --git a/libs/core/exception.cpp b/libs/core/exception.cpp
new file mode 100644
index 0000000..5fd60d8
--- /dev/null
+++ b/libs/core/exception.cpp
@@ -0,0 +1,90 @@
+/*
+
+The MIT License (MIT)
+
+Copyright (c) 2017-2021 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+*/
+
+#include "utils.hpp"
+
+namespace libp {
+
+
+exception::exception(const std::string &header_,
+                      const std::string &filename_,
+                      const std::string &function_,
+                      const int line_,
+                      const std::string &message_) :
+    header(header_),
+    filename(filename_),
+    function(function_),
+    message(message_),
+    line(line_),
+    exceptionMessage(toString()) {}
+
+exception::~exception() throw() {}
+
+const char* exception::what() const throw() {
+  return exceptionMessage.c_str();
+}
+
+std::string exception::toString() const {
+  std::stringstream ss;
+  std::string banner = "---[ " + header + " ]";
+  ss << '\n'
+     << banner << std::string(80 - banner.size(), '-') << '\n'
+     << location()
+     << "    Message  : " << message << '\n'
+     << std::string(80, '=') << '\n';
+  return ss.str();
+}
+
+std::string exception::location() const {
+  std::stringstream ss;
+  ss << "    File     : " << filename << '\n'
+     << "    Line     : " << line     << '\n'
+     << "    Function : " << function << '\n';
+  return ss.str();
+}
+
+std::ostream& operator << (std::ostream& out,
+                           const exception &exc) {
+  out << exc.toString() << std::flush;
+  return out;
+}
+
+void abort(const std::string &filename,
+           const std::string &function,
+           const int line,
+           const std::string &message) {
+  throw exception("Error", filename, function, line, message);
+}
+
+void warn(const std::string &filename,
+          const std::string &function,
+          const int line,
+          const std::string &message) {
+  exception exp("Warning", filename, function, line, message);
+  std::cout << exp;
+}
+
+} //namespace libp

From a3b2cf1b23ea211c07d59a450d82e2bc4134c406 Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:11:03 -0600
Subject: [PATCH 05/12] Latest ogs lib from libp

---
 include/ogs.hpp               | 190 +++++++-----
 include/ogs/ogsBase.hpp       |  25 +-
 include/ogs/ogsExchange.hpp   | 308 +++++++++++++-------
 include/ogs/ogsOperator.hpp   | 125 ++++----
 include/ogs/ogsUtils.hpp      |  26 +-
 libs/ogs/ogs.cpp              | 528 +++++++++++++++++++++++++---------
 libs/ogs/ogsAllToAll.cpp      | 210 +++++++-------
 libs/ogs/ogsAuto.cpp          | 145 +++++++---
 libs/ogs/ogsCrystalRouter.cpp | 295 +++++++++++--------
 libs/ogs/ogsHalo.cpp          | 363 ++++++++++++++++-------
 libs/ogs/ogsOperator.cpp      | 456 ++++++++++++++---------------
 libs/ogs/ogsPairwise.cpp      | 254 ++++++++--------
 libs/ogs/ogsSetup.cpp         | 208 +++++++-------
 libs/ogs/ogsUtils.cpp         |  61 +---
 14 files changed, 1918 insertions(+), 1276 deletions(-)
 mode change 100644 => 100755 include/ogs.hpp

diff --git a/include/ogs.hpp b/include/ogs.hpp
old mode 100644
new mode 100755
index e97010a..cddd8e6
--- a/include/ogs.hpp
+++ b/include/ogs.hpp
@@ -29,9 +29,9 @@ SOFTWARE.
 
   The code
 
-    MPI_Comm comm;
-  	dlong N;
-    hlong id[N];    // the hlong and dlong types are defined in "types.h"
+    comm_t comm;
+    dlong N;
+    memory<hlong> id(N);    // the hlong and dlong types are defined in "types.h"
     bool verbose;
     bool unique;
     ogs_t ogs(platform);
@@ -57,9 +57,9 @@ SOFTWARE.
 
   A basic gatherScatter operation is, e.g.,
 
-    occa::memory o_v;
+    deviceMemory<double> o_v;
     ...
-    ogs.GatherScatter(o_v, 1, ogs::Double, ogs::Add, ogs::Sym);
+    ogs.GatherScatter(o_v, 1, ogs::Add, ogs::Sym);
 
   This gs call has the effect,
 
@@ -74,14 +74,11 @@ SOFTWARE.
   Summation on doubles is not the only operation and datatype supported. Support
   includes the operations
     ogs::Add, ogs::Mul, ogs::Max, ogs::Min
-  and datatypes
-    ogs::Dfloat, ogs::Double, ogs::Float, ogs::Int32, ogs::Int64, ogs::Dlong, ogs::Hlong.
-  (The int32 and int64 types are the normal C++ types, whereas dfloat, dlong, and hlong
-   are defined in "types.h").
+  and datatypes: float, double, int, long long int.
 
   For the nonsymmetric behavior, the "Transpose" parameter is important:
 
-    ogs.GatherScatter(o_v, 1, ogs::Double, ogs::Add, ogs::[NoTrans/Trans/Sym]);
+    ogs.GatherScatter(o_v, 1, ogs::Add, ogs::[NoTrans/Trans/Sym]);
 
   When transpose == ogs::NoTrans, any "flagged" (p,i) pairs (id[i] negative on p)
   do not participate in the sum, but *do* still receive the sum on output.
@@ -102,7 +99,7 @@ SOFTWARE.
   consistent way. When all groups of (p,i) pairs have a single "unflagged"
   pair in this mannor, an additional nonsymmetric operation is available:
 
-    ogs.Gather(o_Gv, o_v, 1, ogs::Double, ogs::Add, ogs::Trans);
+    ogs.Gather(o_Gv, o_v, 1, ogs::Add, ogs::Trans);
 
   this has the effect of "assembling" the vector o_Gv. That is
 
@@ -115,7 +112,7 @@ SOFTWARE.
 
   The inverse of this operation is
 
-    ogs.Scatter(o_v, o_Gv, 1, ogs::Double, ogs::Add, ogs::Trans);
+    ogs.Scatter(o_v, o_Gv, 1, ogs::Add, ogs::Trans);
 
   which has the effect of scattering in the assembled entries in o_Gv back to the
   orginal ordering. When Transpose == ogs::Trans, "flagged" (p,i) pairs (id[i]
@@ -124,7 +121,7 @@ SOFTWARE.
 
   For operating on contiguously packed vectors, the K parameter is used, e.g.,
 
-    ogs.GatherScatter(o_v, 3, ogs::Double, ogs::Add, ogs::Sym);
+    ogs.GatherScatter(o_v, 3, ogs::Add, ogs::Sym);
 
   which is like "GatherScatter" operating on the datatype double[3],
   with summation here being vector summation. Number of messages sent
@@ -132,9 +129,9 @@ SOFTWARE.
 
   Asynchronous versions of the various GatherScatter functions are provided by
 
-    ogs.GatherScatterStart(o_v, k, ogs::Double, ogs::Add, ogs::Sym);
+    ogs.GatherScatterStart(o_v, k, ogs::Add, ogs::Sym);
     ...
-    ogs.GatherScatterFinish(o_v, k, ogs::Double, ogs::Add, ogs::Sym);
+    ogs.GatherScatterFinish(o_v, k, ogs::Add, ogs::Sym);
 
   MPI communication is not initiated in GatherScatterStart, rather some initial
   message packing and host<->device transfers are queued. The user can then queue
@@ -154,14 +151,14 @@ SOFTWARE.
 
     halo_t halo(platofrm);
     halo.Setup(N, ids, comm, ogs::Auto, verbose);
-    halo.Exchange(o_v, k, ogs::Double);
+    halo.Exchange(o_v, k);
 
   which has the effect of filling all "flagged" pairs (p,i) on all processes with
   the corresponding value from the unique "unflagged" pair in S_j.
 
   An additional untility operation available in the halo_t object is
 
-    halo.Combine(o_v, k, ogs::Double);
+    halo.Combine(o_v, k);
 
   which has the effect of summing the entries in S_j and writing the result to
   the sole "unflagged" pair in S_j.
@@ -222,94 +219,125 @@ class ogs_t : public ogsBase_t {
   ~ogs_t()=default;
 
   void Setup(const dlong _N,
-             hlong *ids,
-             MPI_Comm _comm,
+             memory<hlong> ids,
+             comm_t _comm,
              const Kind _kind,
              const Method method,
              const bool _unique,
              const bool verbose,
              platform_t& _platform);
 
-  void SetupGlobalToLocalMapping(dlong *GlobalToLocal);
+  void SetupGlobalToLocalMapping(memory<dlong> GlobalToLocal);
 
-  // host versions
-  void GatherScatter(void* v,
+  // Synchronous host versions
+  template<typename T>
+  void GatherScatter(memory<T> v,
                      const int k,
-                     const Type type,
                      const Op op,
                      const Transpose trans);
+  // Asynchronous host buffer versions
+  template<typename T>
+  void GatherScatterStart (memory<T> v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans);
+  template<typename T>
+  void GatherScatterFinish(memory<T> v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans);
   // Synchronous device buffer versions
-  void GatherScatter(occa::memory&  o_v,
+  template<typename T>
+  void GatherScatter(deviceMemory<T> o_v,
                      const int k,
-                     const Type type,
                      const Op op,
                      const Transpose trans);
   // Asynchronous device buffer versions
-  void GatherScatterStart (occa::memory&  o_v,
+  template<typename T>
+  void GatherScatterStart (deviceMemory<T> o_v,
                            const int k,
-                           const Type type,
                            const Op op,
                            const Transpose trans);
-  void GatherScatterFinish(occa::memory&  o_v,
+  template<typename T>
+  void GatherScatterFinish(deviceMemory<T> o_v,
                            const int k,
-                           const Type type,
                            const Op op,
                            const Transpose trans);
 
-  // host versions
-  void Gather(void* gv,
-              const void* v,
+  // Synchronous host versions
+  template<typename T>
+  void Gather(memory<T> gv,
+              const memory<T> v,
               const int k,
-              const Type type,
               const Op op,
               const Transpose trans);
+  // Asynchronous host buffer versions
+  template<typename T>
+  void GatherStart (memory<T> gv,
+                    const memory<T> v,
+                    const int k,
+                    const Op op,
+                    const Transpose trans);
+  template<typename T>
+  void GatherFinish(memory<T> gv,
+                    const memory<T> v,
+                    const int k,
+                    const Op op,
+                    const Transpose trans);
   // Synchronous device buffer versions
-  void Gather(occa::memory&  o_gv,
-              occa::memory&  o_v,
+  template<typename T>
+  void Gather(deviceMemory<T> o_gv,
+              deviceMemory<T> o_v,
               const int k,
-              const Type type,
               const Op op,
               const Transpose trans);
   // Asynchronous device buffer versions
-  void GatherStart (occa::memory&  o_gv,
-                    occa::memory&  o_v,
+  template<typename T>
+  void GatherStart (deviceMemory<T> o_gv,
+                    deviceMemory<T> o_v,
                     const int k,
-                    const Type type,
                     const Op op,
                     const Transpose trans);
-  void GatherFinish(occa::memory&  o_gv,
-                    occa::memory&  o_v,
+  template<typename T>
+  void GatherFinish(deviceMemory<T> o_gv,
+                    deviceMemory<T> o_v,
                     const int k,
-                    const Type type,
                     const Op op,
                     const Transpose trans);
 
-  // host versions
-  void Scatter(void* v,
-               const void* gv,
+  // Synchronous host versions
+  template<typename T>
+  void Scatter(memory<T> v,
+               const memory<T> gv,
                const int k,
-               const Type type,
-               const Op op,
                const Transpose trans);
+  // Asynchronous host buffer versions
+  template<typename T>
+  void ScatterStart (memory<T> v,
+                     const memory<T> gv,
+                     const int k,
+                     const Transpose trans);
+  template<typename T>
+  void ScatterFinish(memory<T> v,
+                     memory<T> gv,
+                     const int k,
+                     const Transpose trans);
   // Synchronous device buffer versions
-  void Scatter(occa::memory&  o_v,
-               occa::memory&  o_gv,
+  template<typename T>
+  void Scatter(deviceMemory<T> o_v,
+               deviceMemory<T> o_gv,
                const int k,
-               const Type type,
-               const Op op,
                const Transpose trans);
   // Asynchronous device buffer versions
-  void ScatterStart (occa::memory&  o_v,
-                     occa::memory&  o_gv,
+  template<typename T>
+  void ScatterStart (deviceMemory<T> o_v,
+                     deviceMemory<T> o_gv,
                      const int k,
-                     const Type type,
-                     const Op op,
                      const Transpose trans);
-  void ScatterFinish(occa::memory&  o_v,
-                     occa::memory&  o_gv,
+  template<typename T>
+  void ScatterFinish(deviceMemory<T> o_v,
+                     deviceMemory<T> o_gv,
                      const int k,
-                     const Type type,
-                     const Op op,
                      const Transpose trans);
 
   friend class halo_t;
@@ -325,29 +353,47 @@ class halo_t : public ogsBase_t {
   dlong Nhalo=0;
 
   void Setup(const dlong _N,
-             hlong *ids,
-             MPI_Comm _comm,
+             memory<hlong> ids,
+             comm_t _comm,
              const Method method,
              const bool verbose,
              platform_t& _platform);
 
   void SetupFromGather(ogs_t& ogs);
 
-  // Host version
-  void Exchange(void  *v, const int k, const Type type);
+  // Synchronous Host version
+  template<typename T>
+  void Exchange(memory<T> v, const int k);
+  // Asynchronous host version
+  template<typename T>
+  void ExchangeStart (memory<T> v, const int k);
+  template<typename T>
+  void ExchangeFinish(memory<T> v, const int k);
   // Synchronous device buffer version
-  void Exchange(occa::memory &o_v, const int k, const Type type);
+  template<typename T>
+  void Exchange(deviceMemory<T> o_v, const int k);
   // Asynchronous device buffer version
-  void ExchangeStart (occa::memory &o_v, const int k, const Type type);
-  void ExchangeFinish(occa::memory &o_v, const int k, const Type type);
-
-  // Host version
-  void Combine(void  *v, const int k, const Type type);
+  template<typename T>
+  void ExchangeStart (deviceMemory<T> o_v, const int k);
+  template<typename T>
+  void ExchangeFinish(deviceMemory<T> o_v, const int k);
+
+  // Synchronous Host version
+  template<typename T>
+  void Combine(memory<T> v, const int k);
+  // Asynchronous host version
+  template<typename T>
+  void CombineStart (memory<T> v, const int k);
+  template<typename T>
+  void CombineFinish(memory<T> v, const int k);
   // Synchronous device buffer version
-  void Combine(occa::memory &o_v, const int k, const Type type);
+  template<typename T>
+  void Combine(deviceMemory<T> o_v, const int k);
   // Asynchronous device buffer version
-  void CombineStart (occa::memory &o_v, const int k, const Type type);
-  void CombineFinish(occa::memory &o_v, const int k, const Type type);
+  template<typename T>
+  void CombineStart (deviceMemory<T> o_v, const int k);
+  template<typename T>
+  void CombineFinish(deviceMemory<T> o_v, const int k);
 };
 
 } //namespace ogs
diff --git a/include/ogs/ogsBase.hpp b/include/ogs/ogsBase.hpp
index 9c42be4..4962578 100644
--- a/include/ogs/ogsBase.hpp
+++ b/include/ogs/ogsBase.hpp
@@ -45,7 +45,7 @@ class halo_t;
 class ogsBase_t {
 public:
   platform_t platform;
-  MPI_Comm comm;
+  comm_t comm;
 
   dlong         N=0;
   dlong         Ngather=0;        //  total number of local positive gather nodes
@@ -61,13 +61,14 @@ class ogsBase_t {
   bool unique=false;
   bool gather_defined=false;
 
+  static stream_t dataStream;
+
   ogsBase_t()=default;
-  ogsBase_t(platform_t& _platform);
   virtual ~ogsBase_t()=default;
 
   virtual void Setup(const dlong _N,
-                      hlong *ids,
-                      MPI_Comm _comm,
+                      memory<hlong> ids,
+                      comm_t _comm,
                       const Kind _kind,
                       const Method method,
                       const bool _unique,
@@ -84,22 +85,22 @@ class ogsBase_t {
 
 private:
   void FindSharedNodes(const dlong Nids,
-                       libp::memory<parallelNode_t> &nodes,
+                       memory<parallelNode_t> &nodes,
                        const int verbose);
 
   void ConstructSharedNodes(const dlong Nids,
-                           libp::memory<parallelNode_t> &nodes,
+                           memory<parallelNode_t> &nodes,
                            dlong &Nshared,
-                           libp::memory<parallelNode_t> &sharedNodes);
+                           memory<parallelNode_t> &sharedNodes);
 
-  void LocalSignedSetup(const dlong Nids, libp::memory<parallelNode_t> &nodes);
-  void LocalUnsignedSetup(const dlong Nids, libp::memory<parallelNode_t> &nodes);
-  void LocalHaloSetup(const dlong Nids, libp::memory<parallelNode_t> &nodes);
+  void LocalSignedSetup(const dlong Nids, memory<parallelNode_t> &nodes);
+  void LocalUnsignedSetup(const dlong Nids, memory<parallelNode_t> &nodes);
+  void LocalHaloSetup(const dlong Nids, memory<parallelNode_t> &nodes);
 
   ogsExchange_t* AutoSetup(dlong Nshared,
-                           libp::memory<parallelNode_t> &sharedNodes,
+                           memory<parallelNode_t> &sharedNodes,
                            ogsOperator_t& gatherHalo,
-                           MPI_Comm _comm,
+                           comm_t _comm,
                            platform_t &_platform,
                            const int verbose);
 };
diff --git a/include/ogs/ogsExchange.hpp b/include/ogs/ogsExchange.hpp
index f5bd9c6..4f80d7c 100644
--- a/include/ogs/ogsExchange.hpp
+++ b/include/ogs/ogsExchange.hpp
@@ -38,16 +38,16 @@ namespace ogs {
 class ogsExchange_t {
 public:
   platform_t platform;
-  MPI_Comm comm;
+  comm_t comm;
   int rank, size;
 
   dlong Nhalo, NhaloP;
 
-  char* haloBuf;
-  occa::memory o_haloBuf, h_haloBuf;
+  pinnedMemory<char> h_workspace, h_sendspace;
+  deviceMemory<char> o_workspace, o_sendspace;
 
-  static occa::stream dataStream;
-  static occa::kernel extractKernel[4];
+  stream_t dataStream;
+  static kernel_t extractKernel[4];
 
 #ifdef GPU_AWARE_MPI
   bool gpu_aware=true;
@@ -55,27 +55,33 @@ class ogsExchange_t {
   bool gpu_aware=false;
 #endif
 
-  ogsExchange_t(platform_t &_platform, MPI_Comm _comm):
+  ogsExchange_t(platform_t &_platform, comm_t _comm,
+                stream_t _datastream):
     platform(_platform),
-    comm(_comm) {
-    MPI_Comm_rank(comm, &rank);
-    MPI_Comm_size(comm, &size);
-
-    if (!dataStream.isInitialized())
-      dataStream = platform.device.createStream();
+    comm(_comm),
+    dataStream(_datastream) {
+    rank = comm.rank();
+    size = comm.size();
   }
   virtual ~ogsExchange_t() {}
 
-  virtual void Start(const int k,
-                     const Type type,
-                     const Op op,
-                     const Transpose trans,
-                     const bool host=false)=0;
-  virtual void Finish(const int k,
-                      const Type type,
-                      const Op op,
-                      const Transpose trans,
-                      const bool host=false)=0;
+  virtual void Start(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans)=0;
+
+  virtual void Start(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Start(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans)=0;
+  virtual void Finish(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans)=0;
 
   virtual void AllocBuffer(size_t Nbytes)=0;
 
@@ -87,46 +93,76 @@ class ogsAllToAll_t: public ogsExchange_t {
 private:
 
   dlong NsendN=0, NsendT=0;
-  libp::memory<dlong> sendIdsN, sendIdsT;
-  occa::memory o_sendIdsN, o_sendIdsT;
+  memory<dlong> sendIdsN, sendIdsT;
+  deviceMemory<dlong> o_sendIdsN, o_sendIdsT;
 
   ogsOperator_t postmpi;
 
-  char* sendBuf;
-  occa::memory o_sendBuf;
-  occa::memory h_sendBuf;
+  memory<int> mpiSendCountsN;
+  memory<int> mpiSendCountsT;
+  memory<int> mpiRecvCountsN;
+  memory<int> mpiRecvCountsT;
+  memory<int> mpiSendOffsetsN;
+  memory<int> mpiSendOffsetsT;
+  memory<int> mpiRecvOffsetsN;
+  memory<int> mpiRecvOffsetsT;
 
-  libp::memory<int> mpiSendCountsN;
-  libp::memory<int> mpiSendCountsT;
-  libp::memory<int> mpiRecvCountsN;
-  libp::memory<int> mpiRecvCountsT;
-  libp::memory<int> mpiSendOffsetsN;
-  libp::memory<int> mpiSendOffsetsT;
-  libp::memory<int> mpiRecvOffsetsN;
-  libp::memory<int> mpiRecvOffsetsT;
+  memory<int> sendCounts;
+  memory<int> recvCounts;
+  memory<int> sendOffsets;
+  memory<int> recvOffsets;
 
-  libp::memory<int> sendCounts;
-  libp::memory<int> recvCounts;
-  libp::memory<int> sendOffsets;
-  libp::memory<int> recvOffsets;
+  comm_t::request_t request;
 
 public:
   ogsAllToAll_t(dlong Nshared,
-               libp::memory<parallelNode_t> &sharedNodes,
+               memory<parallelNode_t> &sharedNodes,
                ogsOperator_t &gatherHalo,
-               MPI_Comm _comm,
+               stream_t _dataStream,
+               comm_t _comm,
                platform_t &_platform);
 
-  virtual void Start(const int k,
-                     const Type type,
-                     const Op op,
-                     const Transpose trans,
-                     const bool host=false);
-  virtual void Finish(const int k,
-                      const Type type,
-                      const Op op,
-                      const Transpose trans,
-                      const bool host=false);
+  template<typename T>
+  void Start(pinnedMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(pinnedMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+
+  template<typename T>
+  void Start(deviceMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(deviceMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
 
   virtual void AllocBuffer(size_t Nbytes);
 
@@ -137,49 +173,76 @@ class ogsPairwise_t: public ogsExchange_t {
 private:
 
   dlong NsendN=0, NsendT=0;
-  libp::memory<dlong> sendIdsN, sendIdsT;
-  occa::memory o_sendIdsN, o_sendIdsT;
+  memory<dlong> sendIdsN, sendIdsT;
+  deviceMemory<dlong> o_sendIdsN, o_sendIdsT;
 
   ogsOperator_t postmpi;
 
-  char* sendBuf;
-  occa::memory o_sendBuf;
-  occa::memory h_sendBuf;
-
   int NranksSendN=0, NranksRecvN=0;
   int NranksSendT=0, NranksRecvT=0;
-  libp::memory<int> sendRanksN;
-  libp::memory<int> sendRanksT;
-  libp::memory<int> recvRanksN;
-  libp::memory<int> recvRanksT;
-  libp::memory<int> sendCountsN;
-  libp::memory<int> sendCountsT;
-  libp::memory<int> recvCountsN;
-  libp::memory<int> recvCountsT;
-  libp::memory<int> sendOffsetsN;
-  libp::memory<int> sendOffsetsT;
-  libp::memory<int> recvOffsetsN;
-  libp::memory<int> recvOffsetsT;
-  libp::memory<MPI_Request> requests;
-  libp::memory<MPI_Status> statuses;
+  memory<int> sendRanksN;
+  memory<int> sendRanksT;
+  memory<int> recvRanksN;
+  memory<int> recvRanksT;
+  memory<int> sendCountsN;
+  memory<int> sendCountsT;
+  memory<int> recvCountsN;
+  memory<int> recvCountsT;
+  memory<int> sendOffsetsN;
+  memory<int> sendOffsetsT;
+  memory<int> recvOffsetsN;
+  memory<int> recvOffsetsT;
+  memory<comm_t::request_t> requests;
 
 public:
   ogsPairwise_t(dlong Nshared,
-               libp::memory<parallelNode_t> &sharedNodes,
+               memory<parallelNode_t> &sharedNodes,
                ogsOperator_t &gatherHalo,
-               MPI_Comm _comm,
+               stream_t _dataStream,
+               comm_t _comm,
                platform_t &_platform);
 
-  virtual void Start(const int k,
-                     const Type type,
-                     const Op op,
-                     const Transpose trans,
-                     const bool host=false);
-  virtual void Finish(const int k,
-                      const Type type,
-                      const Op op,
-                      const Transpose trans,
-                      const bool host=false);
+  template<typename T>
+  void Start(pinnedMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(pinnedMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+
+  template<typename T>
+  void Start(deviceMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(deviceMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
 
   virtual void AllocBuffer(size_t Nbytes);
 };
@@ -195,48 +258,73 @@ class ogsCrystalRouter_t: public ogsExchange_t {
     int Nsend, Nrecv0, Nrecv1;
     dlong recvOffset;
 
-    libp::memory<dlong> sendIds;
-    occa::memory o_sendIds;
+    memory<dlong> sendIds;
+    deviceMemory<dlong> o_sendIds;
 
     ogsOperator_t gather;
   };
 
-  int buf_id=0;
-  occa::memory o_buf[2];
-  occa::memory h_buf[2];
-  char* buf[2];
+  int buf_id=0, hbuf_id=0;
+  pinnedMemory<char> h_work[2];
+  deviceMemory<char> o_work[2];
 
-  MPI_Request request[3];
-  MPI_Status status[3];
+  memory<comm_t::request_t> request;
 
   int Nlevels=0;
-  libp::memory<crLevel> levelsN;
-  libp::memory<crLevel> levelsT;
+  memory<crLevel> levelsN;
+  memory<crLevel> levelsT;
 
   int NsendMax=0, NrecvMax=0;
-  char* sendBuf;
-  char* recvBuf;
-  occa::memory o_sendBuf;
-  occa::memory h_sendBuf;
-  occa::memory o_recvBuf;
 
 public:
   ogsCrystalRouter_t(dlong Nshared,
-                   libp::memory<parallelNode_t> &sharedNodes,
+                   memory<parallelNode_t> &sharedNodes,
                    ogsOperator_t &gatherHalo,
-                   MPI_Comm _comm,
+                   stream_t _dataStream,
+                   comm_t _comm,
                    platform_t &_platform);
 
-  virtual void Start(const int k,
-                     const Type type,
-                     const Op op,
-                     const Transpose trans,
-                     const bool host=false);
-  virtual void Finish(const int k,
-                      const Type type,
-                      const Op op,
-                      const Transpose trans,
-                      const bool host=false);
+  template<typename T>
+  void Start(pinnedMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(pinnedMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(pinnedMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+
+  template<typename T>
+  void Start(deviceMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  template<typename T>
+  void Finish(deviceMemory<T> &buf,
+                const int k,
+                const Op op,
+                const Transpose trans);
+
+  virtual void Start(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Start(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<float> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<double> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<int> &buf,const int k,const Op op,const Transpose trans);
+  virtual void Finish(deviceMemory<long long int> &buf,const int k,const Op op,const Transpose trans);
 
   virtual void AllocBuffer(size_t Nbytes);
 };
diff --git a/include/ogs/ogsOperator.hpp b/include/ogs/ogsOperator.hpp
index 601b5e1..bd33c6f 100644
--- a/include/ogs/ogsOperator.hpp
+++ b/include/ogs/ogsOperator.hpp
@@ -46,22 +46,22 @@ class ogsOperator_t {
   dlong nnzN=0;
   dlong nnzT=0;
 
-  libp::memory<dlong> rowStartsN;
-  libp::memory<dlong> rowStartsT;
-  libp::memory<dlong> colIdsN;
-  libp::memory<dlong> colIdsT;
+  memory<dlong> rowStartsN;
+  memory<dlong> rowStartsT;
+  memory<dlong> colIdsN;
+  memory<dlong> colIdsT;
 
-  occa::memory o_rowStartsN;
-  occa::memory o_rowStartsT;
-  occa::memory o_colIdsN;
-  occa::memory o_colIdsT;
+  deviceMemory<dlong> o_rowStartsN;
+  deviceMemory<dlong> o_rowStartsT;
+  deviceMemory<dlong> o_colIdsN;
+  deviceMemory<dlong> o_colIdsT;
 
   dlong NrowBlocksN=0;
   dlong NrowBlocksT=0;
-  libp::memory<dlong> blockRowStartsN;
-  libp::memory<dlong> blockRowStartsT;
-  occa::memory o_blockRowStartsN;
-  occa::memory o_blockRowStartsT;
+  memory<dlong> blockRowStartsN;
+  memory<dlong> blockRowStartsT;
+  deviceMemory<dlong> o_blockRowStartsN;
+  deviceMemory<dlong> o_blockRowStartsT;
 
   Kind kind;
 
@@ -74,55 +74,49 @@ class ogsOperator_t {
   void setupRowBlocks();
 
   //Apply Z operator
-  void Gather(occa::memory&  o_gv,
-              occa::memory&  o_v,
-              const int k,
-              const Type type,
-              const Op op,
-              const Transpose trans);
-  void Gather(void* gv,
-              const void* v,
-              const int k,
-              const Type type,
-              const Op op,
-              const Transpose trans);
+  template<template<typename> class U,
+           template<typename> class V,
+           typename T>
+  void Gather(U<T> gv, const V<T> v,
+              const int k, const Op op, const Transpose trans);
+
+  template<typename T>
+  void Gather(deviceMemory<T> gv, const deviceMemory<T> v,
+              const int k, const Op op, const Transpose trans);
 
   //Apply Z^T transpose operator
-  void Scatter(occa::memory&  o_v,
-               occa::memory&  o_gv,
-               const int k,
-               const Type type,
-               const Op op,
-               const Transpose trans);
-  void Scatter(void* v,
-               const void* gv,
-               const int k,
-               const Type type,
-               const Op op,
-               const Transpose trans);
+  template<template<typename> class U,
+           template<typename> class V,
+           typename T>
+  void Scatter(U<T> v, const V<T> gv,
+               const int k, const Transpose trans);
+
+  template<typename T>
+  void Scatter(deviceMemory<T> v, const deviceMemory<T> gv,
+               const int k, const Transpose trans);
 
   //Apply Z^T*Z operator
-  void GatherScatter(occa::memory&  o_v,
-                     const int k,
-                     const Type type,
-                     const Op op,
-                     const Transpose trans);
-  void GatherScatter(void* v,
-                     const int k,
-                     const Type type,
-                     const Op op,
-                     const Transpose trans);
+  template<template<typename> class U,
+           typename T>
+  void GatherScatter(U<T> v, const int k,
+                     const Op op, const Transpose trans);
+
+  template<typename T>
+  void GatherScatter(deviceMemory<T> v, const int k,
+                     const Op op, const Transpose trans);
 
 private:
-  template <typename T, template<typename> class Op>
-  void Gather(T* gv, const T* v,
-              const int K, const Transpose trans);
-  template <typename T>
-  void Scatter(T* v, const T* gv,
-              const int K, const Transpose trans);
-  template <typename T, template<typename> class Op>
-  void GatherScatter(T* v,
+  template <template<typename> class U,
+            template<typename> class V,
+            template<typename> class Op,
+            typename T>
+  void Gather(U<T> gv, const V<T> v,
               const int K, const Transpose trans);
+  template <template<typename> class U,
+            template<typename> class Op,
+            typename T>
+  void GatherScatter(U<T> v, const int K,
+                     const Transpose trans);
 
   //NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU
   static constexpr int blockSize = 256;
@@ -130,26 +124,21 @@ class ogsOperator_t {
 
   //4 types - Float, Double, Int32, Int64
   //4 ops - Add, Mul, Max, Min
-  static occa::kernel gatherScatterKernel[4][4];
-  static occa::kernel gatherKernel[4][4];
-  static occa::kernel scatterKernel[4];
+  static kernel_t gatherScatterKernel[4][4];
+  static kernel_t gatherKernel[4][4];
+  static kernel_t scatterKernel[4];
 
   friend void InitializeKernels(platform_t& platform, const Type type, const Op op);
 };
 
-template<typename T>
-void extract(const dlong N,
-             const int K,
-             const dlong *ids,
-             const T *q,
-             T *gatherq);
-
+template <template<typename> class U,
+          template<typename> class V,
+          typename T>
 void extract(const dlong N,
              const int K,
-             const Type type,
-             const dlong *ids,
-             const void *q,
-             void *gatherq);
+             const memory<dlong> ids,
+             const U<T> q,
+             V<T> gatherq);
 
 } //namespace ogs
 
diff --git a/include/ogs/ogsUtils.hpp b/include/ogs/ogsUtils.hpp
index aba29f9..b25d649 100644
--- a/include/ogs/ogsUtils.hpp
+++ b/include/ogs/ogsUtils.hpp
@@ -33,11 +33,6 @@ namespace libp {
 
 namespace ogs {
 
-extern MPI_Datatype MPI_PARALLELNODE_T;
-
-void InitMPIType();
-void DestroyMPIType();
-
 struct parallelNode_t{
 
   dlong localId;    // local node id
@@ -51,13 +46,28 @@ struct parallelNode_t{
 
 };
 
-size_t Sizeof(const Type type);
-MPI_Datatype MPI_Type(const Type type);
+template<typename T>
+struct ogsType {
+  static constexpr Type get();
+};
+
+template<> struct ogsType<float> {
+  static constexpr Type get() { return Float; }
+};
+template<> struct ogsType<double> {
+  static constexpr Type get() { return Double; }
+};
+template<> struct ogsType<int> {
+  static constexpr Type get() { return Int32; }
+};
+template<> struct ogsType<long long int> {
+  static constexpr Type get() { return Int64; }
+};
 
 //permute an array A, according to the ordering returned by P
 // i.e. for all n, A[P(n)] <- A[n]
 template<typename T, class Order>
-void permute(const dlong N, libp::memory<T> A, Order P) {
+void permute(const dlong N, memory<T> A, Order P) {
 
   for(dlong n=0;n<N;++n) {
     //get what index A[n] should move to
diff --git a/libs/ogs/ogs.cpp b/libs/ogs/ogs.cpp
index 78ab066..6895748 100644
--- a/libs/ogs/ogs.cpp
+++ b/libs/ogs/ogs.cpp
@@ -34,276 +34,544 @@ namespace libp {
 namespace ogs {
 
 /********************************
- * Scalar GatherScatter
+ * Device GatherScatter
  ********************************/
-void ogs_t::GatherScatter(occa::memory&  o_v,
+template<typename T>
+void ogs_t::GatherScatter(deviceMemory<T> o_v,
                           const int k,
-                          const Type type,
                           const Op op,
                           const Transpose trans){
-  GatherScatterStart (o_v, k, type, op, trans);
-  GatherScatterFinish(o_v, k, type, op, trans);
+  GatherScatterStart (o_v, k, op, trans);
+  GatherScatterFinish(o_v, k, op, trans);
 }
 
-void ogs_t::GatherScatterStart(occa::memory& o_v,
+template<typename T>
+void ogs_t::GatherScatterStart(deviceMemory<T> o_v,
                                const int k,
-                               const Type type,
                                const Op op,
                                const Transpose trans){
-  exchange->AllocBuffer(k*Sizeof(type));
+  exchange->AllocBuffer(k*sizeof(T));
+
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
 
   //collect halo buffer
-  gatherHalo->Gather(exchange->o_haloBuf, o_v,
-                     k, type, op, trans);
+  gatherHalo->Gather(o_haloBuf, o_v, k, op, trans);
 
-  //prepare MPI exchange
-  exchange->Start(k, type, op, trans);
+  if (exchange->gpu_aware) {
+    //prepare MPI exchange
+    exchange->Start(o_haloBuf, k, op, trans);
+  } else {
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //if not using gpu-aware mpi move the halo buffer to the host
+    const dlong Nhalo = (trans == NoTrans) ? NhaloP : NhaloT;
+
+    //wait for o_haloBuf to be ready
+    device.finish();
+
+    //queue copy to host
+    device.setStream(dataStream);
+    haloBuf.copyFrom(o_haloBuf, Nhalo*k,
+                     0, "async: true");
+    device.setStream(currentStream);
+  }
 }
 
-void ogs_t::GatherScatterFinish(occa::memory& o_v,
+template<typename T>
+void ogs_t::GatherScatterFinish(deviceMemory<T> o_v,
                                 const int k,
-                                const Type type,
                                 const Op op,
                                 const Transpose trans){
 
   //queue local gs operation
-  gatherLocal->GatherScatter(o_v, k, type, op, trans);
+  gatherLocal->GatherScatter(o_v, k, op, trans);
 
-  //finish MPI exchange
-  exchange->Finish(k, type, op, trans);
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  if (exchange->gpu_aware) {
+    //finish MPI exchange
+    exchange->Finish(o_haloBuf, k, op, trans);
+  } else {
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //synchronize data stream to ensure the buffer is on the host
+    device.setStream(dataStream);
+    device.finish();
+
+    /*MPI exchange of host buffer*/
+    exchange->Start (haloBuf, k, op, trans);
+    exchange->Finish(haloBuf, k, op, trans);
+
+    // copy recv back to device
+    const dlong Nhalo = (trans == Trans) ? NhaloP : NhaloT;
+    haloBuf.copyTo(o_haloBuf, Nhalo*k,
+                   0, "async: true");
+    device.finish(); //wait for transfer to finish
+    device.setStream(currentStream);
+  }
 
   //write exchanged halo buffer back to vector
-  gatherHalo->Scatter(o_v, exchange->o_haloBuf,
-                      k, type, op, trans);
+  gatherHalo->Scatter(o_v, o_haloBuf, k, trans);
 }
 
-void ogs_t::GatherScatter(void* v,
+template
+void ogs_t::GatherScatter(deviceMemory<float> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(deviceMemory<double> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(deviceMemory<int> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(deviceMemory<long long int> v, const int k,
+                          const Op op, const Transpose trans);
+
+/********************************
+ * Host GatherScatter
+ ********************************/
+template<typename T>
+void ogs_t::GatherScatter(memory<T> v,
                           const int k,
-                          const Type type,
                           const Op op,
                           const Transpose trans){
-  exchange->AllocBuffer(k*Sizeof(type));
+  GatherScatterStart (v, k, op, trans);
+  GatherScatterFinish(v, k, op, trans);
+}
+
+template<typename T>
+void ogs_t::GatherScatterStart(memory<T> v,
+                               const int k,
+                               const Op op,
+                               const Transpose trans){
+  exchange->AllocBuffer(k*sizeof(T));
+
+  /*Cast workspace to type T*/
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
 
   //collect halo buffer
-  gatherHalo->Gather(exchange->haloBuf, v,
-                     k, type, op, trans);
+  gatherHalo->Gather(haloBuf, v, k, op, trans);
 
   //prepare MPI exchange
-  exchange->Start(k, type, op, trans, true);
+  exchange->Start(haloBuf, k, op, trans);
+}
+
+template<typename T>
+void ogs_t::GatherScatterFinish(memory<T> v,
+                                const int k,
+                                const Op op,
+                                const Transpose trans){
+
+  /*Cast workspace to type T*/
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
 
   //queue local gs operation
-  gatherLocal->GatherScatter(v, k, type, op, trans);
+  gatherLocal->GatherScatter(v, k, op, trans);
 
   //finish MPI exchange
-  exchange->Finish(k, type, op, trans, true);
+  exchange->Finish(haloBuf, k, op, trans);
 
   //write exchanged halo buffer back to vector
-  gatherHalo->Scatter(v, exchange->haloBuf,
-                      k, type, op, trans);
+  gatherHalo->Scatter(v, haloBuf, k, trans);
 }
 
+template
+void ogs_t::GatherScatter(memory<float> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(memory<double> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(memory<int> v, const int k,
+                          const Op op, const Transpose trans);
+template
+void ogs_t::GatherScatter(memory<long long int> v, const int k,
+                          const Op op, const Transpose trans);
+
 /********************************
- * Scalar Gather
+ * Device Gather
  ********************************/
-void ogs_t::Gather(occa::memory&  o_gv,
-                   occa::memory&  o_v,
+template<typename T>
+void ogs_t::Gather(deviceMemory<T> o_gv,
+                   deviceMemory<T> o_v,
                    const int k,
-                   const Type type,
                    const Op op,
                    const Transpose trans){
-  GatherStart (o_gv, o_v, k, type, op, trans);
-  GatherFinish(o_gv, o_v, k, type, op, trans);
+  GatherStart (o_gv, o_v, k, op, trans);
+  GatherFinish(o_gv, o_v, k, op, trans);
 }
 
-void ogs_t::GatherStart(occa::memory&  o_gv,
-                        occa::memory&  o_v,
+template<typename T>
+void ogs_t::GatherStart(deviceMemory<T> o_gv,
+                        deviceMemory<T> o_v,
                         const int k,
-                        const Type type,
                         const Op op,
                         const Transpose trans){
   AssertGatherDefined();
 
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
   if (trans==Trans) { //if trans!=ogs::Trans theres no comms required
-    exchange->AllocBuffer(k*Sizeof(type));
+    exchange->AllocBuffer(k*sizeof(T));
 
     //collect halo buffer
-    gatherHalo->Gather(exchange->o_haloBuf, o_v,
-                       k, type, op, Trans);
-
-    //prepare MPI exchange
-    exchange->Start(k, type, op, Trans);
+    gatherHalo->Gather(o_haloBuf, o_v, k, op, Trans);
+
+    if (exchange->gpu_aware) {
+      //prepare MPI exchange
+      exchange->Start(o_haloBuf, k, op, Trans);
+    } else {
+      //get current stream
+      device_t &device = platform.device;
+      stream_t currentStream = device.getStream();
+
+      //if not using gpu-aware mpi move the halo buffer to the host
+      pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+      //wait for o_haloBuf to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_haloBuf, NhaloT*k,
+                       0, "async: true");
+      device.setStream(currentStream);
+    }
   } else {
     //gather halo
-    occa::memory o_gvHalo = o_gv + k*NlocalT*Sizeof(type);
-    gatherHalo->Gather(o_gvHalo, o_v,
-                       k, type, op, trans);
+    gatherHalo->Gather(o_gv + k*NlocalT, o_v, k, op, trans);
   }
 }
 
-void ogs_t::GatherFinish(occa::memory&  o_gv,
-                         occa::memory&  o_v,
+template<typename T>
+void ogs_t::GatherFinish(deviceMemory<T> o_gv,
+                         deviceMemory<T> o_v,
                          const int k,
-                         const Type type,
                          const Op op,
                          const Transpose trans){
   AssertGatherDefined();
 
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
   //queue local g operation
-  gatherLocal->Gather(o_gv, o_v,
-                      k, type, op, trans);
+  gatherLocal->Gather(o_gv, o_v, k, op, trans);
 
   if (trans==Trans) { //if trans!=ogs::Trans theres no comms required
-    //finish MPI exchange
-    exchange->Finish(k, type, op, Trans);
-
-    //put the result at the end of o_gv
-    if (NhaloP)
-      exchange->o_haloBuf.copyTo(o_gv + k*NlocalT*Sizeof(type),
-                                 k*NhaloP*Sizeof(type), 0, 0, "async: true");
+    if (exchange->gpu_aware) {
+      //finish MPI exchange
+      exchange->Finish(o_haloBuf, k, op, Trans);
+
+      //put the result at the end of o_gv
+      o_haloBuf.copyTo(o_gv + k*NlocalT,
+                       k*NhaloP, 0, "async: true");
+    } else {
+      pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+      //get current stream
+      device_t &device = platform.device;
+      stream_t currentStream = device.getStream();
+
+      //synchronize data stream to ensure the buffer is on the host
+      device.setStream(dataStream);
+      device.finish();
+
+      /*MPI exchange of host buffer*/
+      exchange->Start (haloBuf, k, op, trans);
+      exchange->Finish(haloBuf, k, op, trans);
+
+      // copy recv back to device
+      //put the result at the end of o_gv
+      haloBuf.copyTo(o_gv + k*NlocalT, k*NhaloP,
+                     0, "async: true");
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream);
+    }
   }
 }
 
+template
+void ogs_t::Gather(deviceMemory<float> v, const deviceMemory<float> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(deviceMemory<double> v, const deviceMemory<double> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(deviceMemory<int> v, const deviceMemory<int> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(deviceMemory<long long int> v, const deviceMemory<long long int> gv,
+                   const int k, const Op op, const Transpose trans);
+
+/********************************
+ * Host Gather
+ ********************************/
+
 //host versions
-void ogs_t::Gather(void*  gv,
-                   const void*  v,
+template<typename T>
+void ogs_t::Gather(memory<T> gv,
+                   const memory<T> v,
                    const int k,
-                   const Type type,
                    const Op op,
                    const Transpose trans){
+  GatherStart (gv, v, k, op, trans);
+  GatherFinish(gv, v, k, op, trans);
+}
+
+template<typename T>
+void ogs_t::GatherStart(memory<T> gv,
+                        const memory<T> v,
+                        const int k,
+                        const Op op,
+                        const Transpose trans){
   AssertGatherDefined();
 
   if (trans==Trans) { //if trans!=ogs::Trans theres no comms required
-    exchange->AllocBuffer(k*Sizeof(type));
+    exchange->AllocBuffer(k*sizeof(T));
+
+    /*Cast workspace to type T*/
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
 
     //collect halo buffer
-    gatherHalo->Gather(exchange->haloBuf, v,
-                       k, type, op, Trans);
+    gatherHalo->Gather(haloBuf, v, k, op, Trans);
 
     //prepare MPI exchange
-    exchange->Start(k, type, op, Trans, true);
+    exchange->Start(haloBuf, k, op, Trans);
+  } else {
+    //gather halo
+    gatherHalo->Gather(gv + k*NlocalT, v, k, op, trans);
+  }
+}
+
+template<typename T>
+void ogs_t::GatherFinish(memory<T> gv,
+                         const memory<T> v,
+                         const int k,
+                         const Op op,
+                         const Transpose trans){
+  AssertGatherDefined();
 
-    //local gather
-    gatherLocal->Gather(gv, v, k, type, op, Trans);
+  //queue local g operation
+  gatherLocal->Gather(gv, v, k, op, trans);
+
+  if (trans==Trans) { //if trans!=ogs::Trans theres no comms required
+    /*Cast workspace to type T*/
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
 
     //finish MPI exchange
-    exchange->Finish(k, type, op, Trans, true);
+    exchange->Finish(haloBuf, k, op, Trans);
 
     //put the result at the end of o_gv
-    std::memcpy(static_cast<char*>(gv) + k*NlocalT*Sizeof(type),
-                exchange->haloBuf,
-                k*NhaloP*Sizeof(type));
-  } else {
-    //local gather
-    gatherLocal->Gather(gv, v, k, type, op, trans);
-
-    //gather halo
-    gatherHalo->Gather(static_cast<char*>(gv) + k*NlocalT*Sizeof(type),
-                       v, k, type, op, trans);
+    haloBuf.copyTo(gv+k*NlocalT, k*NhaloP);
   }
 }
 
+template
+void ogs_t::Gather(memory<float> v, const memory<float> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(memory<double> v, const memory<double> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(memory<int> v, const memory<int> gv,
+                   const int k, const Op op, const Transpose trans);
+template
+void ogs_t::Gather(memory<long long int> v, const memory<long long int> gv,
+                   const int k, const Op op, const Transpose trans);
 
 /********************************
- * Scalar Scatter
+ * Device Scatter
  ********************************/
-void ogs_t::Scatter(occa::memory&  o_v,
-                    occa::memory&  o_gv,
+template<typename T>
+void ogs_t::Scatter(deviceMemory<T> o_v,
+                    deviceMemory<T> o_gv,
                     const int k,
-                    const Type type,
-                    const Op op,
                     const Transpose trans){
-  ScatterStart (o_v, o_gv, k, type, op, trans);
-  ScatterFinish(o_v, o_gv, k, type, op, trans);
+  ScatterStart (o_v, o_gv, k, trans);
+  ScatterFinish(o_v, o_gv, k, trans);
 }
 
-void ogs_t::ScatterStart(occa::memory&  o_v,
-                         occa::memory&  o_gv,
+template<typename T>
+void ogs_t::ScatterStart(deviceMemory<T> o_v,
+                         deviceMemory<T> o_gv,
                          const int k,
-                         const Type type,
-                         const Op op,
                          const Transpose trans){
   AssertGatherDefined();
 
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
   if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required
-    exchange->AllocBuffer(k*Sizeof(type));
+    exchange->AllocBuffer(k*sizeof(T));
 
-    //collect halo buffer
-    if (NhaloP)
-      exchange->o_haloBuf.copyFrom(o_gv + k*NlocalT*Sizeof(type),
-                                   k*NhaloP*Sizeof(type), 0, 0, "async: true");
+    device_t &device = platform.device;
 
-    //prepare MPI exchange
-    exchange->Start(k, type, op, NoTrans);
+    if (exchange->gpu_aware) {
+      //collect halo buffer
+      o_haloBuf.copyFrom(o_gv + k*NlocalT,
+                         k*NhaloP, 0, "async: true");
+
+      //wait for o_haloBuf to be ready
+      device.finish();
+
+      //prepare MPI exchange
+      exchange->Start(o_haloBuf, k, Add, NoTrans);
+    } else {
+      //get current stream
+      stream_t currentStream = device.getStream();
+
+      //if not using gpu-aware mpi move the halo buffer to the host
+      pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+      //wait for o_gv to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_gv + k*NlocalT, NhaloP*k,
+                       0, "async: true");
+      device.setStream(currentStream);
+    }
   }
 }
 
-
-void ogs_t::ScatterFinish(occa::memory&  o_v,
-                          occa::memory&  o_gv,
+template<typename T>
+void ogs_t::ScatterFinish(deviceMemory<T> o_v,
+                          deviceMemory<T> o_gv,
                           const int k,
-                          const Type type,
-                          const Op op,
                           const Transpose trans){
   AssertGatherDefined();
 
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
   //queue local s operation
-  gatherLocal->Scatter(o_v, o_gv, k, type, op, trans);
+  gatherLocal->Scatter(o_v, o_gv, k, trans);
 
   if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required
-    //finish MPI exchange (and put the result at the end of o_gv)
-    exchange->Finish(k, type, op, NoTrans);
+    if (exchange->gpu_aware) {
+      //finish MPI exchange
+      exchange->Finish(o_haloBuf, k, Add, NoTrans);
+    } else {
+      pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+      //get current stream
+      device_t &device = platform.device;
+      stream_t currentStream = device.getStream();
+
+      //synchronize data stream to ensure the buffer is on the host
+      device.setStream(dataStream);
+      device.finish();
+
+      /*MPI exchange of host buffer*/
+      exchange->Start (haloBuf, k, Add, NoTrans);
+      exchange->Finish(haloBuf, k, Add, NoTrans);
+
+      // copy recv back to device
+      haloBuf.copyTo(o_haloBuf, NhaloT*k,
+                     0, "async: true");
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream);
+    }
 
     //scatter halo buffer
-    gatherHalo->Scatter(o_v, exchange->o_haloBuf,
-                        k, type, op, NoTrans);
+    gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans);
   } else {
     //scatter halo
-    occa::memory o_gvHalo = o_gv + k*NlocalT*Sizeof(type);
-    gatherHalo->Scatter(o_v, o_gvHalo, k, type, op, trans);
+    gatherHalo->Scatter(o_v, o_gv + k*NlocalT, k, trans);
   }
 }
 
+template
+void ogs_t::Scatter(deviceMemory<float> v, const deviceMemory<float> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(deviceMemory<double> v, const deviceMemory<double> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(deviceMemory<int> v, const deviceMemory<int> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(deviceMemory<long long int> v, const deviceMemory<long long int> gv,
+                    const int k, const Transpose trans);
+
+/********************************
+ * Host Scatter
+ ********************************/
+
 //host versions
-void ogs_t::Scatter(void*  v,
-                    const void*  gv,
+template<typename T>
+void ogs_t::Scatter(memory<T> v,
+                    const memory<T> gv,
                     const int k,
-                    const Type type,
-                    const Op op,
                     const Transpose trans){
+  ScatterStart (v, gv, k, trans);
+  ScatterFinish(v, gv, k, trans);
+}
+
+template<typename T>
+void ogs_t::ScatterStart(memory<T> v,
+                         const memory<T> gv,
+                         const int k,
+                         const Transpose trans){
   AssertGatherDefined();
 
   if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required
-    exchange->AllocBuffer(k*Sizeof(type));
+    exchange->AllocBuffer(k*sizeof(T));
+
+    /*Cast workspace to type T*/
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
 
     //collect halo buffer
-    std::memcpy(exchange->haloBuf,
-                static_cast<const char*>(gv) + k*NlocalT*Sizeof(type),
-                k*NhaloP*Sizeof(type));
+    haloBuf.copyFrom(gv + k*NlocalT, k*NhaloP);
 
     //prepare MPI exchange
-    exchange->Start(k, type, op, NoTrans, true);
+    exchange->Start(haloBuf, k, Add, NoTrans);
+  }
+}
 
-    //local scatter
-    gatherLocal->Scatter(v, gv, k, type, op, NoTrans);
+template<typename T>
+void ogs_t::ScatterFinish(memory<T> v,
+                          const memory<T> gv,
+                          const int k,
+                          const Transpose trans){
+  AssertGatherDefined();
+
+  //queue local s operation
+  gatherLocal->Scatter(v, gv, k, trans);
+
+  if (trans==NoTrans) { //if trans!=ogs::NoTrans theres no comms required
+    /*Cast workspace to type T*/
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
 
     //finish MPI exchange (and put the result at the end of o_gv)
-    exchange->Finish(k, type, op, NoTrans, true);
+    exchange->Finish(haloBuf, k, Add, NoTrans);
 
     //scatter halo buffer
-    gatherHalo->Scatter(v, exchange->haloBuf,
-                        k, type, op, NoTrans);
+    gatherHalo->Scatter(v, haloBuf, k, NoTrans);
   } else {
-    //local scatter
-    gatherLocal->Scatter(v, gv, k, type, op, trans);
-
     //scatter halo
-    gatherHalo->Scatter(v,
-                        static_cast<const char*>(gv)
-                            + k*NlocalT*Sizeof(type),
-                        k, type, op, trans);
+    gatherHalo->Scatter(v, gv + k*NlocalT, k, trans);
   }
 }
 
+template
+void ogs_t::Scatter(memory<float> v, const memory<float> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(memory<double> v, const memory<double> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(memory<int> v, const memory<int> gv,
+                    const int k, const Transpose trans);
+template
+void ogs_t::Scatter(memory<long long int> v, const memory<long long int> gv,
+                    const int k, const Transpose trans);
 } //namespace ogs
 
 } //namespace libp
diff --git a/libs/ogs/ogsAllToAll.cpp b/libs/ogs/ogsAllToAll.cpp
index 7f92fa5..6ca2713 100644
--- a/libs/ogs/ogsAllToAll.cpp
+++ b/libs/ogs/ogsAllToAll.cpp
@@ -39,80 +39,100 @@ namespace libp {
 
 namespace ogs {
 
-void ogsAllToAll_t::Start(const int k,
-                          const Type type,
-                          const Op op,
-                          const Transpose trans,
-                          const bool host){
-
-  occa::device &device = platform.device;
+/**********************************
+* Host exchange
+***********************************/
+template<typename T>
+inline void ogsAllToAll_t::Start(pinnedMemory<T> &buf, const int k,
+                          const Op op, const Transpose trans){
 
-  //get current stream
-  occa::stream currentStream = device.getStream();
+  pinnedMemory<T> sendBuf = h_sendspace;
 
-  const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT;
-  const dlong N     = (trans == NoTrans) ? NhaloP : Nhalo;
+  // extract the send buffer
+  if (trans == NoTrans)
+    extract(NsendN, k, sendIdsN, buf, sendBuf);
+  else
+    extract(NsendT, k, sendIdsT, buf, sendBuf);
 
-  if (Nsend) {
-    if (gpu_aware && !host) {
-      //if using gpu-aware mpi and exchanging device buffers,
-      //  assemble the send buffer on device
-      if (trans == NoTrans) {
-        extractKernel[type](NsendN, k, o_sendIdsN, o_haloBuf, o_sendBuf);
-      } else {
-        extractKernel[type](NsendT, k, o_sendIdsT, o_haloBuf, o_sendBuf);
-      }
-      //if not overlapping, wait for kernel to finish on default stream
-      device.finish();
-    } else if (!host) {
-      //if not using gpu-aware mpi and exchanging device buffers,
-      // move the halo buffer to the host
-      device.setStream(dataStream);
-      const size_t Nbytes = k*Sizeof(type);
-      o_haloBuf.copyTo(haloBuf, N*Nbytes, 0, "async: true");
-      device.setStream(currentStream);
+  if (trans==NoTrans) {
+    for (int r=0;r<size;++r) {
+      sendCounts[r] = k*mpiSendCountsN[r];
+      recvCounts[r] = k*mpiRecvCountsN[r];
+      sendOffsets[r+1] = k*mpiSendOffsetsN[r+1];
+      recvOffsets[r+1] = k*mpiRecvOffsetsN[r+1];
+    }
+  } else {
+    for (int r=0;r<size;++r) {
+      sendCounts[r] = k*mpiSendCountsT[r];
+      recvCounts[r] = k*mpiRecvCountsT[r];
+      sendOffsets[r+1] = k*mpiSendOffsetsT[r+1];
+      recvOffsets[r+1] = k*mpiRecvOffsetsT[r+1];
     }
   }
+
+  // collect everything needed with single MPI all to all
+  comm.Ialltoallv(sendBuf,     sendCounts, sendOffsets,
+                  buf+Nhalo*k, recvCounts, recvOffsets,
+                  request);
 }
 
+template<typename T>
+inline void ogsAllToAll_t::Finish(pinnedMemory<T> &buf, const int k,
+                           const Op op, const Transpose trans){
 
-void ogsAllToAll_t::Finish(const int k,
-                           const Type type,
-                           const Op op,
-                           const Transpose trans,
-                           const bool host){
+  comm.Wait(request);
 
-  const size_t Nbytes = k*Sizeof(type);
-  occa::device &device = platform.device;
+  //if we recvieved anything via MPI, gather the recv buffer and scatter
+  // it back to to original vector
+  dlong Nrecv = recvOffsets[size];
+  if (Nrecv) {
+    // gather the recieved nodes
+    postmpi.Gather(buf, buf, k, op, trans);
+  }
+}
 
-  //get current stream
-  occa::stream currentStream = device.getStream();
+void ogsAllToAll_t::Start(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+/**********************************
+* GPU-aware exchange
+***********************************/
+template<typename T>
+void ogsAllToAll_t::Start(deviceMemory<T> &o_buf,
+                          const int k,
+                          const Op op,
+                          const Transpose trans){
 
   const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT;
 
-  if (Nsend && !gpu_aware && !host) {
-    //synchronize data stream to ensure the host buffer is on the host
-    device.setStream(dataStream);
+  if (Nsend) {
+    deviceMemory<T> o_sendBuf = o_sendspace;
+
+    // assemble the send buffer on device
+    if (trans == NoTrans) {
+      extractKernel[ogsType<T>::get()](NsendN, k, o_sendIdsN, o_buf, o_sendBuf);
+    } else {
+      extractKernel[ogsType<T>::get()](NsendT, k, o_sendIdsT, o_buf, o_sendBuf);
+    }
+    //wait for kernel to finish on default stream
+    device_t &device = platform.device;
     device.finish();
-    device.setStream(currentStream);
   }
+}
 
-  //if the halo data is on the host, extract the send buffer
-  if (host || !gpu_aware) {
-    if (trans == NoTrans)
-      extract(NsendN, k, type, sendIdsN.ptr(), haloBuf, sendBuf);
-    else
-      extract(NsendT, k, type, sendIdsT.ptr(), haloBuf, sendBuf);
-  }
+template<typename T>
+void ogsAllToAll_t::Finish(deviceMemory<T> &o_buf,
+                           const int k,
+                           const Op op,
+                           const Transpose trans){
 
-  char *sendPtr, *recvPtr;
-  if (gpu_aware && !host) { //device pointer
-    sendPtr = static_cast<char*>(o_sendBuf.ptr());
-    recvPtr = static_cast<char*>(o_haloBuf.ptr()) + Nhalo*Nbytes;
-  } else { //host pointer
-    sendPtr = sendBuf;
-    recvPtr = haloBuf + Nhalo*Nbytes;
-  }
+  deviceMemory<T> o_sendBuf = o_sendspace;
 
   if (trans==NoTrans) {
     for (int r=0;r<size;++r) {
@@ -131,40 +151,34 @@ void ogsAllToAll_t::Finish(const int k,
   }
 
   // collect everything needed with single MPI all to all
-  MPI_Alltoallv(sendPtr, sendCounts.ptr(), sendOffsets.ptr(), MPI_Type(type),
-                recvPtr, recvCounts.ptr(), recvOffsets.ptr(), MPI_Type(type),
-                comm);
+  comm.Alltoallv(o_sendBuf,     sendCounts, sendOffsets,
+                 o_buf+Nhalo*k, recvCounts, recvOffsets);
 
   //if we recvieved anything via MPI, gather the recv buffer and scatter
   // it back to to original vector
   dlong Nrecv = recvOffsets[size];
   if (Nrecv) {
-    if (!gpu_aware || host) {
-      //if not gpu-aware or recieved data is on the host,
-      // gather the recieved nodes
-      postmpi.Gather(haloBuf, haloBuf, k, type, op, trans);
-
-      if (!host) {
-        // copy recv back to device
-        device.setStream(dataStream);
-        const dlong N = (trans == Trans) ? NhaloP : Nhalo;
-        o_haloBuf.copyFrom(haloBuf, N*Nbytes, 0, "async: true");
-        device.finish(); //wait for transfer to finish
-        device.setStream(currentStream);
-      }
-    } else {
-      // gather the recieved nodes on device
-      postmpi.Gather(o_haloBuf, o_haloBuf, k, type, op, trans);
-    }
+    // gather the recieved nodes on device
+    postmpi.Gather(o_buf, o_buf, k, op, trans);
   }
 }
 
+void ogsAllToAll_t::Start(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsAllToAll_t::Start(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsAllToAll_t::Finish(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
 ogsAllToAll_t::ogsAllToAll_t(dlong Nshared,
-                             libp::memory<parallelNode_t> &sharedNodes,
+                             memory<parallelNode_t> &sharedNodes,
                              ogsOperator_t& gatherHalo,
-                             MPI_Comm _comm,
+                             stream_t _dataStream,
+                             comm_t _comm,
                              platform_t &_platform):
-  ogsExchange_t(_platform,_comm) {
+  ogsExchange_t(_platform,_comm, _dataStream) {
 
   Nhalo  = gatherHalo.NrowsT;
   NhaloP = gatherHalo.NrowsN;
@@ -195,10 +209,8 @@ ogsAllToAll_t::ogsAllToAll_t(dlong Nshared,
   }
 
   //shared counts
-  MPI_Alltoall(mpiSendCountsT.ptr(), 1, MPI_INT,
-               mpiRecvCountsT.ptr(), 1, MPI_INT, comm);
-  MPI_Alltoall(mpiSendCountsN.ptr(), 1, MPI_INT,
-               mpiRecvCountsN.ptr(), 1, MPI_INT, comm);
+  comm.Alltoall(mpiSendCountsT, mpiRecvCountsT);
+  comm.Alltoall(mpiSendCountsN, mpiRecvCountsN);
 
   //cumulative sum
   mpiSendOffsetsN[0] = 0;
@@ -234,13 +246,11 @@ ogsAllToAll_t::ogsAllToAll_t(dlong Nshared,
 
   //send the node lists so we know what we'll receive
   dlong Nrecv = mpiRecvOffsetsT[size];
-  libp::memory<parallelNode_t> recvNodes(Nrecv);
+  memory<parallelNode_t> recvNodes(Nrecv);
 
   //Send list of nodes to each rank
-  MPI_Alltoallv(sharedNodes.ptr(), mpiSendCountsT.ptr(), mpiSendOffsetsT.ptr(), MPI_PARALLELNODE_T,
-                  recvNodes.ptr(), mpiRecvCountsT.ptr(), mpiRecvOffsetsT.ptr(), MPI_PARALLELNODE_T,
-                comm);
-  MPI_Barrier(comm);
+  comm.Alltoallv(sharedNodes, mpiSendCountsT, mpiSendOffsetsT,
+                   recvNodes, mpiRecvCountsT, mpiRecvOffsetsT);
 
   //make ops for gathering halo nodes after an MPI_Allgatherv
   postmpi.platform = platform;
@@ -252,10 +262,10 @@ ogsAllToAll_t::ogsAllToAll_t(dlong Nshared,
   postmpi.rowStartsT.malloc(Nhalo+1);
 
   //make array of counters
-  libp::memory<dlong> haloGatherTCounts(Nhalo);
-  libp::memory<dlong> haloGatherNCounts(Nhalo);
+  memory<dlong> haloGatherTCounts(Nhalo);
+  memory<dlong> haloGatherNCounts(Nhalo);
 
-  //count the data that will already be in haloBuf
+  //count the data that will already be in h_haloBuf.ptr()
   for (dlong n=0;n<Nhalo;n++) {
     haloGatherNCounts[n] = (n<NhaloP) ? 1 : 0;
     haloGatherTCounts[n] = 1;
@@ -329,19 +339,17 @@ ogsAllToAll_t::ogsAllToAll_t(dlong Nshared,
   recvOffsets[0]=0;
 
   //make scratch space
-  AllocBuffer(Sizeof(Dfloat));
+  AllocBuffer(sizeof(dfloat));
 }
 
 void ogsAllToAll_t::AllocBuffer(size_t Nbytes) {
-  if (o_haloBuf.size() < postmpi.nnzT*Nbytes) {
-    if (o_haloBuf.size()) o_haloBuf.free();
-    haloBuf = static_cast<char*>(platform.hostMalloc(postmpi.nnzT*Nbytes,  nullptr, h_haloBuf));
-    o_haloBuf = platform.malloc(postmpi.nnzT*Nbytes);
+  if (o_workspace.size() < postmpi.nnzT*Nbytes) {
+    h_workspace = platform.hostMalloc<char>(postmpi.nnzT*Nbytes);
+    o_workspace = platform.malloc<char>(postmpi.nnzT*Nbytes);
   }
-  if (o_sendBuf.size() < NsendT*Nbytes) {
-    if (o_sendBuf.size()) o_sendBuf.free();
-    sendBuf = static_cast<char*>(platform.hostMalloc(NsendT*Nbytes,  nullptr, h_sendBuf));
-    o_sendBuf = platform.malloc(NsendT*Nbytes);
+  if (o_sendspace.size() < NsendT*Nbytes) {
+    h_sendspace = platform.hostMalloc<char>(NsendT*Nbytes);
+    o_sendspace = platform.malloc<char>(NsendT*Nbytes);
   }
 }
 
diff --git a/libs/ogs/ogsAuto.cpp b/libs/ogs/ogsAuto.cpp
index 8469574..7ec99e5 100644
--- a/libs/ogs/ogsAuto.cpp
+++ b/libs/ogs/ogsAuto.cpp
@@ -28,41 +28,111 @@ SOFTWARE.
 #include "ogs/ogsUtils.hpp"
 #include "ogs/ogsOperator.hpp"
 #include "ogs/ogsExchange.hpp"
+#include "timer.hpp"
 
 namespace libp {
 
 namespace ogs {
 
-static void ExchangeTest(ogsExchange_t* exchange, double time[3], bool host=false) {
+static void DeviceExchangeTest(ogsExchange_t* exchange, double time[3]) {
   const int Ncold = 10;
   const int Nhot  = 10;
-  double start, end;
   double localTime, sumTime, minTime, maxTime;
 
-  int rank, size;
-  MPI_Comm_rank(exchange->comm, &rank);
-  MPI_Comm_size(exchange->comm, &size);
+  comm_t& comm = exchange->comm;
+  int size = comm.size();
+
+  pinnedMemory<dfloat>   buf = exchange->h_workspace;
+  deviceMemory<dfloat> o_buf = exchange->o_workspace;
+
+  device_t &device = exchange->platform.device;
+
+  //dry run
+  for (int n=0;n<Ncold;++n) {
+    if (exchange->gpu_aware) {
+      /*GPU-aware exchange*/
+      exchange->Start (o_buf, 1, Add, Sym);
+      exchange->Finish(o_buf, 1, Add, Sym);
+    } else {
+      //if not using gpu-aware mpi move the halo buffer to the host
+      o_buf.copyTo(buf, exchange->Nhalo,
+                   0, "async: true");
+      device.finish();
+
+      /*MPI exchange of host buffer*/
+      exchange->Start (buf, 1, Add, Sym);
+      exchange->Finish(buf, 1, Add, Sym);
+
+      // copy recv back to device
+      o_buf.copyFrom(buf, exchange->Nhalo,
+                     0, "async: true");
+      device.finish(); //wait for transfer to finish
+    }
+  }
+
+  //hot runs
+  timePoint_t start = Time();
+  for (int n=0;n<Nhot;++n) {
+    if (exchange->gpu_aware) {
+      /*GPU-aware exchange*/
+      exchange->Start (o_buf, 1, Add, Sym);
+      exchange->Finish(o_buf, 1, Add, Sym);
+    } else {
+      //if not using gpu-aware mpi move the halo buffer to the host
+      o_buf.copyTo(buf, exchange->Nhalo,
+                   0, "async: true");
+      device.finish();
+
+      /*MPI exchange of host buffer*/
+      exchange->Start (buf, 1, Add, Sym);
+      exchange->Finish(buf, 1, Add, Sym);
+
+      // copy recv back to device
+      o_buf.copyFrom(buf, exchange->Nhalo,
+                     0, "async: true");
+      device.finish(); //wait for transfer to finish
+    }
+  }
+  timePoint_t end = Time();
+
+  localTime = ElapsedTime(start,end)/Nhot;
+  comm.Allreduce(localTime, sumTime, comm_t::Sum);
+  comm.Allreduce(localTime, maxTime, comm_t::Max);
+  comm.Allreduce(localTime, minTime, comm_t::Min);
+
+  time[0] = sumTime/size; //avg
+  time[1] = minTime;      //min
+  time[2] = maxTime;      //max
+}
+
+static void HostExchangeTest(ogsExchange_t* exchange, double time[3]) {
+  const int Ncold = 10;
+  const int Nhot  = 10;
+  double localTime, sumTime, minTime, maxTime;
+
+  comm_t& comm = exchange->comm;
+  int size = comm.size();
+
+  pinnedMemory<dfloat> buf = exchange->h_workspace;
 
   //dry run
   for (int n=0;n<Ncold;++n) {
-    exchange->Start(1, Dfloat, Add, Sym, host);
-    exchange->Finish(1, Dfloat, Add, Sym, host);
+    exchange->Start (buf, 1, Add, Sym);
+    exchange->Finish(buf, 1, Add, Sym);
   }
 
   //hot runs
-  if (!host) exchange->platform.device.finish();
-  start = MPI_Wtime();
+  timePoint_t start = Time();
   for (int n=0;n<Nhot;++n) {
-    exchange->Start(1, Dfloat, Add, Sym, host);
-    exchange->Finish(1, Dfloat, Add, Sym, host);
+    exchange->Start (buf, 1, Add, Sym);
+    exchange->Finish(buf, 1, Add, Sym);
   }
-  if (!host) exchange->platform.device.finish();
-  end = MPI_Wtime();
+  timePoint_t end = Time();
 
-  localTime = (end-start)/Nhot;
-  MPI_Allreduce(&localTime, &sumTime, 1, MPI_DOUBLE, MPI_SUM, exchange->comm);
-  MPI_Allreduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, exchange->comm);
-  MPI_Allreduce(&localTime, &minTime, 1, MPI_DOUBLE, MPI_MIN, exchange->comm);
+  localTime = ElapsedTime(start,end)/Nhot;
+  comm.Allreduce(localTime, sumTime, comm_t::Sum);
+  comm.Allreduce(localTime, maxTime, comm_t::Max);
+  comm.Allreduce(localTime, minTime, comm_t::Min);
 
   time[0] = sumTime/size; //avg
   time[1] = minTime;      //min
@@ -70,18 +140,19 @@ static void ExchangeTest(ogsExchange_t* exchange, double time[3], bool host=fals
 }
 
 ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
-                                    libp::memory<parallelNode_t> &sharedNodes,
+                                    memory<parallelNode_t> &sharedNodes,
                                     ogsOperator_t& _gatherHalo,
-                                    MPI_Comm _comm,
+                                    comm_t _comm,
                                     platform_t &_platform,
                                     const int verbose) {
 
   int rank, size;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
+  rank = comm.rank();
+  size = comm.size();
 
   if (size==1) return new ogsPairwise_t(Nshared, sharedNodes,
-                                           _gatherHalo, comm, platform);
+                                        _gatherHalo, dataStream,
+                                        comm, platform);
 
   ogsExchange_t* bestExchange;
   Method method;
@@ -102,13 +173,14 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
    * Pairwise
    ********************************/
   ogsExchange_t* pairwise = new ogsPairwise_t(Nshared, sharedNodes,
-                                           _gatherHalo, comm, platform);
+                                              _gatherHalo, dataStream,
+                                              comm, platform);
 
   //standard copy to host - exchange - copy back to device
   pairwise->gpu_aware=false;
 
   double pairwiseTime[3];
-  ExchangeTest(pairwise, pairwiseTime, false);
+  DeviceExchangeTest(pairwise, pairwiseTime);
   double pairwiseAvg = pairwiseTime[0];
 
 #ifdef GPU_AWARE_MPI
@@ -116,7 +188,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
   pairwise->gpu_aware=true;
 
   double pairwiseGATime[3];
-  ExchangeTest(pairwise, pairwiseGATime, false);
+  DeviceExchangeTest(pairwise, pairwiseGATime);
 
   if (pairwiseGATime[0] < pairwiseAvg)
     pairwiseAvg = pairwiseGATime[0];
@@ -127,7 +199,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
 
   //test exchange from host memory (just for reporting)
   double pairwiseHostTime[3];
-  ExchangeTest(pairwise, pairwiseHostTime, true);
+  HostExchangeTest(pairwise, pairwiseHostTime);
 
   bestExchange = pairwise;
   method = Pairwise;
@@ -150,13 +222,13 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
    * All-to-All
    ********************************/
   ogsExchange_t* alltoall = new ogsAllToAll_t(Nshared, sharedNodes,
-                                           _gatherHalo, comm, platform);
-
+                                           _gatherHalo, dataStream,
+                                           comm, platform);
   //standard copy to host - exchange - copy back to device
   alltoall->gpu_aware=false;
 
   double alltoallTime[3];
-  ExchangeTest(alltoall, alltoallTime, false);
+  DeviceExchangeTest(alltoall, alltoallTime);
   double alltoallAvg = alltoallTime[0];
 
 #ifdef GPU_AWARE_MPI
@@ -164,7 +236,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
   alltoall->gpu_aware=true;
 
   double alltoallGATime[3];
-  ExchangeTest(alltoall, alltoallGATime, false);
+  DeviceExchangeTest(alltoall, alltoallGATime);
 
   if (alltoallGATime[0] < alltoallAvg)
     alltoallAvg = alltoallGATime[0];
@@ -175,7 +247,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
 
   //test exchange from host memory (just for reporting)
   double alltoallHostTime[3];
-  ExchangeTest(alltoall, alltoallHostTime, true);
+  HostExchangeTest(alltoall, alltoallHostTime);
 
   if (alltoallAvg < bestTime) {
     delete bestExchange;
@@ -202,14 +274,15 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
   /********************************
    * Crystal Router
    ********************************/
-  ogsExchange_t* crystal = new ogsAllToAll_t(Nshared, sharedNodes,
-                                           _gatherHalo, comm, platform);
+  ogsExchange_t* crystal = new ogsCrystalRouter_t(Nshared, sharedNodes,
+                                                 _gatherHalo, dataStream,
+                                                 comm, platform);
 
   //standard copy to host - exchange - copy back to device
   crystal->gpu_aware=false;
 
   double crystalTime[3];
-  ExchangeTest(crystal, crystalTime, false);
+  DeviceExchangeTest(crystal, crystalTime);
   double crystalAvg = crystalTime[0];
 
 #ifdef GPU_AWARE_MPI
@@ -217,7 +290,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
   crystal->gpu_aware=true;
 
   double crystalGATime[3];
-  ExchangeTest(crystal, crystalGATime, false);
+  DeviceExchangeTest(crystal, crystalGATime);
 
   if (crystalGATime[0] < crystalAvg)
     crystalAvg = crystalGATime[0];
@@ -228,7 +301,7 @@ ogsExchange_t* ogsBase_t::AutoSetup(dlong Nshared,
 
   //test exchange from host memory (just for reporting)
   double crystalHostTime[3];
-  ExchangeTest(crystal, crystalHostTime, true);
+  HostExchangeTest(crystal, crystalHostTime);
 
   if (crystalAvg < bestTime) {
     delete bestExchange;
diff --git a/libs/ogs/ogsCrystalRouter.cpp b/libs/ogs/ogsCrystalRouter.cpp
index 661c95c..fa54b1b 100644
--- a/libs/ogs/ogsCrystalRouter.cpp
+++ b/libs/ogs/ogsCrystalRouter.cpp
@@ -39,129 +39,176 @@ namespace libp {
 
 namespace ogs {
 
-void ogsCrystalRouter_t::Start(const int k,
-                               const Type type,
-                               const Op op,
-                               const Transpose trans,
-                               const bool host){
+/**********************************
+* Host exchange
+***********************************/
+template<typename T>
+inline void ogsCrystalRouter_t::Start(pinnedMemory<T> &buf, const int k,
+                               const Op op, const Transpose trans){}
 
-  occa::device &device = platform.device;
+template<typename T>
+inline void ogsCrystalRouter_t::Finish(pinnedMemory<T> &buf, const int k,
+                                const Op op, const Transpose trans){
 
-  //get current stream
-  occa::stream currentStream = device.getStream();
-
-  const dlong N = (trans == NoTrans) ? NhaloP : Nhalo;
-
-  if (N) {
-    if (!gpu_aware && !host) {
-      //if not using gpu-aware mpi and exchanging device buffers,
-      // move the halo buffer to the host
-      device.setStream(dataStream);
-      const size_t Nbytes = k*Sizeof(type);
-      o_haloBuf.copyTo(haloBuf, N*Nbytes, 0, "async: true");
-      device.setStream(currentStream);
+
+  memory<crLevel> levels;
+  if (trans==NoTrans) {
+    levels = levelsN;
+  } else {
+    levels = levelsT;
+  }
+
+  pinnedMemory<T> sendBuf = h_sendspace;
+
+  // To start,    buf = h_workspace = h_work[(hbuf_id+0)%2];
+  //          sendBuf = h_sendspace;
+  for (int l=0;l<Nlevels;l++) {
+    pinnedMemory<T> recvBuf = h_workspace;
+
+    //post recvs
+    if (levels[l].Nmsg>0) {
+      comm.Irecv(recvBuf + levels[l].recvOffset*k,
+                 levels[l].partner,
+                 k*levels[l].Nrecv0,
+                 levels[l].partner,
+                 request[1]);
     }
+    if (levels[l].Nmsg==2) {
+      comm.Irecv(recvBuf + levels[l].recvOffset*k + levels[l].Nrecv0*k,
+                rank-1,
+                k*levels[l].Nrecv1,
+                rank-1,
+                request[2]);
+    }
+
+    //assemble send buffer
+    extract(levels[l].Nsend, k, levels[l].sendIds, buf, sendBuf);
+
+    //post send
+    comm.Isend(sendBuf,
+               levels[l].partner,
+               k*levels[l].Nsend,
+               rank,
+               request[0]);
+
+    comm.Waitall(levels[l].Nmsg+1, request);
+
+    //rotate buffers
+    h_workspace = h_work[(hbuf_id+1)%2];
+    hbuf_id = (hbuf_id+1)%2;
+
+    recvBuf = buf;
+    buf     = h_workspace;
+
+    //Gather the recv'd values into the haloBuffer
+    levels[l].gather.Gather(buf, recvBuf, k, op, Trans);
   }
 }
 
+void ogsCrystalRouter_t::Start(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+/**********************************
+* GPU-aware exchange
+***********************************/
+template<typename T>
+inline void ogsCrystalRouter_t::Start(deviceMemory<T> &o_buf,
+                                      const int k,
+                                      const Op op,
+                                      const Transpose trans){
+}
 
-void ogsCrystalRouter_t::Finish(const int k,
-                                const Type type,
-                                const Op op,
-                                const Transpose trans,
-                                const bool host){
+template<typename T>
+inline void ogsCrystalRouter_t::Finish(deviceMemory<T> &o_buf,
+                                       const int k,
+                                       const Op op,
+                                       const Transpose trans){
 
-  const size_t Nbytes = k*Sizeof(type);
-  occa::device &device = platform.device;
+  device_t &device = platform.device;
 
   //get current stream
-  occa::stream currentStream = device.getStream();
+  stream_t currentStream = device.getStream();
 
   //the intermediate kernels are always overlapped with the default stream
   device.setStream(dataStream);
 
-  dlong N = (trans == NoTrans) ? NhaloP : Nhalo;
-
-  if (N && !gpu_aware && !host) {
-    //synchronize data stream to ensure the host buffer is on the host
-    device.finish();
-  }
-
-  libp::memory<crLevel> levels;
-  if (trans==NoTrans)
+  memory<crLevel> levels;
+  if (trans==NoTrans) {
     levels = levelsN;
-  else
+  } else {
     levels = levelsT;
+  }
 
-  for (int l=0;l<Nlevels;l++) {
+  deviceMemory<T> o_sendBuf = o_sendspace;
 
-    char *sendPtr, *recvPtr;
-    if (gpu_aware && !host) { //device pointer
-      sendPtr = (char*)o_sendBuf.ptr();
-      recvPtr = (char*)o_haloBuf.ptr() + levels[l].recvOffset*Nbytes;
-    } else { //host pointer
-      sendPtr = sendBuf;
-      recvPtr = haloBuf + levels[l].recvOffset*Nbytes;
-    }
+  // To start,    o_buf = o_workspace = o_work[(buf_id+0)%2];
+  //          o_sendBuf = o_sendspace
+  for (int l=0;l<Nlevels;l++) {
+    deviceMemory<T> o_recvBuf = o_workspace;
 
     //post recvs
     if (levels[l].Nmsg>0) {
-      MPI_Irecv(recvPtr, k*levels[l].Nrecv0, MPI_Type(type),
-                levels[l].partner, levels[l].partner, comm, request+1);
+      comm.Irecv(o_recvBuf + levels[l].recvOffset*k,
+                 levels[l].partner,
+                 k*levels[l].Nrecv0,
+                 levels[l].partner,
+                 request[1]);
     }
     if (levels[l].Nmsg==2) {
-      MPI_Irecv(recvPtr+levels[l].Nrecv0*Nbytes,
-                k*levels[l].Nrecv1, MPI_Type(type),
-                rank-1, rank-1, comm, request+2);
+      comm.Irecv(o_recvBuf + levels[l].recvOffset*k + levels[l].Nrecv0*k,
+                rank-1,
+                k*levels[l].Nrecv1,
+                rank-1,
+                request[2]);
     }
 
     //assemble send buffer
-    if (gpu_aware) {
-      if (levels[l].Nsend) {
-        extractKernel[type](levels[l].Nsend, k,
-                            levels[l].o_sendIds,
-                            o_haloBuf, o_sendBuf);
-        device.finish();
-      }
-    } else {
-      extract(levels[l].Nsend, k, type,
-              levels[l].sendIds.ptr(), haloBuf, sendBuf);
+    if (levels[l].Nsend) {
+      extractKernel[ogsType<T>::get()](levels[l].Nsend, k,
+                                       levels[l].o_sendIds,
+                                       o_buf, o_sendBuf);
+      device.finish();
     }
 
     //post send
-    MPI_Isend(sendPtr, k*levels[l].Nsend, MPI_Type(type),
-              levels[l].partner, rank, comm, request+0);
-    MPI_Waitall(levels[l].Nmsg+1, request, status);
+    comm.Isend(o_sendBuf,
+               levels[l].partner,
+               k*levels[l].Nsend,
+               rank,
+               request[0]);
+
+    comm.Waitall(levels[l].Nmsg+1, request);
 
     //rotate buffers
-    o_recvBuf = o_buf[(buf_id+0)%2];
-    o_haloBuf = o_buf[(buf_id+1)%2];
-    recvBuf = buf[(buf_id+0)%2];
-    haloBuf = buf[(buf_id+1)%2];
+    o_workspace = o_work[(buf_id+1)%2];
     buf_id = (buf_id+1)%2;
 
-    //Gather the recv'd values into the haloBuffer
-    if (gpu_aware) {
-      levels[l].gather.Gather(o_haloBuf, o_recvBuf,
-                               k, type, op, Trans);
-    } else {
-      levels[l].gather.Gather(haloBuf, recvBuf,
-                               k, type, op, Trans);
-    }
-  }
+    o_recvBuf = o_buf;
+    o_buf  = o_workspace;
 
-  N = (trans == Trans) ? NhaloP : Nhalo;
-  if (N) {
-    if (!gpu_aware && !host) {
-      // copy recv back to device
-      o_haloBuf.copyFrom(haloBuf, N*Nbytes, 0, "async: true");
-      device.finish(); //wait for transfer to finish
-    }
+    //Gather the recv'd values into the haloBuffer
+    levels[l].gather.Gather(o_buf, o_recvBuf, k, op, Trans);
   }
 
   device.setStream(currentStream);
 }
 
+void ogsCrystalRouter_t::Start(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Start(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsCrystalRouter_t::Finish(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+
 /*
  *Crystal Router performs the needed MPI communcation via recursive
  * folding of a hypercube. Consider a set of NP ranks. We select a
@@ -198,11 +245,12 @@ void ogsCrystalRouter_t::Finish(const int k,
  */
 
 ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
-                                       libp::memory<parallelNode_t> &sharedNodes,
+                                       memory<parallelNode_t> &sharedNodes,
                                        ogsOperator_t& gatherHalo,
-                                       MPI_Comm _comm,
+                                       stream_t _dataStream,
+                                       comm_t _comm,
                                        platform_t &_platform):
-  ogsExchange_t(_platform,_comm) {
+  ogsExchange_t(_platform,_comm,_dataStream) {
 
   NhaloP = gatherHalo.NrowsN;
   Nhalo  = gatherHalo.NrowsT;
@@ -229,6 +277,7 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
   levelsN.malloc(Nlevels);
   levelsT.malloc(Nlevels);
 
+  request.malloc(3);
 
   //Now build the levels
   Nlevels = 0;
@@ -236,7 +285,7 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
   np_offset=0;
 
   dlong N = Nshared + Nhalo;
-  libp::memory<parallelNode_t> nodes(N);
+  memory<parallelNode_t> nodes(N);
 
   //setup is easier if we include copies of the nodes we own
   // in the list of shared nodes
@@ -298,15 +347,15 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
 
     int Nsend=(is_lo) ? Nhi : Nlo;
 
-    MPI_Isend(&Nsend, 1, MPI_INT, partner, rank, comm, request+0);
+    comm.Isend(Nsend, partner, rank, request[0]);
 
     int Nrecv0=0, Nrecv1=0;
     if (Nmsg>0)
-      MPI_Irecv(&Nrecv0, 1, MPI_INT, partner, partner, comm, request+1);
+      comm.Irecv(Nrecv0, partner, partner, request[1]);
     if (Nmsg==2)
-      MPI_Irecv(&Nrecv1, 1, MPI_INT, r_half-1, r_half-1, comm, request+2);
+      comm.Irecv(Nrecv1, r_half-1, r_half-1, request[2]);
 
-    MPI_Waitall(Nmsg+1, request, status);
+    comm.Waitall(Nmsg+1, request);
 
     int Nrecv = Nrecv0+Nrecv1;
 
@@ -315,8 +364,8 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
     else       Nhi+=Nrecv;
 
     //split node list in two
-    libp::memory<parallelNode_t> loNodes(Nlo);
-    libp::memory<parallelNode_t> hiNodes(Nhi);
+    memory<parallelNode_t> loNodes(Nlo);
+    memory<parallelNode_t> hiNodes(Nhi);
 
     Nlo=0, Nhi=0;
     for (dlong n=0;n<N;n++) {
@@ -334,7 +383,7 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
     N     = is_lo ? Nlo+Nrecv : Nhi+Nrecv;
 
     const int offset = is_lo ? Nlo : Nhi;
-    libp::memory<parallelNode_t> sendNodes = is_lo ? hiNodes : loNodes;
+    memory<parallelNode_t> sendNodes = is_lo ? hiNodes : loNodes;
 
     //count how many entries from the halo buffer we're sending
     int NentriesSendN=0;
@@ -365,29 +414,29 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
     levelsN[Nlevels].o_sendIds = platform.malloc(levelsN[Nlevels].sendIds);
 
     //share the entry count with our partner
-    MPI_Isend(&NentriesSendT, 1, MPI_INT, partner, rank, comm, request+0);
+    comm.Isend(NentriesSendT, partner, rank, request[0]);
 
     int NentriesRecvT0=0, NentriesRecvT1=0;
     if (Nmsg>0)
-      MPI_Irecv(&NentriesRecvT0, 1, MPI_INT, partner, partner, comm, request+1);
+      comm.Irecv(NentriesRecvT0, partner, partner, request[1]);
     if (Nmsg==2)
-      MPI_Irecv(&NentriesRecvT1, 1, MPI_INT, r_half-1, r_half-1, comm, request+2);
+      comm.Irecv(NentriesRecvT1, r_half-1, r_half-1, request[2]);
 
-    MPI_Waitall(Nmsg+1, request, status);
+    comm.Waitall(Nmsg+1, request);
 
     levelsT[Nlevels].Nrecv0 = NentriesRecvT0;
     levelsT[Nlevels].Nrecv1 = NentriesRecvT1;
     levelsT[Nlevels].recvOffset = NhaloExtT;
 
-    MPI_Isend(&NentriesSendN, 1, MPI_INT, partner, rank, comm, request+0);
+    comm.Isend(NentriesSendN, partner, rank, request[0]);
 
     int NentriesRecvN0=0, NentriesRecvN1=0;
     if (Nmsg>0)
-      MPI_Irecv(&NentriesRecvN0, 1, MPI_INT, partner, partner, comm, request+1);
+      comm.Irecv(NentriesRecvN0, partner, partner, request[1]);
     if (Nmsg==2)
-      MPI_Irecv(&NentriesRecvN1, 1, MPI_INT, r_half-1, r_half-1, comm, request+2);
+      comm.Irecv(NentriesRecvN1, r_half-1, r_half-1, request[2]);
 
-    MPI_Waitall(Nmsg+1, request, status);
+    comm.Waitall(Nmsg+1, request);
 
     levelsN[Nlevels].Nrecv0 = NentriesRecvN0;
     levelsN[Nlevels].Nrecv1 = NentriesRecvN1;
@@ -399,18 +448,15 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
 
 
     //send half the list to our partner
-    MPI_Isend(sendNodes.ptr(), Nsend,
-              MPI_PARALLELNODE_T, partner, rank, comm, request+0);
+    comm.Isend(sendNodes, partner, Nsend, rank, request[0]);
 
     //recv new nodes from our partner(s)
     if (Nmsg>0)
-      MPI_Irecv(nodes.ptr()+offset,        Nrecv0,
-                MPI_PARALLELNODE_T, partner, partner, comm, request+1);
+      comm.Irecv(nodes+offset, partner, Nrecv0, partner, request[1]);
     if (Nmsg==2)
-      MPI_Irecv(nodes.ptr()+offset+Nrecv0, Nrecv1,
-                MPI_PARALLELNODE_T, r_half-1, r_half-1, comm, request+2);
+      comm.Irecv(nodes+offset+Nrecv0, r_half-1, Nrecv1, r_half-1, request[2]);
 
-    MPI_Waitall(Nmsg+1, request, status);
+    comm.Waitall(Nmsg+1, request);
 
     sendNodes.free();
 
@@ -459,7 +505,7 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
 
 
     //make an index map to save the original extended halo ids
-    libp::memory<dlong> indexMap(NhaloExtT);
+    memory<dlong> indexMap(NhaloExtT);
 
     //fill newIds of new entries if possible, or give them an index
     NhaloExtT = Nhalo + NhaloExtN;
@@ -702,25 +748,24 @@ ogsCrystalRouter_t::ogsCrystalRouter_t(dlong Nshared,
   }
 
   //make scratch space
-  AllocBuffer(Sizeof(Dfloat));
+  AllocBuffer(sizeof(dfloat));
 }
 
 void ogsCrystalRouter_t::AllocBuffer(size_t Nbytes) {
 
-  if (o_sendBuf.size() < NsendMax*Nbytes) {
-    sendBuf = static_cast<char*>(platform.hostMalloc(NsendMax*Nbytes,  nullptr, h_sendBuf));
-    o_sendBuf = platform.malloc(NsendMax*Nbytes);
+  if (o_sendspace.size() < NsendMax*Nbytes) {
+    h_sendspace = platform.hostMalloc<char>(NsendMax*Nbytes);
+    o_sendspace = platform.malloc<char>(NsendMax*Nbytes);
   }
-  if (o_buf[0].size() < NrecvMax*Nbytes) {
-    buf[0] = static_cast<char*>(platform.hostMalloc(NrecvMax*Nbytes,  nullptr, h_buf[0]));
-    buf[1] = static_cast<char*>(platform.hostMalloc(NrecvMax*Nbytes,  nullptr, h_buf[1]));
-    haloBuf = buf[0];
-    recvBuf = buf[1];
-
-    o_buf[0] = platform.malloc(NrecvMax*Nbytes);
-    o_buf[1] = platform.malloc(NrecvMax*Nbytes);
-    o_haloBuf = o_buf[0];
-    o_recvBuf = o_buf[1];
+  if (o_work[0].size() < NrecvMax*Nbytes) {
+    h_work[0] = platform.hostMalloc<char>(NrecvMax*Nbytes);
+    h_work[1] = platform.hostMalloc<char>(NrecvMax*Nbytes);
+    h_workspace = h_work[0];
+    hbuf_id=0;
+
+    o_work[0] = platform.malloc<char>(NrecvMax*Nbytes);
+    o_work[1] = platform.malloc<char>(NrecvMax*Nbytes);
+    o_workspace = o_work[0];
     buf_id=0;
   }
 }
diff --git a/libs/ogs/ogsHalo.cpp b/libs/ogs/ogsHalo.cpp
index 29a9698..c47f9bd 100644
--- a/libs/ogs/ogsHalo.cpp
+++ b/libs/ogs/ogsHalo.cpp
@@ -34,173 +34,330 @@ namespace libp {
 namespace ogs {
 
 /********************************
- * Exchange
+ * Device Exchange
  ********************************/
-void halo_t::Exchange(occa::memory& o_v,
-                      const int k,
-                      const Type type) {
-  ExchangeStart (o_v, k, type);
-  ExchangeFinish(o_v, k, type);
+template<typename T>
+void halo_t::Exchange(deviceMemory<T> o_v, const int k) {
+  ExchangeStart (o_v, k);
+  ExchangeFinish(o_v, k);
 }
 
-void halo_t::ExchangeStart(occa::memory& o_v,
-                           const int k,
-                           const Type type){
-  exchange->AllocBuffer(k*Sizeof(type));
+template<typename T>
+void halo_t::ExchangeStart(deviceMemory<T> o_v, const int k){
+  exchange->AllocBuffer(k*sizeof(T));
+
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  if (exchange->gpu_aware) {
+    if (gathered_halo) {
+      //if this halo was build from a gathered ogs the halo nodes are at the end
+      o_haloBuf.copyFrom(o_v + k*NlocalT, k*NhaloP,
+                         0, "async: true");
+    } else {
+      //collect halo buffer
+      gatherHalo->Gather(o_haloBuf, o_v, k, Add, NoTrans);
+    }
+
+    //prepare MPI exchange
+    exchange->Start(o_haloBuf, k, Add, NoTrans);
 
-  //collect halo buffer
-  if (gathered_halo) {
-    //if this halo was build from a gathered ogs the halo nodes are at the end
-    if (NhaloP)
-      exchange->o_haloBuf.copyFrom(o_v + k*NlocalT*Sizeof(type),
-                                   k*NhaloP*Sizeof(type), 0, 0, "async: true");
   } else {
-    gatherHalo->Gather(exchange->o_haloBuf, o_v,
-                        k, type, Add, NoTrans);
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //if not using gpu-aware mpi move the halo buffer to the host
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    if (gathered_halo) {
+      //wait for o_v to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_v + k*NlocalT, NhaloP*k,
+                       0, "async: true");
+      device.setStream(currentStream);
+    } else {
+      //collect halo buffer
+      gatherHalo->Gather(o_haloBuf, o_v, k, Add, NoTrans);
+
+      //wait for o_haloBuf to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_haloBuf, NhaloP*k,
+                       0, "async: true");
+      device.setStream(currentStream);
+    }
   }
-
-  //prepare MPI exchange
-  exchange->Start(k, type, Add, NoTrans);
 }
 
-void halo_t::ExchangeFinish(occa::memory& o_v,
-                            const int k,
-                            const Type type){
+template<typename T>
+void halo_t::ExchangeFinish(deviceMemory<T> o_v, const int k){
 
-  //finish MPI exchange
-  exchange->Finish(k, type, Add, NoTrans);
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
 
   //write exchanged halo buffer back to vector
-  if (gathered_halo) {
-    //if this halo was build from a gathered ogs the halo nodes are at the end
-    if (NhaloP)
-      exchange->o_haloBuf.copyTo(o_v + k*(NlocalT+NhaloP)*Sizeof(type),
-                                 k*Nhalo*Sizeof(type),
-                                 0, k*NhaloP*Sizeof(type), "async: true");
+  if (exchange->gpu_aware) {
+    //finish MPI exchange
+    exchange->Finish(o_haloBuf, k, Add, NoTrans);
+
+    if (gathered_halo) {
+      o_haloBuf.copyTo(o_v + k*(NlocalT+NhaloP), k*Nhalo,
+                       k*NhaloP, "async: true");
+    } else {
+      gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans);
+    }
   } else {
-    gatherHalo->Scatter(o_v, exchange->o_haloBuf,
-                        k, type, Add, NoTrans);
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //synchronize data stream to ensure the buffer is on the host
+    device.setStream(dataStream);
+    device.finish();
+
+    /*MPI exchange of host buffer*/
+    exchange->Start (haloBuf, k, Add, NoTrans);
+    exchange->Finish(haloBuf, k, Add, NoTrans);
+
+    // copy recv back to device
+    if (gathered_halo) {
+      haloBuf.copyTo(o_v + k*(NlocalT+NhaloP), k*Nhalo,
+                     k*NhaloP, "async: true");
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream);
+    } else {
+      haloBuf.copyTo(o_haloBuf+k*NhaloP, k*Nhalo,
+                     k*NhaloP, "async: true");
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream);
+
+      gatherHalo->Scatter(o_v, o_haloBuf, k, NoTrans);
+    }
   }
 }
 
+template void halo_t::Exchange(deviceMemory<float> o_v, const int k);
+template void halo_t::Exchange(deviceMemory<double> o_v, const int k);
+template void halo_t::Exchange(deviceMemory<int> o_v, const int k);
+template void halo_t::Exchange(deviceMemory<long long int> o_v, const int k);
+
 //host version
-void halo_t::Exchange(void* v,
-                      const int k,
-                      const Type type) {
-  exchange->AllocBuffer(k*Sizeof(type));
+template<typename T>
+void halo_t::Exchange(memory<T> v, const int k) {
+  ExchangeStart (v, k);
+  ExchangeFinish(v, k);
+}
+
+template<typename T>
+void halo_t::ExchangeStart(memory<T> v, const int k) {
+  exchange->AllocBuffer(k*sizeof(T));
+
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
 
   //collect halo buffer
   if (gathered_halo) {
     //if this halo was build from a gathered ogs the halo nodes are at the end
-    std::memcpy(exchange->haloBuf,
-                static_cast<char*>(v) + k*NlocalT*Sizeof(type),
-                k*NhaloP*Sizeof(type));
+    haloBuf.copyFrom(v + k*NlocalT, k*NhaloP);
   } else {
-    gatherHalo->Gather(exchange->haloBuf, v,
-                       k, type, Add, NoTrans);
+    gatherHalo->Gather(haloBuf, v, k, Add, NoTrans);
   }
 
-  //MPI exchange
-  exchange->Start (k, type, Add, NoTrans, true);
-  exchange->Finish(k, type, Add, NoTrans, true);
+  //Prepare MPI exchange
+  exchange->Start(haloBuf, k, Add, NoTrans);
+}
+
+template<typename T>
+void halo_t::ExchangeFinish(memory<T> v, const int k) {
+
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+  //finish MPI exchange
+  exchange->Finish(haloBuf, k, Add, NoTrans);
 
   //write exchanged halo buffer back to vector
   if (gathered_halo) {
     //if this halo was build from a gathered ogs the halo nodes are at the end
-    std::memcpy(static_cast<char*>(v) + k*(NlocalT+NhaloP)*Sizeof(type),
-                static_cast<char*>(exchange->haloBuf)
-                                + k*NhaloP*Sizeof(type),
-                k*Nhalo*Sizeof(type));
+    haloBuf.copyTo(v + k*(NlocalT+NhaloP),
+                   k*Nhalo,
+                   k*NhaloP);
   } else {
-    gatherHalo->Scatter(v, exchange->haloBuf,
-                        k, type, Add, NoTrans);
+    gatherHalo->Scatter(v, haloBuf, k, NoTrans);
   }
 }
 
+template void halo_t::Exchange(memory<float> v, const int k);
+template void halo_t::Exchange(memory<double> v, const int k);
+template void halo_t::Exchange(memory<int> v, const int k);
+template void halo_t::Exchange(memory<long long int> v, const int k);
+
 /********************************
  * Combine
  ********************************/
-void halo_t::Combine(occa::memory& o_v,
-                     const int k,
-                     const Type type) {
-  CombineStart (o_v, k, type);
-  CombineFinish(o_v, k, type);
+template<typename T>
+void halo_t::Combine(deviceMemory<T> o_v, const int k) {
+  CombineStart (o_v, k);
+  CombineFinish(o_v, k);
 }
 
-void halo_t::CombineStart(occa::memory& o_v,
-                          const int k,
-                          const Type type){
-  exchange->AllocBuffer(k*Sizeof(type));
+template<typename T>
+void halo_t::CombineStart(deviceMemory<T> o_v, const int k){
+  exchange->AllocBuffer(k*sizeof(T));
 
-  //collect halo buffer
-  if (gathered_halo) {
-    //if this halo was build from a gathered ogs the halo nodes are at the end
-    if (NhaloT)
-      exchange->o_haloBuf.copyFrom(o_v + k*NlocalT*Sizeof(type),
-                                   k*NhaloT*Sizeof(type), 0, 0, "async: true");
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
+
+  if (exchange->gpu_aware) {
+    if (gathered_halo) {
+      //if this halo was build from a gathered ogs the halo nodes are at the end
+      o_haloBuf.copyFrom(o_v + k*NlocalT, k*NhaloT,
+                         0, "async: true");
+    } else {
+      //collect halo buffer
+      gatherHalo->Gather(o_haloBuf, o_v, k, Add, Trans);
+    }
+
+    //prepare MPI exchange
+    exchange->Start(o_haloBuf, k, Add, Trans);
   } else {
-    gatherHalo->Gather(exchange->o_haloBuf, o_v,
-                       k, type, Add, Trans);
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //if not using gpu-aware mpi move the halo buffer to the host
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    if (gathered_halo) {
+      //wait for o_v to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_v + k*NlocalT, NhaloT*k,
+                       0, "async: true");
+      device.setStream(currentStream);
+    } else {
+      //collect halo buffer
+      gatherHalo->Gather(o_haloBuf, o_v, k, Add, Trans);
+
+      //wait for o_haloBuf to be ready
+      device.finish();
+
+      //queue copy to host
+      device.setStream(dataStream);
+      haloBuf.copyFrom(o_haloBuf, NhaloT*k,
+                       0, "async: true");
+      device.setStream(currentStream);
+    }
   }
-
-  //prepare MPI exchange
-  exchange->Start(k, type, Add, Trans);
 }
 
+template<typename T>
+void halo_t::CombineFinish(deviceMemory<T> o_v, const int k){
 
-void halo_t::CombineFinish(occa::memory& o_v,
-                           const int k,
-                           const Type type){
-
-  //finish MPI exchange
-  exchange->Finish(k, type, Add, Trans);
+  deviceMemory<T> o_haloBuf = exchange->o_workspace;
 
   //write exchanged halo buffer back to vector
-  if (gathered_halo) {
-    //if this halo was build from a gathered ogs the halo nodes are at the end
-    if (NhaloP)
-      exchange->o_haloBuf.copyTo(o_v + k*NlocalT*Sizeof(type),
-                                 k*NhaloP*Sizeof(type),
-                                 0, 0, "async: true");
+  if (exchange->gpu_aware) {
+    //finish MPI exchange
+    exchange->Finish(o_haloBuf, k, Add, Trans);
+
+    if (gathered_halo) {
+      //if this halo was build from a gathered ogs the halo nodes are at the end
+      o_haloBuf.copyTo(o_v + k*NlocalT, k*NhaloP,
+                       0, "async: true");
+    } else {
+      gatherHalo->Scatter(o_v, o_haloBuf, k, Trans);
+    }
   } else {
-    gatherHalo->Scatter(o_v, exchange->o_haloBuf,
-                        k, type, Add, Trans);
+    pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+    //get current stream
+    device_t &device = platform.device;
+    stream_t currentStream = device.getStream();
+
+    //synchronize data stream to ensure the buffer is on the host
+    device.setStream(dataStream);
+    device.finish();
+
+    /*MPI exchange of host buffer*/
+    exchange->Start (haloBuf, k, Add, Trans);
+    exchange->Finish(haloBuf, k, Add, Trans);
+
+    if (gathered_halo) {
+      // copy recv back to device
+      haloBuf.copyTo(o_v + k*NlocalT, NhaloP*k,
+                     0, "async: true");
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream);
+    } else {
+      haloBuf.copyTo(o_haloBuf, NhaloP*k,
+                     0, "async: true");
+      device.finish(); //wait for transfer to finish
+      device.setStream(currentStream);
+
+      gatherHalo->Scatter(o_v, o_haloBuf, k, Trans);
+    }
   }
 }
 
+template void halo_t::Combine(deviceMemory<float> o_v, const int k);
+template void halo_t::Combine(deviceMemory<double> o_v, const int k);
+template void halo_t::Combine(deviceMemory<int> o_v, const int k);
+template void halo_t::Combine(deviceMemory<long long int> o_v, const int k);
+
 //host version
-void halo_t::Combine(void* v,
-                     const int k,
-                     const Type type) {
-  exchange->AllocBuffer(k*Sizeof(type));
+template<typename T>
+void halo_t::Combine(memory<T> v, const int k) {
+  CombineStart (v, k);
+  CombineFinish(v, k);
+}
+
+template<typename T>
+void halo_t::CombineStart(memory<T> v, const int k) {
+  exchange->AllocBuffer(k*sizeof(T));
+
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
 
   //collect halo buffer
   if (gathered_halo) {
     //if this halo was build from a gathered ogs the halo nodes are at the end
-    std::memcpy(exchange->haloBuf,
-                static_cast<char*>(v) + k*NlocalT*Sizeof(type),
-                k*NhaloT*Sizeof(type));
+    haloBuf.copyFrom(v + k*NlocalT, k*NhaloT);
   } else {
-    gatherHalo->Gather(exchange->haloBuf, v,
-                       k, type, Add, Trans);
+    gatherHalo->Gather(haloBuf, v, k, Add, Trans);
   }
 
-  //MPI exchange
-  exchange->Start (k, type, Add, Trans, true);
-  exchange->Finish(k, type, Add, Trans, true);
+  //Prepare MPI exchange
+  exchange->Start(haloBuf, k, Add, Trans);
+}
+
+
+template<typename T>
+void halo_t::CombineFinish(memory<T> v, const int k) {
+
+  pinnedMemory<T> haloBuf = exchange->h_workspace;
+
+  //finish MPI exchange
+  exchange->Finish(haloBuf, k, Add, Trans);
 
   //write exchanged halo buffer back to vector
   if (gathered_halo) {
     //if this halo was build from a gathered ogs the halo nodes are at the end
-    std::memcpy(static_cast<char*>(v) + k*NlocalT*Sizeof(type),
-                exchange->haloBuf,
-                k*NhaloP*Sizeof(type));
+    haloBuf.copyTo(v + k*NlocalT, k*NhaloP);
   } else {
-    gatherHalo->Scatter(v, exchange->haloBuf,
-                        k, type, Add, Trans);
+    gatherHalo->Scatter(v, haloBuf, k, Trans);
   }
 }
 
+template void halo_t::Combine(memory<float> v, const int k);
+template void halo_t::Combine(memory<double> v, const int k);
+template void halo_t::Combine(memory<int> v, const int k);
+template void halo_t::Combine(memory<long long int> v, const int k);
+
 } //namespace ogs
 
 } //namespace libp
diff --git a/libs/ogs/ogsOperator.cpp b/libs/ogs/ogsOperator.cpp
index 5546ca8..2ee7833 100644
--- a/libs/ogs/ogsOperator.cpp
+++ b/libs/ogs/ogsOperator.cpp
@@ -35,31 +35,34 @@ namespace ogs {
 
 template<typename T>
 struct Op_Add {
-  const T init(){ return T{0}; }
-  void operator()(T& gv, const T v) { gv += v; }
+  inline const T init(){ return T{0}; }
+  inline void operator()(T& gv, const T v) { gv += v; }
 };
 template<typename T>
 struct Op_Mul {
-  const T init(){ return T{1}; }
-  void operator()(T& gv, const T v) { gv *= v; }
+  inline const T init(){ return T{1}; }
+  inline void operator()(T& gv, const T v) { gv *= v; }
 };
 template<typename T>
 struct Op_Max {
-  const T init(){ return -std::numeric_limits<T>::max(); }
-  void operator()(T& gv, const T v) { gv = (v>gv) ? v : gv; }
+  inline const T init(){ return -std::numeric_limits<T>::max(); }
+  inline void operator()(T& gv, const T v) { gv = (v>gv) ? v : gv; }
 };
 template<typename T>
 struct Op_Min {
-  const T init() {return  std::numeric_limits<T>::max(); }
-  void operator()(T& gv, const T v) { gv = (v<gv) ? v : gv; }
+  inline const T init() {return  std::numeric_limits<T>::max(); }
+  inline void operator()(T& gv, const T v) { gv = (v<gv) ? v : gv; }
 };
 
 /********************************
  * Gather Operation
  ********************************/
-template <typename T, template<typename> class Op>
-void ogsOperator_t::Gather(T* gv,
-                           const T* v,
+template <template<typename> class U,
+          template<typename> class V,
+          template<typename> class Op,
+          typename T>
+void ogsOperator_t::Gather(U<T> gv,
+                           const V<T> v,
                            const int K,
                            const Transpose trans) {
 
@@ -75,6 +78,9 @@ void ogsOperator_t::Gather(T* gv,
     colIds = colIdsT.ptr();
   }
 
+  const T* v_ptr  = v.ptr();
+  T* gv_ptr = gv.ptr();
+
   #pragma omp parallel for
   for(dlong n=0;n<Nrows;++n){
     const dlong start = rowStarts[n];
@@ -83,109 +89,80 @@ void ogsOperator_t::Gather(T* gv,
     for (int k=0;k<K;++k) {
       T val = Op<T>().init();
       for(dlong g=start;g<end;++g){
-        Op<T>()(val, v[k+colIds[g]*K]);
+        Op<T>()(val, v_ptr[k+colIds[g]*K]);
       }
-      gv[k+n*K] = val;
+      gv_ptr[k+n*K] = val;
     }
   }
 }
 
-void ogsOperator_t::Gather(void* gv,
-                            const void* v,
-                            const int k,
-                            const Type type,
-                            const Op op,
-                            const Transpose trans) {
+template <template<typename> class U,
+          template<typename> class V,
+          typename T>
+void ogsOperator_t::Gather(U<T> gv,
+                           const V<T> v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans) {
   switch (op){
     case Add:
-    switch (type){
-      case Float:  Gather<float  , Op_Add>
-                    (static_cast<float  *>(gv),
-                     static_cast<const float  *>( v),
-                     k, trans); break;
-      case Double: Gather<double , Op_Add>
-                    (static_cast<double  *>(gv),
-                     static_cast<const double  *>( v),
-                     k, trans); break;
-      case Int32:  Gather<int32_t, Op_Add>
-                    (static_cast<int32_t  *>(gv),
-                     static_cast<const int32_t  *>( v),
-                     k, trans); break;
-      case Int64:  Gather<int64_t, Op_Add>
-                    (static_cast<int64_t  *>(gv),
-                     static_cast<const int64_t  *>( v),
-                     k, trans); break;
-    }
-    break;
+      Gather<U, V, Op_Add, T>(gv, v, k, trans); break;
     case Mul:
-    switch (type){
-      case Float:  Gather<float  , Op_Mul>
-                    (static_cast<float  *>(gv),
-                     static_cast<const float  *>( v),
-                     k, trans); break;
-      case Double: Gather<double , Op_Mul>
-                    (static_cast<double  *>(gv),
-                     static_cast<const double  *>( v),
-                     k, trans); break;
-      case Int32:  Gather<int32_t, Op_Mul>
-                    (static_cast<int32_t  *>(gv),
-                     static_cast<const int32_t  *>( v),
-                     k, trans); break;
-      case Int64:  Gather<int64_t, Op_Mul>
-                    (static_cast<int64_t  *>(gv),
-                     static_cast<const int64_t  *>( v),
-                     k, trans); break;
-    }
-    break;
+      Gather<U, V, Op_Mul, T>(gv, v, k, trans); break;
     case Max:
-    switch (type){
-      case Float:  Gather<float  , Op_Max>
-                    (static_cast<float  *>(gv),
-                     static_cast<const float  *>( v),
-                     k, trans); break;
-      case Double: Gather<double , Op_Max>
-                    (static_cast<double  *>(gv),
-                     static_cast<const double  *>( v),
-                     k, trans); break;
-      case Int32:  Gather<int32_t, Op_Max>
-                    (static_cast<int32_t  *>(gv),
-                     static_cast<const int32_t  *>( v),
-                     k, trans); break;
-      case Int64:  Gather<int64_t, Op_Max>
-                    (static_cast<int64_t  *>(gv),
-                     static_cast<const int64_t  *>( v),
-                     k, trans); break;
-    }
-    break;
+      Gather<U, V, Op_Max, T>(gv, v, k, trans); break;
     case Min:
-    switch (type){
-      case Float:  Gather<float  , Op_Min>
-                    (static_cast<float  *>(gv),
-                     static_cast<const float  *>( v),
-                     k, trans); break;
-      case Double: Gather<double , Op_Min>
-                    (static_cast<double  *>(gv),
-                     static_cast<const double  *>( v),
-                     k, trans); break;
-      case Int32:  Gather<int32_t, Op_Min>
-                    (static_cast<int32_t  *>(gv),
-                     static_cast<const int32_t  *>( v),
-                     k, trans); break;
-      case Int64:  Gather<int64_t, Op_Min>
-                    (static_cast<int64_t  *>(gv),
-                     static_cast<const int64_t  *>( v),
-                     k, trans); break;
-    }
-    break;
+      Gather<U, V, Op_Min, T>(gv, v, k, trans); break;
   }
 }
 
-void ogsOperator_t::Gather(occa::memory&  o_gv,
-                            occa::memory&  o_v,
-                            const int k,
-                            const Type type,
-                            const Op op,
-                            const Transpose trans) {
+template
+void ogsOperator_t::Gather(memory<float> gv, const memory<float> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(memory<double> gv, const memory<double> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(memory<int> gv, const memory<int> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(memory<long long int> gv, const memory<long long int> v,
+                           const int k, const Op op, const Transpose trans);
+
+template
+void ogsOperator_t::Gather(pinnedMemory<float> gv, const memory<float> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<double> gv, const memory<double> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<int> gv, const memory<int> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<long long int> gv, const memory<long long int> v,
+                           const int k, const Op op, const Transpose trans);
+
+template
+void ogsOperator_t::Gather(pinnedMemory<float> gv, const pinnedMemory<float> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<double> gv, const pinnedMemory<double> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<int> gv, const pinnedMemory<int> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(pinnedMemory<long long int> gv, const pinnedMemory<long long int> v,
+                           const int k, const Op op, const Transpose trans);
+
+
+template<typename T>
+void ogsOperator_t::Gather(deviceMemory<T> o_gv,
+                           deviceMemory<T> o_v,
+                           const int k,
+                           const Op op,
+                           const Transpose trans) {
+  constexpr Type type = ogsType<T>::get();
   InitializeKernels(platform, type, op);
 
   if (trans==NoTrans) {
@@ -209,14 +186,28 @@ void ogsOperator_t::Gather(occa::memory&  o_gv,
   }
 }
 
+template
+void ogsOperator_t::Gather(deviceMemory<float> gv, const deviceMemory<float> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(deviceMemory<double> gv, const deviceMemory<double> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(deviceMemory<int> gv, const deviceMemory<int> v,
+                           const int k, const Op op, const Transpose trans);
+template
+void ogsOperator_t::Gather(deviceMemory<long long int> gv, const deviceMemory<long long int> v,
+                           const int k, const Op op, const Transpose trans);
+
+
 /********************************
  * Scatter Operation
  ********************************/
-template <typename T>
-void ogsOperator_t::Scatter(T* v,
-                            const T* gv,
-                            const int K,
-                            const Transpose trans) {
+template <template<typename> class U,
+          template<typename> class V,
+          typename T>
+void ogsOperator_t::Scatter(U<T> v, const V<T> gv,
+                            const int K, const Transpose trans) {
 
   dlong Nrows;
   dlong *rowStarts, *colIds;
@@ -230,6 +221,9 @@ void ogsOperator_t::Scatter(T* v,
     colIds = colIdsT.ptr();
   }
 
+  T* v_ptr  = v.ptr();
+  const T* gv_ptr = gv.ptr();
+
   #pragma omp parallel for
   for(dlong n=0;n<Nrows;++n){
     const dlong start = rowStarts[n];
@@ -237,40 +231,44 @@ void ogsOperator_t::Scatter(T* v,
 
     for(dlong g=start;g<end;++g){
       for (int k=0;k<K;++k) {
-        v[k+colIds[g]*K] = gv[k+n*K];
+        v_ptr[k+colIds[g]*K] = gv_ptr[k+n*K];
       }
     }
   }
 }
 
-void ogsOperator_t::Scatter(void* v,
-                             const void* gv,
-                             const int k,
-                             const Type type,
-                             const Op op,
-                             const Transpose trans) {
-  switch (type){
-    case Float:  Scatter<float  >(static_cast<float  *>(v),
-                                  static_cast<const float  *>(gv),
-                                  k, trans); break;
-    case Double: Scatter<double >(static_cast<double *>(v),
-                                  static_cast<const double *>(gv),
-                                  k, trans); break;
-    case Int32:  Scatter<int32_t>(static_cast<int32_t*>(v),
-                                  static_cast<const int32_t*>(gv),
-                                  k, trans); break;
-    case Int64:  Scatter<int64_t>(static_cast<int64_t*>(v),
-                                  static_cast<const int64_t*>(gv),
-                                  k, trans); break;
-  }
-}
+template
+void ogsOperator_t::Scatter(memory<float> v, const memory<float> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<double> v, const memory<double> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<int> v, const memory<int> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<long long int> v, const memory<long long int> gv,
+                            const int K, const Transpose trans);
+
+template
+void ogsOperator_t::Scatter(memory<float> v, const pinnedMemory<float> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<double> v, const pinnedMemory<double> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<int> v, const pinnedMemory<int> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(memory<long long int> v, const pinnedMemory<long long int> gv,
+                            const int K, const Transpose trans);
 
-void ogsOperator_t::Scatter(occa::memory&  o_v,
-                             occa::memory&  o_gv,
-                             const int k,
-                             const Type type,
-                             const Op op,
-                             const Transpose trans) {
+template<typename T>
+void ogsOperator_t::Scatter(deviceMemory<T> o_v,
+                            deviceMemory<T> o_gv,
+                            const int k,
+                            const Transpose trans) {
+  constexpr Type type = ogsType<T>::get();
   InitializeKernels(platform, type, Add);
 
   if (trans==Trans) {
@@ -294,12 +292,26 @@ void ogsOperator_t::Scatter(occa::memory&  o_v,
   }
 }
 
+template
+void ogsOperator_t::Scatter(deviceMemory<float> v, const deviceMemory<float> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(deviceMemory<double> v, const deviceMemory<double> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(deviceMemory<int> v, const deviceMemory<int> gv,
+                            const int K, const Transpose trans);
+template
+void ogsOperator_t::Scatter(deviceMemory<long long int> v, const deviceMemory<long long int> gv,
+                            const int K, const Transpose trans);
+
 /********************************
  * GatherScatter Operation
  ********************************/
-template <typename T, template<typename> class Op>
-void ogsOperator_t::GatherScatter(T* v,
-                                  const int K,
+template <template<typename> class U,
+          template<typename> class Op,
+          typename T>
+void ogsOperator_t::GatherScatter(U<T> v, const int K,
                                   const Transpose trans) {
 
   dlong Nrows;
@@ -326,6 +338,8 @@ void ogsOperator_t::GatherScatter(T* v,
     sColIds    = colIdsT.ptr();
   }
 
+  T* v_ptr = v.ptr();
+
   #pragma omp parallel for
   for(dlong n=0;n<Nrows;++n){
     const dlong gstart = gRowStarts[n];
@@ -336,77 +350,52 @@ void ogsOperator_t::GatherScatter(T* v,
     for (int k=0;k<K;++k) {
       T val = Op<T>().init();
       for(dlong g=gstart;g<gend;++g){
-        Op<T>()(val, v[k+gColIds[g]*K]);
+        Op<T>()(val, v_ptr[k+gColIds[g]*K]);
       }
       for(dlong s=sstart;s<send;++s){
-        v[k+sColIds[s]*K] = val;
+        v_ptr[k+sColIds[s]*K] = val;
       }
     }
   }
 }
 
-void ogsOperator_t::GatherScatter(void* v,
+template <template<typename> class U,
+          typename T>
+void ogsOperator_t::GatherScatter(U<T> v,
                                   const int k,
-                                  const Type type,
                                   const Op op,
                                   const Transpose trans) {
   switch (op){
     case Add:
-    switch (type){
-      case Float:  GatherScatter<float  , Op_Add>
-                    (static_cast<float  *>(v), k, trans); break;
-      case Double: GatherScatter<double , Op_Add>
-                    (static_cast<double *>(v), k, trans); break;
-      case Int32:  GatherScatter<int32_t, Op_Add>
-                    (static_cast<int32_t*>(v), k, trans); break;
-      case Int64:  GatherScatter<int64_t, Op_Add>
-                    (static_cast<int64_t*>(v), k, trans); break;
-    }
-    break;
+      GatherScatter<U, Op_Add, T>(v, k, trans); break;
     case Mul:
-    switch (type){
-      case Float:  GatherScatter<float  , Op_Mul>
-                    (static_cast<float  *>(v), k, trans); break;
-      case Double: GatherScatter<double , Op_Mul>
-                    (static_cast<double *>(v), k, trans); break;
-      case Int32:  GatherScatter<int32_t, Op_Mul>
-                    (static_cast<int32_t*>(v), k, trans); break;
-      case Int64:  GatherScatter<int64_t, Op_Mul>
-                    (static_cast<int64_t*>(v), k, trans); break;
-    }
-    break;
+      GatherScatter<U, Op_Mul, T>(v, k, trans); break;
     case Max:
-    switch (type){
-      case Float:  GatherScatter<float  , Op_Max>
-                    (static_cast<float  *>(v), k, trans); break;
-      case Double: GatherScatter<double , Op_Max>
-                    (static_cast<double *>(v), k, trans); break;
-      case Int32:  GatherScatter<int32_t, Op_Max>
-                    (static_cast<int32_t*>(v), k, trans); break;
-      case Int64:  GatherScatter<int64_t, Op_Max>
-                    (static_cast<int64_t*>(v), k, trans); break;
-    }
-    break;
+      GatherScatter<U, Op_Max, T>(v, k, trans); break;
     case Min:
-    switch (type){
-      case Float:  GatherScatter<float  , Op_Min>
-                    (static_cast<float  *>(v), k, trans); break;
-      case Double: GatherScatter<double , Op_Min>
-                    (static_cast<double *>(v), k, trans); break;
-      case Int32:  GatherScatter<int32_t, Op_Min>
-                    (static_cast<int32_t*>(v), k, trans); break;
-      case Int64:  GatherScatter<int64_t, Op_Min>
-                    (static_cast<int64_t*>(v), k, trans); break;
-    }
-    break;
+      GatherScatter<U, Op_Min, T>(v, k, trans); break;
   }
 }
 
-void ogsOperator_t::GatherScatter(occa::memory&  o_v,
+template
+void ogsOperator_t::GatherScatter(memory<float> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(memory<double> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(memory<int> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(memory<long long int> v,const int k,
+                                  const Op op, const Transpose trans);
+
+template<typename T>
+void ogsOperator_t::GatherScatter(deviceMemory<T> o_v,
                                   const int k,
-                                  const Type type,
                                   const Op op,
                                   const Transpose trans) {
+  constexpr Type type = ogsType<T>::get();
   InitializeKernels(platform, type, Add);
 
   if (trans==Trans) {
@@ -442,6 +431,19 @@ void ogsOperator_t::GatherScatter(occa::memory&  o_v,
   }
 }
 
+template
+void ogsOperator_t::GatherScatter(deviceMemory<float> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(deviceMemory<double> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(deviceMemory<int> v,const int k,
+                                  const Op op, const Transpose trans);
+template
+void ogsOperator_t::GatherScatter(deviceMemory<long long int> v,const int k,
+                                  const Op op, const Transpose trans);
+
 void ogsOperator_t::setupRowBlocks() {
 
   dlong blockSumN=0, blockSumT=0;
@@ -452,24 +454,15 @@ void ogsOperator_t::setupRowBlocks() {
 
   for (dlong i=0;i<NrowsT;i++) {
     const dlong rowSizeN  = rowStartsN[i+1]-rowStartsN[i];
-
-    if (rowSizeN > gatherNodesPerBlock) {
-      //this row is pathalogically big. We can't currently run this
-      std::stringstream ss;
-      ss << "Multiplicity of global node id: " << i
-         << " in ogsOperator_t::setupRowBlocks is too large.";
-      HIPBONE_ABORT(ss.str())
-    }
-
     const dlong rowSizeT  = rowStartsT[i+1]-rowStartsT[i];
 
-    if (rowSizeT > gatherNodesPerBlock) {
-      //this row is pathalogically big. We can't currently run this
-      std::stringstream ss;
-      ss << "Multiplicity of global node id: " << i
-         << " in ogsOperator_t::setupRowBlocks is too large.";
-      HIPBONE_ABORT(ss.str())
-    }
+    //this row is pathalogically big. We can't currently run this
+    LIBP_ABORT("Multiplicity of global node id: " << i
+               << " in ogsOperator_t::setupRowBlocks is too large.",
+               rowSizeN > gatherNodesPerBlock);
+    LIBP_ABORT("Multiplicity of global node id: " << i
+               << " in ogsOperator_t::setupRowBlocks is too large.",
+               rowSizeT > gatherNodesPerBlock);
 
     if (blockSumN+rowSizeN > gatherNodesPerBlock) { //adding this row will exceed the nnz per block
       NrowBlocksN++; //count the previous block
@@ -544,47 +537,44 @@ void ogsOperator_t::Free() {
 }
 
 
-template<typename T>
+template <template<typename> class U,
+          template<typename> class V,
+          typename T>
 void extract(const dlong N,
              const int K,
-             const dlong *ids,
-             const T *q,
-             T *gatherq) {
+             const memory<dlong> ids,
+             const U<T> q,
+             V<T> gatherq) {
+
+  const T* q_ptr = q.ptr();
+  T* gatherq_ptr = gatherq.ptr();
 
   for(dlong n=0;n<N;++n){
     const dlong gid = ids[n];
 
     for (int k=0;k<K;++k) {
-      gatherq[k+n*K] = q[k+gid*K];
+      gatherq_ptr[k+n*K] = q_ptr[k+gid*K];
     }
   }
 }
 
-void extract(const dlong N,
-             const int K,
-             const Type type,
-             const dlong *ids,
-             const void *q,
-             void *gatherq) {
-  switch (type){
-    case Float:  extract<float  >(N, K, ids,
-                                 static_cast<const float  *>(q),
-                                 static_cast<float  *>(gatherq));
-                 break;
-    case Double: extract<double >(N, K, ids,
-                                 static_cast<const double*>(q),
-                                 static_cast<double*>(gatherq));
-                 break;
-    case Int32:  extract<int32_t>(N, K, ids,
-                                 static_cast<const int32_t*>(q),
-                                 static_cast<int32_t*>(gatherq));
-                 break;
-    case Int64:  extract<int64_t>(N, K, ids,
-                                 static_cast<const int64_t*>(q),
-                                 static_cast<int64_t*>(gatherq));
-                 break;
-  }
-}
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const memory<float> q, memory<float> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const memory<double> q, memory<double> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const memory<int> q, memory<int> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const memory<long long int> q, memory<long long int> gatherq);
+
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const pinnedMemory<float> q, pinnedMemory<float> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const pinnedMemory<double> q, pinnedMemory<double> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const pinnedMemory<int> q, pinnedMemory<int> gatherq);
+template void extract(const dlong N, const int K, const memory<dlong> ids,
+                      const pinnedMemory<long long int> q, pinnedMemory<long long int> gatherq);
 
 } //namespace ogs
 
diff --git a/libs/ogs/ogsPairwise.cpp b/libs/ogs/ogsPairwise.cpp
index a6583e2..2cf83e9 100644
--- a/libs/ogs/ogsPairwise.cpp
+++ b/libs/ogs/ogsPairwise.cpp
@@ -39,73 +39,110 @@ namespace libp {
 
 namespace ogs {
 
-void ogsPairwise_t::Start(const int k,
-                          const Type type,
-                          const Op op,
-                          const Transpose trans,
-                          const bool host){
+/**********************************
+* Host exchange
+***********************************/
+template<typename T>
+inline void ogsPairwise_t::Start(pinnedMemory<T> &buf, const int k,
+                          const Op op, const Transpose trans){
 
-  occa::device &device = platform.device;
+  pinnedMemory<T> sendBuf = h_sendspace;
 
-  //get current stream
-  occa::stream currentStream = device.getStream();
+  const int NranksSend  = (trans==NoTrans) ? NranksSendN  : NranksSendT;
+  const int NranksRecv  = (trans==NoTrans) ? NranksRecvN  : NranksRecvT;
+  const int *sendRanks  = (trans==NoTrans) ? sendRanksN.ptr()   : sendRanksT.ptr();
+  const int *recvRanks  = (trans==NoTrans) ? recvRanksN.ptr()   : recvRanksT.ptr();
+  const int *sendCounts = (trans==NoTrans) ? sendCountsN.ptr()  : sendCountsT.ptr();
+  const int *recvCounts = (trans==NoTrans) ? recvCountsN.ptr()  : recvCountsT.ptr();
+  const int *sendOffsets= (trans==NoTrans) ? sendOffsetsN.ptr() : sendOffsetsT.ptr();
+  const int *recvOffsets= (trans==NoTrans) ? recvOffsetsN.ptr() : recvOffsetsT.ptr();
 
-  const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT;
-  const dlong N     = (trans == NoTrans) ? NhaloP : Nhalo;
+  //post recvs
+  for (int r=0;r<NranksRecv;r++) {
+    comm.Irecv(buf + Nhalo*k + recvOffsets[r]*k,
+               recvRanks[r],
+               k*recvCounts[r],
+               recvRanks[r],
+               requests[r]);
+  }
 
-  if (Nsend) {
-    if (gpu_aware && !host) {
-      //if using gpu-aware mpi and exchanging device buffers,
-      //  assemble the send buffer on device
-      if (trans == NoTrans) {
-        extractKernel[type](NsendN, k, o_sendIdsN, o_haloBuf, o_sendBuf);
-      } else {
-        extractKernel[type](NsendT, k, o_sendIdsT, o_haloBuf, o_sendBuf);
-      }
-      //if not overlapping, wait for kernel to finish on default stream
-      device.finish();
-    } else if (!host) {
-      //if not using gpu-aware mpi and exchanging device buffers,
-      // move the halo buffer to the host
-      device.setStream(dataStream);
-      const size_t Nbytes = k*Sizeof(type);
-      o_haloBuf.copyTo(haloBuf, N*Nbytes, 0, "async: true");
-      device.setStream(currentStream);
-    }
+  // extract the send buffer
+  if (trans == NoTrans)
+    extract(NsendN, k, sendIdsN, buf, sendBuf);
+  else
+    extract(NsendT, k, sendIdsT, buf, sendBuf);
+
+  //post sends
+  for (int r=0;r<NranksSend;r++) {
+    comm.Isend(sendBuf + sendOffsets[r]*k,
+              sendRanks[r],
+              k*sendCounts[r],
+              rank,
+              requests[NranksRecv+r]);
   }
 }
 
+template<typename T>
+inline void ogsPairwise_t::Finish(pinnedMemory<T> &buf, const int k,
+                           const Op op, const Transpose trans){
 
-void ogsPairwise_t::Finish(const int k,
-                           const Type type,
-                           const Op op,
-                           const Transpose trans,
-                           const bool host){
+  const int NranksSend  = (trans==NoTrans) ? NranksSendN  : NranksSendT;
+  const int NranksRecv  = (trans==NoTrans) ? NranksRecvN  : NranksRecvT;
+  const int *recvOffsets= (trans==NoTrans) ? recvOffsetsN.ptr() : recvOffsetsT.ptr();
 
-  const size_t Nbytes = k*Sizeof(type);
-  occa::device &device = platform.device;
+  comm.Waitall(NranksRecv+NranksSend, requests);
 
-  //get current stream
-  occa::stream currentStream = device.getStream();
+  //if we recvieved anything via MPI, gather the recv buffer and scatter
+  // it back to to original vector
+  dlong Nrecv = recvOffsets[NranksRecv];
+  if (Nrecv) {
+    // gather the recieved nodes
+    postmpi.Gather(buf, buf, k, op, trans);
+  }
+}
+
+void ogsPairwise_t::Start(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); }
+void ogsPairwise_t::Start(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsPairwise_t::Start(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsPairwise_t::Start(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(pinnedMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(pinnedMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(pinnedMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(pinnedMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
+/**********************************
+* GPU-aware exchange
+***********************************/
+template<typename T>
+void ogsPairwise_t::Start(deviceMemory<T> &o_buf,
+                          const int k,
+                          const Op op,
+                          const Transpose trans){
 
   const dlong Nsend = (trans == NoTrans) ? NsendN : NsendT;
 
-  if (Nsend && !gpu_aware && !host) {
-    //synchronize data stream to ensure the host buffer is on the host
-    device.setStream(dataStream);
+  if (Nsend) {
+    deviceMemory<T> o_sendBuf = o_sendspace;
+
+    //  assemble the send buffer on device
+    if (trans == NoTrans) {
+      extractKernel[ogsType<T>::get()](NsendN, k, o_sendIdsN, o_buf, o_sendBuf);
+    } else {
+      extractKernel[ogsType<T>::get()](NsendT, k, o_sendIdsT, o_buf, o_sendBuf);
+    }
+    //wait for kernel to finish on default stream
+    device_t &device = platform.device;
     device.finish();
-    device.setStream(dataStream);
   }
+}
 
+template<typename T>
+void ogsPairwise_t::Finish(deviceMemory<T> &o_buf,
+                           const int k,
+                           const Op op,
+                           const Transpose trans){
 
-  char *sendPtr, *recvPtr;
-  if (gpu_aware && !host) { //device pointer
-    sendPtr = (char*)o_sendBuf.ptr();
-    recvPtr = (char*)o_haloBuf.ptr() + Nhalo*Nbytes;
-  } else { //host pointer
-    sendPtr = sendBuf;
-    recvPtr = haloBuf + Nhalo*Nbytes;
-  }
+  deviceMemory<T> o_sendBuf = o_sendspace;
 
   const int NranksSend  = (trans==NoTrans) ? NranksSendN  : NranksSendT;
   const int NranksRecv  = (trans==NoTrans) ? NranksRecvN  : NranksRecvT;
@@ -118,59 +155,49 @@ void ogsPairwise_t::Finish(const int k,
 
   //post recvs
   for (int r=0;r<NranksRecv;r++) {
-    MPI_Irecv(recvPtr+recvOffsets[r]*Nbytes,
-              k*recvCounts[r], MPI_Type(type), recvRanks[r],
-              recvRanks[r], comm, requests.ptr()+r);
-  }
-
-  //if the halo data is on the host, extract the send buffer
-  if (host || !gpu_aware) {
-    if (trans == NoTrans)
-      extract(NsendN, k, type, sendIdsN.ptr(), haloBuf, sendBuf);
-    else
-      extract(NsendT, k, type, sendIdsT.ptr(), haloBuf, sendBuf);
+    comm.Irecv(o_buf + Nhalo*k + recvOffsets[r]*k,
+              recvRanks[r],
+              k*recvCounts[r],
+              recvRanks[r],
+              requests[r]);
   }
 
   //post sends
   for (int r=0;r<NranksSend;r++) {
-    MPI_Isend(sendPtr+sendOffsets[r]*Nbytes,
-              k*sendCounts[r], MPI_Type(type), sendRanks[r],
-              rank, comm, requests.ptr()+NranksRecv+r);
+    comm.Isend(o_sendBuf + sendOffsets[r]*k,
+              sendRanks[r],
+              k*sendCounts[r],
+              rank,
+              requests[NranksRecv+r]);
   }
-  MPI_Waitall(NranksRecv+NranksSend, requests.ptr(), statuses.ptr());
+
+  comm.Waitall(NranksRecv+NranksSend, requests);
 
   //if we recvieved anything via MPI, gather the recv buffer and scatter
   // it back to to original vector
   dlong Nrecv = recvOffsets[NranksRecv];
   if (Nrecv) {
-    if (!gpu_aware || host) {
-      //if not gpu-aware or recieved data is on the host,
-      // gather the recieved nodes
-      postmpi.Gather(haloBuf, haloBuf,
-                      k, type, op, trans);
-
-      if (!host) {
-        // copy recv back to device
-        device.setStream(dataStream);
-        const dlong N = (trans == Trans) ? NhaloP : Nhalo;
-        o_haloBuf.copyFrom(haloBuf, N*Nbytes, 0, "async: true");
-        device.finish(); //wait for transfer to finish
-        device.setStream(currentStream);
-      }
-    } else {
-      // gather the recieved nodes on device
-      postmpi.Gather(o_haloBuf, o_haloBuf,
-                      k, type, op, trans);
-    }
+    // gather the recieved nodes on device
+    postmpi.Gather(o_buf, o_buf, k, op, trans);
   }
 }
 
+void ogsPairwise_t::Start(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Start<float>(buf, k, op, trans); }
+void ogsPairwise_t::Start(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Start<double>(buf, k, op, trans); }
+void ogsPairwise_t::Start(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Start<int>(buf, k, op, trans); }
+void ogsPairwise_t::Start(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Start<long long int>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(deviceMemory<float> &buf, const int k, const Op op, const Transpose trans) { Finish<float>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(deviceMemory<double> &buf, const int k, const Op op, const Transpose trans) { Finish<double>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(deviceMemory<int> &buf, const int k, const Op op, const Transpose trans) { Finish<int>(buf, k, op, trans); }
+void ogsPairwise_t::Finish(deviceMemory<long long int> &buf, const int k, const Op op, const Transpose trans) { Finish<long long int>(buf, k, op, trans); }
+
 ogsPairwise_t::ogsPairwise_t(dlong Nshared,
-                             libp::memory<parallelNode_t> &sharedNodes,
+                             memory<parallelNode_t> &sharedNodes,
                              ogsOperator_t& gatherHalo,
-                             MPI_Comm _comm,
+                             stream_t _dataStream,
+                             comm_t _comm,
                              platform_t &_platform):
-  ogsExchange_t(_platform,_comm) {
+  ogsExchange_t(_platform,_comm,_dataStream) {
 
   Nhalo  = gatherHalo.NrowsT;
   NhaloP = gatherHalo.NrowsN;
@@ -185,14 +212,14 @@ ogsPairwise_t::ogsPairwise_t(dlong Nshared,
        });
 
   //make mpi allgatherv counts and offsets
-  libp::memory<int> mpiSendCountsT(size,0);
-  libp::memory<int> mpiSendCountsN(size,0);
-  libp::memory<int> mpiRecvCountsT(size);
-  libp::memory<int> mpiRecvCountsN(size);
-  libp::memory<int> mpiSendOffsetsT(size+1);
-  libp::memory<int> mpiSendOffsetsN(size+1);
-  libp::memory<int> mpiRecvOffsetsT(size+1);
-  libp::memory<int> mpiRecvOffsetsN(size+1);
+  memory<int> mpiSendCountsT(size,0);
+  memory<int> mpiSendCountsN(size,0);
+  memory<int> mpiRecvCountsT(size);
+  memory<int> mpiRecvCountsN(size);
+  memory<int> mpiSendOffsetsT(size+1);
+  memory<int> mpiSendOffsetsN(size+1);
+  memory<int> mpiRecvOffsetsT(size+1);
+  memory<int> mpiRecvOffsetsN(size+1);
 
   for (dlong n=0;n<Nshared;n++) { //loop through nodes we need to send
     const int r = sharedNodes[n].rank;
@@ -201,10 +228,8 @@ ogsPairwise_t::ogsPairwise_t(dlong Nshared,
   }
 
   //shared counts
-  MPI_Alltoall(mpiSendCountsT.ptr(), 1, MPI_INT,
-               mpiRecvCountsT.ptr(), 1, MPI_INT, comm);
-  MPI_Alltoall(mpiSendCountsN.ptr(), 1, MPI_INT,
-               mpiRecvCountsN.ptr(), 1, MPI_INT, comm);
+  comm.Alltoall(mpiSendCountsT, mpiRecvCountsT);
+  comm.Alltoall(mpiSendCountsN, mpiRecvCountsN);
 
   //cumulative sum
   mpiSendOffsetsN[0] = 0;
@@ -240,13 +265,11 @@ ogsPairwise_t::ogsPairwise_t(dlong Nshared,
 
   //send the node lists so we know what we'll receive
   dlong Nrecv = mpiRecvOffsetsT[size];
-  libp::memory<parallelNode_t> recvNodes(Nrecv);
+  memory<parallelNode_t> recvNodes(Nrecv);
 
   //Send list of nodes to each rank
-  MPI_Alltoallv(sharedNodes.ptr(), mpiSendCountsT.ptr(), mpiSendOffsetsT.ptr(), MPI_PARALLELNODE_T,
-                  recvNodes.ptr(), mpiRecvCountsT.ptr(), mpiRecvOffsetsT.ptr(), MPI_PARALLELNODE_T,
-                comm);
-  MPI_Barrier(comm);
+  comm.Alltoallv(sharedNodes, mpiSendCountsT, mpiSendOffsetsT,
+                   recvNodes, mpiRecvCountsT, mpiRecvOffsetsT);
 
   //make ops for gathering halo nodes after an MPI_Allgatherv
   postmpi.platform = platform;
@@ -258,10 +281,10 @@ ogsPairwise_t::ogsPairwise_t(dlong Nshared,
   postmpi.rowStartsT.calloc(Nhalo+1);
 
   //make array of counters
-  libp::memory<dlong> haloGatherTCounts(Nhalo);
-  libp::memory<dlong> haloGatherNCounts(Nhalo);
+  memory<dlong> haloGatherTCounts(Nhalo);
+  memory<dlong> haloGatherNCounts(Nhalo);
 
-  //count the data that will already be in haloBuf
+  //count the data that will already be in h_haloBuf.ptr()
   for (dlong n=0;n<Nhalo;n++) {
     haloGatherNCounts[n] = (n<NhaloP) ? 1 : 0;
     haloGatherTCounts[n] = 1;
@@ -386,22 +409,19 @@ ogsPairwise_t::ogsPairwise_t(dlong Nshared,
   recvOffsetsT[NranksRecvT] = mpiRecvOffsetsT[size];
 
   requests.malloc(NranksSendT+NranksRecvT);
-  statuses.malloc(NranksSendT+NranksRecvT);
 
   //make scratch space
-  AllocBuffer(Sizeof(Dfloat));
+  AllocBuffer(sizeof(dfloat));
 }
 
 void ogsPairwise_t::AllocBuffer(size_t Nbytes) {
-  if (o_haloBuf.size() < postmpi.nnzT*Nbytes) {
-    if (o_haloBuf.size()) o_haloBuf.free();
-    haloBuf = static_cast<char*>(platform.hostMalloc(postmpi.nnzT*Nbytes,  nullptr, h_haloBuf));
-    o_haloBuf = platform.malloc(postmpi.nnzT*Nbytes);
+  if (o_workspace.size() < postmpi.nnzT*Nbytes) {
+    h_workspace = platform.hostMalloc<char>(postmpi.nnzT*Nbytes);
+    o_workspace = platform.malloc<char>(postmpi.nnzT*Nbytes);
   }
-  if (o_sendBuf.size() < NsendT*Nbytes) {
-    if (o_sendBuf.size()) o_sendBuf.free();
-    sendBuf = static_cast<char*>(platform.hostMalloc(NsendT*Nbytes,  nullptr, h_sendBuf));
-    o_sendBuf = platform.malloc(NsendT*Nbytes);
+  if (o_sendspace.size() < NsendT*Nbytes) {
+    h_sendspace = platform.hostMalloc<char>(NsendT*Nbytes);
+    o_sendspace = platform.malloc<char>(NsendT*Nbytes);
   }
 }
 
diff --git a/libs/ogs/ogsSetup.cpp b/libs/ogs/ogsSetup.cpp
index 6323752..96608bd 100644
--- a/libs/ogs/ogsSetup.cpp
+++ b/libs/ogs/ogsSetup.cpp
@@ -28,6 +28,7 @@ SOFTWARE.
 #include "ogs/ogsUtils.hpp"
 #include "ogs/ogsOperator.hpp"
 #include "ogs/ogsExchange.hpp"
+#include "timer.hpp"
 
 #ifdef GLIBCXX_PARALLEL
 #include <parallel/algorithm>
@@ -41,8 +42,8 @@ namespace libp {
 namespace ogs {
 
 void ogs_t::Setup(const dlong _N,
-                  hlong *ids,
-                  MPI_Comm _comm,
+                  memory<hlong> ids,
+                  comm_t _comm,
                   const Kind _kind,
                   const Method method,
                   const bool _unique,
@@ -52,8 +53,8 @@ void ogs_t::Setup(const dlong _N,
 }
 
 void halo_t::Setup(const dlong _N,
-                  hlong *ids,
-                  MPI_Comm _comm,
+                  memory<hlong> ids,
+                  comm_t _comm,
                   const Method method,
                   const bool verbose,
                   platform_t& _platform){
@@ -66,8 +67,8 @@ void halo_t::Setup(const dlong _N,
  * Setup
  ********************************/
 void ogsBase_t::Setup(const dlong _N,
-                      hlong *ids,
-                      MPI_Comm _comm,
+                      memory<hlong> ids,
+                      comm_t _comm,
                       const Kind _kind,
                       const Method method,
                       const bool _unique,
@@ -77,21 +78,26 @@ void ogsBase_t::Setup(const dlong _N,
   //release resources if this ogs was setup before
   Free();
 
+  timePoint_t start = Time();
+
   platform = _platform;
 
+  if (!dataStream.isInitialized())
+      dataStream = platform.device.createStream();
+
   N = _N;
   comm = _comm;
   kind = _kind;
   unique = _unique;
 
   int rank, size;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
+  rank = comm.rank();
+  size = comm.size();
 
   //sanity check options
-  if (   (kind==Unsigned && unique==true)
-      || (kind==Halo && unique==true) )
-    HIPBONE_ABORT("Invalid ogs setup requested");
+  LIBP_ABORT("Invalid ogs setup requested",
+             (kind==Unsigned && unique==true)
+              || (kind==Halo && unique==true));
 
   //count how many ids are non-zero
   dlong Nids=0;
@@ -99,7 +105,7 @@ void ogsBase_t::Setup(const dlong _N,
     if (ids[n]!=0) Nids++;
 
   // make list of nodes
-  libp::memory<parallelNode_t> nodes(Nids);
+  memory<parallelNode_t> nodes(Nids);
 
   //fill the data (squeezing out zero ids)
   Nids=0;
@@ -114,9 +120,6 @@ void ogsBase_t::Setup(const dlong _N,
     }
   }
 
-  /*Register MPI_PARALLELNODE_T type*/
-  InitMPIType();
-
   //flag which nodes are shared via MPI
   FindSharedNodes(Nids, nodes, verbose);
 
@@ -124,7 +127,7 @@ void ogsBase_t::Setup(const dlong _N,
   // construct sharedNodes which contains all the info
   // we need to setup the MPI exchange.
   dlong Nshared=0;
-  libp::memory<parallelNode_t> sharedNodes;
+  memory<parallelNode_t> sharedNodes;
   ConstructSharedNodes(Nids, nodes, Nshared, sharedNodes);
 
   Nids=0;
@@ -159,15 +162,18 @@ void ogsBase_t::Setup(const dlong _N,
   if (method == AllToAll) {
     exchange = std::shared_ptr<ogsExchange_t>(
                   new ogsAllToAll_t(Nshared, sharedNodes,
-                                    *gatherHalo, comm, platform));
+                                    *gatherHalo, dataStream,
+                                    comm, platform));
   } else if (method == Pairwise) {
     exchange = std::shared_ptr<ogsExchange_t>(
                   new ogsPairwise_t(Nshared, sharedNodes,
-                                    *gatherHalo, comm, platform));
+                                    *gatherHalo, dataStream,
+                                    comm, platform));
   } else if (method == CrystalRouter) {
     exchange = std::shared_ptr<ogsExchange_t>(
                   new ogsCrystalRouter_t(Nshared, sharedNodes,
-                                         *gatherHalo, comm, platform));
+                                         *gatherHalo, dataStream,
+                                         comm, platform));
   } else { //Auto
     exchange = std::shared_ptr<ogsExchange_t>(
                   AutoSetup(Nshared, sharedNodes,
@@ -175,30 +181,33 @@ void ogsBase_t::Setup(const dlong _N,
                             platform, verbose));
   }
 
-  /*Free the MPI_PARALLELNODE_T type*/
-  DestroyMPIType();
+  timePoint_t end = GlobalPlatformTime(platform);
+  double elapsedTime = ElapsedTime(start, end);
+
+  if (!rank && verbose) {
+    std::cout << "ogs Setup Time: " << elapsedTime << " seconds." << std::endl;
+  }
 }
 
 void ogsBase_t::FindSharedNodes(const dlong Nids,
-                                libp::memory<parallelNode_t> &nodes,
+                                memory<parallelNode_t> &nodes,
                                 const int verbose){
 
   int rank, size;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
+  rank = comm.rank();
+  size = comm.size();
 
-  libp::memory<int> sendCounts(size,0);
-  libp::memory<int> recvCounts(size);
-  libp::memory<int> sendOffsets(size+1);
-  libp::memory<int> recvOffsets(size+1);
+  memory<int> sendCounts(size,0);
+  memory<int> recvCounts(size);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
 
   //count number of ids we're sending
   for (dlong n=0;n<Nids;n++) {
     sendCounts[nodes[n].destRank]++;
   }
 
-  MPI_Alltoall(sendCounts.ptr(), 1, MPI_INT,
-               recvCounts.ptr(), 1, MPI_INT, comm);
+  comm.Alltoall(sendCounts, recvCounts);
 
   sendOffsets[0] = 0;
   recvOffsets[0] = 0;
@@ -221,12 +230,11 @@ void ogsBase_t::FindSharedNodes(const dlong Nids,
 
   dlong recvN = recvOffsets[size]; //total ids to recv
 
-  libp::memory<parallelNode_t> recvNodes(recvN);
+  memory<parallelNode_t> recvNodes(recvN);
 
   //Send all the nodes to their destination rank.
-  MPI_Alltoallv(    nodes.ptr(), sendCounts.ptr(), sendOffsets.ptr(), MPI_PARALLELNODE_T,
-                recvNodes.ptr(), recvCounts.ptr(), recvOffsets.ptr(), MPI_PARALLELNODE_T,
-                comm);
+  comm.Alltoallv(    nodes, sendCounts, sendOffsets,
+                 recvNodes, recvCounts, recvOffsets);
 
   //remember this ordering
   for (dlong n=0;n<recvN;n++) {
@@ -243,7 +251,7 @@ void ogsBase_t::FindSharedNodes(const dlong Nids,
   // Our list is sorted by baseId to group nodes with the same globalId together
   // We now want to flag which nodes are shared via MPI
 
-  int locally_unique=1;
+  int is_unique=1;
 
   dlong Nshared=0;
 
@@ -271,16 +279,13 @@ void ogsBase_t::FindSharedNodes(const dlong Nids,
           if (recvNodes[i].baseId>0) positiveCount++;
 
         //if we didnt find a sole positive baseId, the gather is not well-defined
-        if (positiveCount!=1) locally_unique=0;
+        if (positiveCount!=1) is_unique=0;
       }
 
       // When making a halo excahnge, check that we have a leading positive id
-      if (kind==Halo && positiveCount!=1) {
-        std::stringstream ss;
-        ss << "Found " << positiveCount << " positive Ids for baseId: "
-           << abs(recvNodes[start].baseId)<< ".";
-        HIPBONE_ABORT(ss.str());
-      }
+      LIBP_ABORT("Found " << positiveCount << " positive Ids for baseId: "
+                 << abs(recvNodes[start].baseId)<< ".",
+                 kind==Halo && positiveCount!=1);
 
       //determine if this node is shared via MPI,
       int shared=1;
@@ -304,13 +309,11 @@ void ogsBase_t::FindSharedNodes(const dlong Nids,
   }
 
   //shared the unique node check so we know if the gather operation is well-defined
-  int globally_unique=1;
-  MPI_Allreduce(&locally_unique, &globally_unique, 1, MPI_INT, MPI_MIN, comm);
-  gather_defined = (globally_unique==1);
+  comm.Allreduce(is_unique, comm_t::Min);
+  gather_defined = (is_unique==1);
 
-  hlong Nshared_local = Nshared;
   hlong Nshared_global = Nshared;
-  MPI_Reduce(&Nshared_local, &Nshared_global, 1, MPI_HLONG, MPI_SUM, 0, comm);
+  comm.Reduce(Nshared_global, 0);
   if (!rank && verbose) {
     std::cout << "ogs Setup: " << Nshared_global << " unique labels shared." << std::endl;
   }
@@ -323,19 +326,16 @@ void ogsBase_t::FindSharedNodes(const dlong Nids,
   permute(recvN, recvNodes, [](const parallelNode_t& a) { return a.newId; } );
 
   //Return all the nodes to their origin rank.
-  MPI_Alltoallv(recvNodes.ptr(), recvCounts.ptr(), recvOffsets.ptr(), MPI_PARALLELNODE_T,
-                    nodes.ptr(), sendCounts.ptr(), sendOffsets.ptr(), MPI_PARALLELNODE_T,
-                comm);
+  comm.Alltoallv(recvNodes, recvCounts, recvOffsets,
+                     nodes, sendCounts, sendOffsets);
 }
 
 void ogsBase_t::ConstructSharedNodes(const dlong Nids,
-                                     libp::memory<parallelNode_t> &nodes,
+                                     memory<parallelNode_t> &nodes,
                                      dlong &Nshared,
-                                     libp::memory<parallelNode_t> &sharedNodes) {
+                                     memory<parallelNode_t> &sharedNodes) {
 
-  int rank, size;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
+  int size = comm.size();
 
   // sort based on abs(baseId)
   sort(nodes.ptr(), nodes.ptr()+Nids,
@@ -388,11 +388,11 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids,
   Ngather = NlocalP+NhaloP;
 
   //global total
-  hlong NgatherLocal = (hlong) Ngather;
-  MPI_Allreduce(&NgatherLocal, &(NgatherGlobal), 1, MPI_HLONG, MPI_SUM, comm);
+  NgatherGlobal = Ngather;
+  comm.Allreduce(NgatherGlobal);
 
   //extract the leading node from each shared baseId
-  libp::memory<parallelNode_t> sendSharedNodes(NhaloT);
+  memory<parallelNode_t> sendSharedNodes(NhaloT);
 
   NhaloT=0;
   for (dlong n=0;n<Nids;n++) {
@@ -408,7 +408,7 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids,
 
   // Use the newId index to reorder the baseId groups based on
   // the order we encouter them in their original ordering.
-  libp::memory<dlong> indexMap(NbaseIds, -1);
+  memory<dlong> indexMap(NbaseIds, -1);
 
   dlong localCntN = 0, localCntT = NlocalP;  //start point for local gather nodes
   dlong haloCntN  = 0, haloCntT  = NhaloP;   //start point for halo gather nodes
@@ -441,10 +441,10 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids,
 
   indexMap.free();
 
-  libp::memory<int> sendCounts(size,0);
-  libp::memory<int> recvCounts(size);
-  libp::memory<int> sendOffsets(size+1);
-  libp::memory<int> recvOffsets(size+1);
+  memory<int> sendCounts(size,0);
+  memory<int> recvCounts(size);
+  memory<int> sendOffsets(size+1);
+  memory<int> recvOffsets(size+1);
 
   // sort based on destination rank
   sort(sendSharedNodes.ptr(), sendSharedNodes.ptr()+NhaloT,
@@ -457,8 +457,7 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids,
     sendCounts[sendSharedNodes[n].destRank]++;
   }
 
-  MPI_Alltoall(sendCounts.ptr(), 1, MPI_INT,
-               recvCounts.ptr(), 1, MPI_INT, comm);
+  comm.Alltoall(sendCounts, recvCounts);
 
   sendOffsets[0] = 0;
   recvOffsets[0] = 0;
@@ -468,15 +467,13 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids,
   }
   dlong recvN = recvOffsets[size]; //total ids to recv
 
-  libp::memory<parallelNode_t> recvSharedNodes(recvN);
+  memory<parallelNode_t> recvSharedNodes(recvN);
 
   //Send all the nodes to their destination rank.
-  MPI_Alltoallv(sendSharedNodes.ptr(), sendCounts.ptr(), sendOffsets.ptr(), MPI_PARALLELNODE_T,
-                recvSharedNodes.ptr(), recvCounts.ptr(), recvOffsets.ptr(), MPI_PARALLELNODE_T,
-                comm);
+  comm.Alltoallv(sendSharedNodes, sendCounts, sendOffsets,
+                 recvSharedNodes, recvCounts, recvOffsets);
 
   //free up some space
-  MPI_Barrier(comm);
   sendSharedNodes.free();
   sendCounts.free();
   recvCounts.free();
@@ -490,10 +487,10 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids,
        });
 
   //count number of shared nodes we will be sending
-  libp::memory<int> sharedSendCounts(size,0);
-  libp::memory<int> sharedRecvCounts(size);
-  libp::memory<int> sharedSendOffsets(size+1);
-  libp::memory<int> sharedRecvOffsets(size+1);
+  memory<int> sharedSendCounts(size,0);
+  memory<int> sharedRecvCounts(size);
+  memory<int> sharedSendOffsets(size+1);
+  memory<int> sharedRecvOffsets(size+1);
 
   start=0;
   for (dlong n=0;n<recvN;n++) {
@@ -515,8 +512,7 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids,
   // information to the involved ranks.
 
   //share counts
-  MPI_Alltoall(sharedSendCounts.ptr(), 1, MPI_INT,
-               sharedRecvCounts.ptr(), 1, MPI_INT, comm);
+  comm.Alltoall(sharedSendCounts, sharedRecvCounts);
 
   //cumulative sum
   sharedSendOffsets[0] = 0;
@@ -527,7 +523,7 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids,
   }
 
   //make a send buffer
-  libp::memory<parallelNode_t> sharedSendNodes(sharedSendOffsets[size]);
+  memory<parallelNode_t> sharedSendNodes(sharedSendOffsets[size]);
 
   //reset sendCounts
   for (int r=0;r<size;r++) sharedSendCounts[r]=0;
@@ -562,20 +558,15 @@ void ogsBase_t::ConstructSharedNodes(const dlong Nids,
 
   //make sharedNodes to hold the exchange data we recv
   Nshared = sharedRecvOffsets[size];
-  sharedNodes = libp::memory<parallelNode_t>(Nshared);
+  sharedNodes = memory<parallelNode_t>(Nshared);
 
   //Share all the gathering info
-  MPI_Alltoallv(sharedSendNodes.ptr(), sharedSendCounts.ptr(), sharedSendOffsets.ptr(), MPI_PARALLELNODE_T,
-                    sharedNodes.ptr(), sharedRecvCounts.ptr(), sharedRecvOffsets.ptr(), MPI_PARALLELNODE_T,
-                comm);
+  comm.Alltoallv(sharedSendNodes, sharedSendCounts, sharedSendOffsets,
+                     sharedNodes, sharedRecvCounts, sharedRecvOffsets);
 }
 
 //Make local and halo gather operators using nodes list
-void ogsBase_t::LocalSignedSetup(const dlong Nids, libp::memory<parallelNode_t> &nodes){
-
-  int rank, size;
-  MPI_Comm_rank(comm, &rank);
-  MPI_Comm_size(comm, &size);
+void ogsBase_t::LocalSignedSetup(const dlong Nids, memory<parallelNode_t> &nodes){
 
   gatherLocal = std::make_shared<ogsOperator_t>(platform);
   gatherHalo  = std::make_shared<ogsOperator_t>(platform);
@@ -593,10 +584,10 @@ void ogsBase_t::LocalSignedSetup(const dlong Nids, libp::memory<parallelNode_t>
 
   //tally up how many nodes are being gathered to each gatherNode and
   //  map to a local ordering
-  libp::memory<dlong> localGatherNCounts(gatherLocal->NrowsT,0);
-  libp::memory<dlong> localGatherTCounts(gatherLocal->NrowsT,0);
-  libp::memory<dlong> haloGatherNCounts(gatherHalo->NrowsT,0);
-  libp::memory<dlong> haloGatherTCounts(gatherHalo->NrowsT,0);
+  memory<dlong> localGatherNCounts(gatherLocal->NrowsT,0);
+  memory<dlong> localGatherTCounts(gatherLocal->NrowsT,0);
+  memory<dlong> haloGatherNCounts(gatherHalo->NrowsT,0);
+  memory<dlong> haloGatherTCounts(gatherHalo->NrowsT,0);
 
   for (dlong i=0;i<Nids;i++) {
     const dlong gid = nodes[i].newId; //re-mapped baseId on this rank
@@ -692,7 +683,7 @@ void ogsBase_t::LocalSignedSetup(const dlong Nids, libp::memory<parallelNode_t>
 }
 
 //Make local and halo gather operators using nodes list
-void ogsBase_t::LocalUnsignedSetup(const dlong Nids, libp::memory<parallelNode_t> &nodes){
+void ogsBase_t::LocalUnsignedSetup(const dlong Nids, memory<parallelNode_t> &nodes){
 
   gatherLocal = std::make_shared<ogsOperator_t>(platform);
   gatherHalo  = std::make_shared<ogsOperator_t>(platform);
@@ -710,8 +701,8 @@ void ogsBase_t::LocalUnsignedSetup(const dlong Nids, libp::memory<parallelNode_t
 
   //tally up how many nodes are being gathered to each gatherNode and
   //  map to a local ordering
-  libp::memory<dlong> localGatherTCounts(gatherLocal->NrowsT,0);
-  libp::memory<dlong> haloGatherTCounts(gatherHalo->NrowsT,0);
+  memory<dlong> localGatherTCounts(gatherLocal->NrowsT,0);
+  memory<dlong> haloGatherTCounts(gatherHalo->NrowsT,0);
 
   for (dlong i=0;i<Nids;i++) {
     const dlong gid = nodes[i].newId; //re-mapped baseId on this rank
@@ -785,7 +776,7 @@ void ogsBase_t::LocalUnsignedSetup(const dlong Nids, libp::memory<parallelNode_t
 }
 
 //Make local and halo gather operators using nodes list
-void ogsBase_t::LocalHaloSetup(const dlong Nids, libp::memory<parallelNode_t> &nodes){
+void ogsBase_t::LocalHaloSetup(const dlong Nids, memory<parallelNode_t> &nodes){
 
   gatherHalo  = std::make_shared<ogsOperator_t>(platform);
   gatherHalo->kind = Signed;
@@ -797,8 +788,8 @@ void ogsBase_t::LocalHaloSetup(const dlong Nids, libp::memory<parallelNode_t> &n
 
   //tally up how many nodes are being gathered to each gatherNode and
   //  map to a local ordering
-  libp::memory<dlong> haloGatherNCounts(gatherHalo->NrowsT,0);
-  libp::memory<dlong> haloGatherTCounts(gatherHalo->NrowsT,0);
+  memory<dlong> haloGatherNCounts(gatherHalo->NrowsT,0);
+  memory<dlong> haloGatherTCounts(gatherHalo->NrowsT,0);
 
   for (dlong i=0;i<Nids;i++) {
     const dlong gid = nodes[i].newId; //re-mapped baseId on this rank
@@ -856,6 +847,7 @@ void ogsBase_t::LocalHaloSetup(const dlong Nids, libp::memory<parallelNode_t> &n
 }
 
 void ogsBase_t::Free() {
+  comm.Free();
   gatherLocal = nullptr;
   gatherHalo = nullptr;
   exchange = nullptr;
@@ -867,17 +859,19 @@ void ogsBase_t::Free() {
 }
 
 void ogsBase_t::AssertGatherDefined() {
-  if (!gather_defined) {
-    HIPBONE_ABORT("Gather operation not well-defined.");
-  }
+  LIBP_ABORT("Gather operation not well-defined.",
+             !gather_defined);
 }
 
 //Populate the local mapping of the original ids and the gathered ordering
-void ogs_t::SetupGlobalToLocalMapping(dlong *GlobalToLocal) {
+void ogs_t::SetupGlobalToLocalMapping(memory<dlong> GlobalToLocal) {
+
+  LIBP_ABORT("ogs handle is not set up.",
+             NgatherGlobal==0);
 
   //Note: Must have GlobalToLocal have N entries.
 
-  libp::memory<dlong> ids(NlocalT+NhaloT);
+  memory<dlong> ids(NlocalT+NhaloT);
 
   for (dlong n=0;n<NlocalT+NhaloT;n++)
     ids[n] = n;
@@ -885,23 +879,25 @@ void ogs_t::SetupGlobalToLocalMapping(dlong *GlobalToLocal) {
   for (dlong n=0;n<N;n++)
     GlobalToLocal[n] = -1;
 
-  gatherLocal->Scatter(GlobalToLocal, ids.ptr(),
-                       1, Dlong, Add, NoTrans);
-  gatherHalo->Scatter(GlobalToLocal, ids.ptr()+NlocalT,
-                       1, Dlong, Add, NoTrans);
+  gatherLocal->Scatter(GlobalToLocal, ids,
+                       1, NoTrans);
+  gatherHalo->Scatter(GlobalToLocal, ids+NlocalT,
+                       1, NoTrans);
 }
 
 void halo_t::SetupFromGather(ogs_t& ogs) {
 
   ogs.AssertGatherDefined();
 
+  platform = ogs.platform;
+  comm = ogs.comm;
+
   N = ogs.NlocalT + ogs.NhaloT;
 
   Ngather = Ngather;
   Nhalo = ogs.NhaloT - ogs.NhaloP;
 
   NgatherGlobal = ogs.NgatherGlobal;
-  comm = ogs.comm;
 
   kind = Halo;
   unique = ogs.unique;
diff --git a/libs/ogs/ogsUtils.cpp b/libs/ogs/ogsUtils.cpp
index b342020..4bdfc35 100644
--- a/libs/ogs/ogsUtils.cpp
+++ b/libs/ogs/ogsUtils.cpp
@@ -34,42 +34,13 @@ namespace libp {
 
 namespace ogs {
 
-MPI_Datatype MPI_PARALLELNODE_T;
-
-void InitMPIType() {
-  // Make the MPI_PARALLELNODE_T data type
-  parallelNode_t node{};
-  MPI_Datatype dtype[6] = {MPI_DLONG, MPI_HLONG,
-                           MPI_DLONG, MPI_INT,
-                           MPI_INT, MPI_INT};
-  int blength[6] = {1, 1, 1, 1, 1, 1};
-  MPI_Aint addr[6], displ[6];
-  MPI_Get_address ( &(node.localId), addr+0);
-  MPI_Get_address ( &(node.baseId), addr+1);
-  MPI_Get_address ( &(node.newId), addr+2);
-  MPI_Get_address ( &(node.sign), addr+3);
-  MPI_Get_address ( &(node.rank), addr+4);
-  MPI_Get_address ( &(node.destRank), addr+5);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  displ[4] = addr[4] - addr[0];
-  displ[5] = addr[5] - addr[0];
-  MPI_Type_create_struct (6, blength, displ, dtype, &MPI_PARALLELNODE_T);
-  MPI_Type_commit (&MPI_PARALLELNODE_T);
-}
-
-void DestroyMPIType() {
-  MPI_Type_free(&MPI_PARALLELNODE_T);
-}
+stream_t ogsBase_t::dataStream;
 
-occa::kernel ogsOperator_t::gatherScatterKernel[4][4];
-occa::kernel ogsOperator_t::gatherKernel[4][4];
-occa::kernel ogsOperator_t::scatterKernel[4];
+kernel_t ogsOperator_t::gatherScatterKernel[4][4];
+kernel_t ogsOperator_t::gatherKernel[4][4];
+kernel_t ogsOperator_t::scatterKernel[4];
 
-occa::kernel ogsExchange_t::extractKernel[4];
-occa::stream ogsExchange_t::dataStream;
+kernel_t ogsExchange_t::extractKernel[4];
 
 
 void InitializeKernels(platform_t& platform, const Type type, const Op op) {
@@ -77,7 +48,7 @@ void InitializeKernels(platform_t& platform, const Type type, const Op op) {
   //check if the gather kernel is initialized
   if (!ogsOperator_t::gatherKernel[type][op].isInitialized()) {
 
-    occa::properties kernelInfo = platform.props();
+    properties_t kernelInfo = platform.props();
 
     kernelInfo["defines/p_blockSize"] = ogsOperator_t::blockSize;
     kernelInfo["defines/p_gatherNodesPerBlock"] = ogsOperator_t::gatherNodesPerBlock;
@@ -151,26 +122,6 @@ void InitializeKernels(platform_t& platform, const Type type, const Op op) {
   }
 }
 
-size_t Sizeof(const Type type) {
-  switch(type) {
-    case  Float: return sizeof(float);
-    case Double: return sizeof(double);
-    case  Int32: return sizeof(int32_t);
-    case  Int64: return sizeof(int64_t);
-  }
-  return 0;
-}
-
-MPI_Datatype MPI_Type(const Type type) {
-  switch(type) {
-    case  Float: return MPI_FLOAT;
-    case Double: return MPI_DOUBLE;
-    case  Int32: return MPI_INT32_T;
-    case  Int64: return MPI_INT64_T;
-  }
-  return 0;
-}
-
 } //namespace ogs
 
 } //namespace libp

From 7f1726743aa9c338aa2638c1752f80db834c1cec Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:11:47 -0600
Subject: [PATCH 06/12] Latest memory objects from libp

---
 include/memory.hpp | 822 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 626 insertions(+), 196 deletions(-)

diff --git a/include/memory.hpp b/include/memory.hpp
index d058e79..5c038f8 100644
--- a/include/memory.hpp
+++ b/include/memory.hpp
@@ -27,238 +27,668 @@ SOFTWARE.
 #ifndef LIBP_MEMORY_HPP
 #define LIBP_MEMORY_HPP
 
-#include <cstddef>
-#include <memory>
-#include <algorithm>
-#include <iostream>
-#include <typeinfo>
-#include <occa.h>
+#include "utils.hpp"
 
 namespace libp {
 
-  template<typename T>
-  class memory {
-
-   private:
-    using size_t = std::size_t;
-    using ptrdiff_t = std::ptrdiff_t;
-
-    std::shared_ptr<T[]> shrdPtr;
-    size_t lngth;
-    size_t offset;
-
-   public:
-    memory() :
-      lngth{0},
-      offset{0} {}
-
-    memory(const size_t lngth_) :
-      shrdPtr(new T[lngth_]),
-      lngth{lngth_},
-      offset{0} {}
-
-    memory(const size_t lngth_,
-           const T val) :
-      shrdPtr(new T[lngth_]),
-      lngth{lngth_},
-      offset{0} {
-      #pragma omp parallel for
-      for (size_t i=0;i<lngth;++i) {
-        shrdPtr[i] = val;
-      }
+template<typename T>
+class memory {
+  template <typename U> friend class memory;
+
+ private:
+  using size_t = std::size_t;
+  using ptrdiff_t = std::ptrdiff_t;
+
+  std::shared_ptr<T[]> shrdPtr;
+  size_t lngth;
+  size_t offset;
+
+ public:
+  memory() :
+    lngth{0},
+    offset{0} {}
+
+  memory(const size_t lngth_) :
+    shrdPtr(new T[lngth_]),
+    lngth{lngth_},
+    offset{0} {}
+
+  memory(const size_t lngth_,
+         const T val) :
+    shrdPtr(new T[lngth_]),
+    lngth{lngth_},
+    offset{0} {
+    #pragma omp parallel for
+    for (size_t i=0;i<lngth;++i) {
+      shrdPtr[i] = val;
     }
+  }
 
+  /*Conversion constructor*/
+  template<typename U>
+  memory(const memory<U> &m):
+    shrdPtr{std::reinterpret_pointer_cast<T[]>(m.shrdPtr)},
+    lngth{m.lngth*sizeof(T)/sizeof(U)},
+    offset{m.offset*sizeof(T)/sizeof(U)} {
+    // Check that this conversion made sense
+    LIBP_ABORT("libp::memory type conversion failed. Trying to convert "
+                << m.lngth << " " << sizeof(T) << "-byte words to "
+                << lngth << " " << sizeof(U) << "-byte words.",
+                lngth*sizeof(U) != m.lngth*sizeof(T));
+
+    LIBP_ABORT("libp::memory type conversion failed. Source memory has offset at "
+               << m.lngth << " " << sizeof(T) << "-byte words, destination memory would have offset at"
+               << lngth << " " << sizeof(U) << "-byte words.",
+               offset*sizeof(U) != m.offset*sizeof(T));
+  }
 
-    memory(const memory<T> &m)=default;
-    memory& operator = (const memory<T> &m)=default;
-    ~memory()=default;
+  memory(const memory<T> &m)=default;
+  memory& operator = (const memory<T> &m)=default;
+  ~memory()=default;
 
-    void malloc(const size_t lngth_) {
-      *this = libp::memory<T>(lngth_);
-    }
+  void malloc(const size_t lngth_) {
+    *this = memory<T>(lngth_);
+  }
 
-    void calloc(const size_t lngth_) {
-      *this = libp::memory<T>(lngth_, T{0});
-    }
+  void calloc(const size_t lngth_) {
+    *this = memory<T>(lngth_, T{0});
+  }
 
-    void realloc(const size_t lngth_) {
-      libp::memory<T> m(lngth_);
-      const ptrdiff_t cnt = std::min(lngth, lngth_);
-      m.copyFrom(*this, cnt);
-      *this = m;
-    }
+  void realloc(const size_t lngth_) {
+    memory<T> m(lngth_);
+    const ptrdiff_t cnt = std::min(lngth, lngth_);
+    m.copyFrom(*this, cnt);
+    *this = m;
+  }
 
-    memory& swap(memory<T> &m) {
-      std::swap(shrdPtr, m.shrdPtr);
-      std::swap(lngth, m.lngth);
-      std::swap(offset, m.offset);
-      return *this;
-    }
+  memory& swap(memory<T> &m) {
+    std::swap(shrdPtr, m.shrdPtr);
+    std::swap(lngth, m.lngth);
+    std::swap(offset, m.offset);
+    return *this;
+  }
 
-    T* ptr() {
-      return shrdPtr.get()+offset;
-    }
-    const T* ptr() const {
-      return shrdPtr.get()+offset;
-    }
+  T* ptr() {
+    return shrdPtr.get()+offset;
+  }
+  const T* ptr() const {
+    return shrdPtr.get()+offset;
+  }
 
-    size_t length() const {
-      return lngth;
-    }
+  T* begin() {return ptr();}
+  T* end() {return ptr() + length();}
 
-    size_t size() const {
-      return lngth*sizeof(T);
-    }
+  size_t length() const {
+    return lngth;
+  }
 
-    size_t use_count() const {
-      return shrdPtr.use_count();
-    }
+  size_t size() const {
+    return lngth*sizeof(T);
+  }
 
-    T& operator[](const ptrdiff_t idx) const {
-      return shrdPtr[idx+offset];
-    }
+  size_t use_count() const {
+    return shrdPtr.use_count();
+  }
 
-    bool operator == (const libp::memory<T> &other) const {
-      return (shrdPtr==other.shrdPtr && offset==other.offset);
-    }
-    bool operator != (const libp::memory<T> &other) const {
-      return (shrdPtr!=other.shrdPtr || offset!=other.offset);
-    }
+  T& operator[](const ptrdiff_t idx) const {
+    return shrdPtr[idx+offset];
+  }
 
-    libp::memory<T> operator + (const ptrdiff_t offset_) const {
-      return slice(offset_);
-    }
-    libp::memory<T>& operator += (const ptrdiff_t offset_) {
-      *this = slice(offset_);
-      return *this;
-    }
+  bool operator == (const memory<T> &other) const {
+    return (shrdPtr==other.shrdPtr && offset==other.offset);
+  }
+  bool operator != (const memory<T> &other) const {
+    return (shrdPtr!=other.shrdPtr || offset!=other.offset);
+  }
 
-    libp::memory<T> slice(const ptrdiff_t offset_,
-                          const ptrdiff_t count = -1) const {
-      libp::memory<T> m(*this);
-      m.offset = offset + offset_;
-      m.lngth = (count==-1)
-                  ? (lngth - offset_)
-                  : count;
-      return m;
-    }
+  memory<T> operator + (const ptrdiff_t offset_) const {
+    return slice(offset_);
+  }
+  memory<T>& operator += (const ptrdiff_t offset_) {
+    *this = slice(offset_);
+    return *this;
+  }
 
-    /*Copy from raw ptr*/
-    void copyFrom(const T* src,
-                  const ptrdiff_t count = -1,
-                  const ptrdiff_t offset_ = 0) {
+  memory<T> slice(const ptrdiff_t offset_,
+                  const ptrdiff_t count = -1) const {
+    memory<T> m(*this);
+    m.offset = offset + offset_;
+    m.lngth = (count==-1)
+                ? (lngth - offset_)
+                : count;
+    return m;
+  }
 
-      const ptrdiff_t cnt = (count==-1) ? lngth : count;
-      std::copy(src,
-                src+cnt,
-                ptr()+offset_);
-    }
+  /*Copy from raw ptr*/
+  void copyFrom(const T* src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset_ = 0) {
+
+    const ptrdiff_t cnt = (count==-1) ? lngth : count;
+
+    LIBP_ABORT("libp::memory::copyFrom Cannot have negative count ("
+               << cnt << ")",
+               cnt < 0);
+    LIBP_ABORT("libp::memory::copyFrom Cannot have negative offset ("
+               << offset_ << ")",
+               offset_ < 0);
+    LIBP_ABORT("libp::memory::copyFrom Destination memory has size [" << lngth << "],"
+               << " trying to access [" << offset_ << ", " << offset_+static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt)+offset_ > lngth);
+
+    std::copy(src,
+              src+cnt,
+              ptr()+offset_);
+  }
 
-    /*Copy from libp::memory*/
-    void copyFrom(const libp::memory<T> &src,
-                  const ptrdiff_t count = -1,
-                  const ptrdiff_t destOffset = 0,
-                  const ptrdiff_t srcOffset = 0) {
-      const ptrdiff_t cnt = (count==-1) ? lngth : count;
-      std::copy(src.ptr()+srcOffset,
-                src.ptr()+srcOffset+cnt,
-                ptr()+destOffset);
-    }
+  /*Copy from memory*/
+  void copyFrom(const memory<T> src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset_ = 0) {
+    const ptrdiff_t cnt = (count==-1) ? lngth : count;
+
+    LIBP_ABORT("libp::memory::copyFrom Cannot have negative count ("
+               << cnt << ")",
+               cnt < 0);
+    LIBP_ABORT("libp::memory::copyFrom Cannot have negative offset ("
+               << offset_ << ")",
+               offset_ < 0);
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > src.length());
+    LIBP_ABORT("libp::memory::copyFrom Destination memory has size [" << lngth << "],"
+               << " trying to access [" << offset_ << ", " << offset_+static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt)+offset_ > lngth);
+
+    std::copy(src.ptr(),
+              src.ptr()+cnt,
+              ptr()+offset_);
+  }
 
-    /*Copy from occa::memory*/
-    void copyFrom(const occa::memory &src,
-                  const ptrdiff_t count = -1,
-                  const ptrdiff_t destOffset = 0,
-                  const ptrdiff_t srcOffset = 0,
-                  const occa::json &props = occa::json()) {
-      const ptrdiff_t cnt = (count==-1) ? lngth : count;
-      src.copyTo(ptr()+destOffset,
-                 cnt*sizeof(T),
-                 srcOffset,
-                 props);
-    }
+  /*Copy to raw pointer*/
+  void copyTo(T *dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset_ = 0) const {
+    const ptrdiff_t cnt = (count==-1) ? lngth : count;
+
+    LIBP_ABORT("libp::memory::copyTo Cannot have negative count ("
+               << cnt << ")",
+               cnt < 0);
+    LIBP_ABORT("libp::memory::copyTo Cannot have negative offset ("
+               << offset_ << ")",
+               offset_ < 0);
+    LIBP_ABORT("libp::memory::copyTo Source memory has size [" << lngth << "],"
+               << " trying to access [" << offset_ << ", " << offset_+static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt)+offset_ > lngth);
+
+    std::copy(ptr()+offset_,
+              ptr()+offset_+cnt,
+              dest);
+  }
 
-    void copyFrom(const occa::memory &src,
-                  const occa::json &props) {
-      src.copyTo(ptr(),
-                 lngth*sizeof(T),
-                 0,
-                 props);
-    }
+  /*Copy to memory*/
+  void copyTo(memory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset_ = 0) const {
+    const ptrdiff_t cnt = (count==-1) ? lngth : count;
+
+    LIBP_ABORT("libp::memory::copyTo Cannot have negative count ("
+               << cnt << ")",
+               cnt < 0);
+    LIBP_ABORT("libp::memory::copyTo Cannot have negative offset ("
+               << offset_ << ")",
+               offset_ < 0);
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << cnt << "]",
+               static_cast<size_t>(cnt) > dest.length());
+    LIBP_ABORT("libp::memory::copyTo Source memory has size [" << lngth << "],"
+               << " trying to access [" << offset_ << ", " << offset_+static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt)+offset_ > lngth);
+
+    std::copy(ptr()+offset_,
+              ptr()+offset_+cnt,
+              dest.ptr());
+  }
 
-    /*Copy to raw pointer*/
-    void copyTo(T *dest,
-                const ptrdiff_t count = -1,
-                const ptrdiff_t offset_ = 0) const {
-      const ptrdiff_t cnt = (count==-1) ? lngth : count;
-      std::copy(ptr()+offset_,
-                ptr()+offset_+cnt,
-                dest);
-    }
+  memory<T> clone() const {
+    memory<T> m(lngth);
+    m.copyFrom(*this);
+    return m;
+  }
+
+  void free() {
+    shrdPtr = nullptr;
+    lngth=0;
+    offset=0;
+  }
+};
+
+template <typename T>
+std::ostream& operator << (std::ostream &out,
+                         const memory<T> &memory) {
+  out << "memory - "
+      << "type: " << typeid(T).name() << ", "
+      << "ptr : " << memory.ptr() << ", "
+      << "length : " << memory.length() << ", "
+      << "use_count : " << memory.use_count();
+  return out;
+}
+
+/*Extern declare common instantiations for faster compilation*/
+extern template class memory<int>;
+extern template class memory<long long int>;
+extern template class memory<float>;
+extern template class memory<double>;
+
+/*libp::deviceMemory is a wrapper around occa::memory*/
+template<typename T>
+class deviceMemory: public occa::memory {
+ public:
+  deviceMemory() = default;
+  deviceMemory(const deviceMemory<T> &m)=default;
+  deviceMemory(occa::memory m):
+    occa::memory(m)
+  {
+    if (isInitialized())
+      occa::memory::setDtype(occa::dtype::get<T>());
+  }
+
+  /*Conversion constructor*/
+  template<typename U>
+  deviceMemory(const deviceMemory<U> &m):
+    occa::memory(m)
+  {
+    if (isInitialized())
+      occa::memory::setDtype(occa::dtype::get<T>());
+  }
+
+  deviceMemory<T>& operator = (const deviceMemory<T> &m)=default;
+  ~deviceMemory()=default;
+
+  T* ptr() {
+    return static_cast<T*>(occa::memory::ptr());
+  }
+  const T* ptr() const {
+    return static_cast<const T*>(occa::memory::ptr());
+  }
+
+  T& operator[](const ptrdiff_t idx) {
+    return ptr()[idx];
+  }
+
+  deviceMemory<T> operator + (const ptrdiff_t offset) const {
+    if (isInitialized())
+      return deviceMemory<T>(occa::memory::operator+(offset));
+    else
+      return deviceMemory<T>();
+  }
+
+  deviceMemory<T>& operator += (const ptrdiff_t offset) {
+    *this = deviceMemory<T>(occa::memory::slice(offset));
+    return *this;
+  }
 
-    /*Copy to libp::memory*/
-    void copyTo(libp::memory<T> &dest,
+  /*Copy from libp::memory*/
+  void copyFrom(const libp::memory<T> src,
                 const ptrdiff_t count = -1,
-                const ptrdiff_t destOffset = 0,
-                const ptrdiff_t srcOffset = 0) const {
-      const ptrdiff_t cnt = (count==-1) ? lngth : count;
-      std::copy(ptr()+srcOffset,
-                ptr()+srcOffset+cnt,
-                dest.ptr()+destOffset);
-    }
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > src.length());
+
+    occa::memory::copyFrom(src.ptr(),
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           props);
+  }
+
+  void copyFrom(const libp::memory<T> src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << length() << "]",
+               length() > src.length());
+
+    occa::memory::copyFrom(src.ptr(),
+                           length()*sizeof(T),
+                           0,
+                           props);
+  }
 
-    /*Copy to occa::memory*/
-    void copyTo(occa::memory &dest,
+  /*Copy from libp::deviceMemory*/
+  void copyFrom(const deviceMemory<T> src,
                 const ptrdiff_t count = -1,
-                const ptrdiff_t destOffset = 0,
-                const ptrdiff_t srcOffset = 0,
-                const occa::json &props = occa::json()) const {
-      const ptrdiff_t cnt = (count==-1) ? lngth : count;
-      dest.copyFrom(ptr()+srcOffset,
-                 cnt*sizeof(T),
-                 destOffset,
-                 props);
-    }
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
 
-    void copyTo(occa::memory &dest,
-                const occa::json &props) const {
-      dest.copyFrom(ptr(),
-                 lngth*sizeof(T),
-                 0,
-                 props);
-    }
+    if (cnt==0) return;
 
+    occa::memory::copyFrom(src,
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           0,
+                           props);
+  }
 
-    libp::memory<T> clone() const {
-      libp::memory<T> m(lngth);
-      m.copyFrom(*this);
-      return m;
-    }
+  void copyFrom(const deviceMemory<T> src,
+                const properties_t &props) {
 
+    if (length()==0) return;
 
-    void free() {
-      shrdPtr = nullptr;
-      lngth=0;
-      offset=0;
-    }
-  };
+    occa::memory::copyFrom(src,
+                           length()*sizeof(T),
+                           0,
+                           0,
+                           props);
+  }
+
+  /*Copy to libp::memory*/
+  void copyTo(libp::memory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > dest.length());
 
-  template <typename T>
-  std::ostream& operator << (std::ostream &out,
-                           const libp::memory<T> &memory) {
-    out << "libp::memory - "
-        << "type: " << typeid(T).name() << ", "
-        << "ptr : " << memory.ptr() << ", "
-        << "length : " << memory.length() << ", "
-        << "use_count : " << memory.use_count();
-    return out;
+    occa::memory::copyTo(dest.ptr(),
+                         cnt*sizeof(T),
+                         offset*sizeof(T),
+                         props);
   }
-}
+
+  void copyTo(libp::memory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << length() << "]",
+               length() > dest.length());
+
+    occa::memory::copyTo(dest.ptr(),
+                         length()*sizeof(T),
+                         0,
+                         props);
+  }
+
+  /*Copy to libp::deviceMemory*/
+  void copyTo(deviceMemory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyTo(dest,
+                         cnt*sizeof(T),
+                         0,
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(deviceMemory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    occa::memory::copyTo(dest,
+                         length()*sizeof(T),
+                         0,
+                         0,
+                         props);
+  }
+};
 
 /*Extern declare common instantiations for faster compilation*/
-extern template class libp::memory<int>;
-extern template class libp::memory<long long int>;
-extern template class libp::memory<float>;
-extern template class libp::memory<double>;
+extern template class deviceMemory<int>;
+extern template class deviceMemory<long long int>;
+extern template class deviceMemory<float>;
+extern template class deviceMemory<double>;
+
+/*libp::pinnedMemory is another wrapper around occa::memory,
+  but is allocated slightly differently*/
+template<typename T>
+class pinnedMemory: public occa::memory {
+ public:
+  pinnedMemory() = default;
+  pinnedMemory(const pinnedMemory<T> &m)=default;
+  pinnedMemory(occa::memory m):
+    occa::memory(m)
+  {
+    if (isInitialized())
+      occa::memory::setDtype(occa::dtype::get<T>());
+  };
+
+  /*Conversion constructor*/
+  template<typename U>
+  pinnedMemory(const pinnedMemory<U> &m):
+    occa::memory(m)
+  {
+    if (isInitialized())
+      occa::memory::setDtype(occa::dtype::get<T>());
+  }
+
+  pinnedMemory<T>& operator = (const pinnedMemory<T> &m)=default;
+  ~pinnedMemory()=default;
+
+  T* ptr() {
+    return static_cast<T*>(occa::memory::ptr());
+  }
+  const T* ptr() const {
+    return static_cast<const T*>(occa::memory::ptr());
+  }
+
+  T& operator[](const ptrdiff_t idx) {
+    return ptr()[idx];
+  }
+
+  pinnedMemory<T> operator + (const ptrdiff_t offset) const {
+    if (isInitialized())
+      return pinnedMemory<T>(occa::memory::operator+(offset));
+    else
+      return pinnedMemory<T>();
+  }
+
+  pinnedMemory<T>& operator += (const ptrdiff_t offset) {
+    *this = pinnedMemory<T>(occa::memory::slice(offset));
+    return *this;
+  }
+
+  /*Copy from libp::memory*/
+  void copyFrom(const libp::memory<T> src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > src.length());
+
+    occa::memory::copyFrom(src.ptr(),
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           props);
+  }
+
+  void copyFrom(const libp::memory<T> src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    LIBP_ABORT("libp::memory::copyFrom Source memory has size [" << src.length() << "],"
+               << " trying to access [0, " << length() << "]",
+               length() > src.length());
+
+    occa::memory::copyFrom(src.ptr(),
+                           length()*sizeof(T),
+                           0,
+                           props);
+  }
+
+  /*Copy from libp::deviceMemory*/
+  void copyFrom(const deviceMemory<T> src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyFrom(src,
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           0,
+                           props);
+  }
+
+  void copyFrom(const deviceMemory<T> src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    occa::memory::copyFrom(src,
+                           length()*sizeof(T),
+                           0,
+                           0,
+                           props);
+  }
+
+  /*Copy from libp::pinnedMemory*/
+  void copyFrom(const pinnedMemory<T> src,
+                const ptrdiff_t count = -1,
+                const ptrdiff_t offset = 0,
+                const properties_t &props = properties_t()) {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyFrom(src,
+                           cnt*sizeof(T),
+                           offset*sizeof(T),
+                           0,
+                           props);
+  }
+
+  void copyFrom(const pinnedMemory<T> src,
+                const properties_t &props) {
+
+    if (length()==0) return;
+
+    occa::memory::copyFrom(src,
+                           length()*sizeof(T),
+                           0,
+                           0,
+                           props);
+  }
+
+  /*Copy to libp::memory*/
+  void copyTo(libp::memory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << static_cast<size_t>(cnt) << "]",
+               static_cast<size_t>(cnt) > dest.length());
+
+    occa::memory::copyTo(dest.ptr(),
+                         cnt*sizeof(T),
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(libp::memory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    LIBP_ABORT("libp::memory::copyTo Destination memory has size [" << dest.length() << "],"
+               << " trying to access [0, " << length() << "]",
+               length() > dest.length());
+
+    occa::memory::copyTo(dest.ptr(),
+                         length()*sizeof(T),
+                         0,
+                         props);
+  }
+
+  /*Copy to libp::deviceMemory*/
+  void copyTo(deviceMemory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyTo(dest,
+                         cnt*sizeof(T),
+                         0,
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(deviceMemory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    occa::memory::copyTo(dest,
+                         length()*sizeof(T),
+                         0,
+                         0,
+                         props);
+  }
+
+  /*Copy to libp::pinnedMemory*/
+  void copyTo(pinnedMemory<T> dest,
+              const ptrdiff_t count = -1,
+              const ptrdiff_t offset = 0,
+              const properties_t &props = properties_t()) const {
+    const ptrdiff_t cnt = (count==-1) ? length() : count;
+
+    if (cnt==0) return;
+
+    occa::memory::copyTo(dest,
+                         cnt*sizeof(T),
+                         0,
+                         offset*sizeof(T),
+                         props);
+  }
+
+  void copyTo(pinnedMemory<T> dest,
+              const properties_t &props) const {
+
+    if (length()==0) return;
+
+    occa::memory::copyTo(dest,
+                         length()*sizeof(T),
+                         0,
+                         0,
+                         props);
+  }
+};
+
+} //namespace libp
 
 #endif

From afad7b9eb156d009bee9ec42d7a8ab668b4992be Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:13:28 -0600
Subject: [PATCH 07/12] Update linalg libs

---
 include/core.hpp               | 19 +-----------
 include/linAlg.hpp             | 53 ++++++++++++++++++++--------------
 include/linearSolver.hpp       | 34 ++++++++++++----------
 libs/core/linAlg.cpp           | 36 +++++++++++++----------
 libs/core/linAlgSetup.cpp      | 24 ++++-----------
 libs/core/linearSolverCG.cpp   | 45 ++++++++++++++---------------
 libs/core/matrixEig.cpp        | 38 +++++++++---------------
 libs/core/matrixInverse.cpp    | 34 +++++++---------------
 libs/core/matrixRightSolve.cpp | 20 +++++--------
 9 files changed, 130 insertions(+), 173 deletions(-)

diff --git a/include/core.hpp b/include/core.hpp
index 4eec1e5..cd02493 100644
--- a/include/core.hpp
+++ b/include/core.hpp
@@ -27,29 +27,12 @@ SOFTWARE.
 #ifndef CORE_HPP
 #define CORE_HPP
 
-#include <mpi.h>
-#include <occa.h>
-#include <string>
-#include <cstring>
-#include <algorithm>
-#include <cmath>
 #include "utils.hpp"
 #include "memory.hpp"
+#include "comm.hpp"
 
 namespace libp {
 
-void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB, double *B, double *C);
-void matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB, float *B, float *C);
-
-void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI);
-void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI);
-
-void matrixEigenValues(int N, double *A, double *WR, double *WI);
-void matrixEigenValues(int N, float *A, float *WR, float *WI);
-
-void matrixInverse(int N, double *A);
-void matrixInverse(int N, float *A);
-
 void RankDecomp(int  size_x, int  size_y, int  size_z,
                 int &rank_x, int &rank_y, int &rank_z,
                 const int rank);
diff --git a/include/linAlg.hpp b/include/linAlg.hpp
index 0e0fed2..b970226 100644
--- a/include/linAlg.hpp
+++ b/include/linAlg.hpp
@@ -28,7 +28,7 @@ SOFTWARE.
 #define LINALG_HPP
 
 #include "core.hpp"
-
+#include "memory.hpp"
 
 namespace libp {
 
@@ -39,46 +39,57 @@ class linAlg_t {
 
 public:
   platform_t *platform;
-  occa::properties kernelInfo;
+  properties_t kernelInfo;
 
   int blocksize;
 
   //scratch space for reductions
-  dfloat *scratch;
-  occa::memory h_scratch;
-  occa::memory o_scratch;
+  deviceMemory<dfloat> o_scratch;
+  pinnedMemory<dfloat> h_scratch;
 
   linAlg_t(platform_t *_platform);
 
   //initialize list of kernels
-  void InitKernels(std::vector<std::string> kernels, MPI_Comm comm);
-
-  ~linAlg_t();
+  void InitKernels(std::vector<std::string> kernels);
 
   /*********************/
   /* vector operations */
   /*********************/
 
   // o_x[n] = alpha
-  void set(const dlong N, const dfloat alpha, occa::memory& o_x);
+  void set(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_x);
 
   // o_y[n] = beta*o_y[n] + alpha*o_x[n]
-  void axpy(const dlong N, const dfloat alpha, occa::memory& o_x,
-                           const dfloat beta,  occa::memory& o_y);
+  void axpy(const dlong N, const dfloat alpha, deviceMemory<dfloat> o_x,
+                           const dfloat beta,  deviceMemory<dfloat> o_y);
 
   // ||o_a||_2
-  dfloat norm2(const dlong N, occa::memory& o_a, MPI_Comm comm);
+  dfloat norm2(const dlong N, deviceMemory<dfloat> o_a, comm_t comm);
 
   // o_x.o_y
-  dfloat innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y,
-                    MPI_Comm comm);
-
-  occa::kernel setKernel;
-  occa::kernel axpyKernel;
-  occa::kernel norm2Kernel1;
-  occa::kernel norm2Kernel2;
-  occa::kernel innerProdKernel1;
-  occa::kernel innerProdKernel2;
+  dfloat innerProd(const dlong N,
+                   deviceMemory<dfloat> o_x,
+                   deviceMemory<dfloat> o_y,
+                   comm_t comm);
+
+  kernel_t setKernel;
+  kernel_t axpyKernel;
+  kernel_t norm2Kernel1;
+  kernel_t norm2Kernel2;
+  kernel_t innerProdKernel1;
+  kernel_t innerProdKernel2;
+
+  static void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB, double *B, double *C);
+  static void matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB, float *B, float *C);
+
+  static void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI);
+  static void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI);
+
+  static void matrixEigenValues(int N, double *A, double *WR, double *WI);
+  static void matrixEigenValues(int N, float *A, float *WR, float *WI);
+
+  static void matrixInverse(int N, double *A);
+  static void matrixInverse(int N, float *A);
 };
 
 } //namespace libp
diff --git a/include/linearSolver.hpp b/include/linearSolver.hpp
index a25bdf6..81f0172 100644
--- a/include/linearSolver.hpp
+++ b/include/linearSolver.hpp
@@ -37,7 +37,7 @@ namespace libp {
 class linearSolver_t {
 public:
   platform_t platform;
-  MPI_Comm comm;
+  comm_t comm;
 
   dlong N;
   dlong Nhalo;
@@ -47,33 +47,37 @@ class linearSolver_t {
     N(_N), Nhalo(_Nhalo) {}
 
   virtual int Solve(solver_t& solver,
-                    occa::memory& o_x, occa::memory& o_rhs,
-                    const dfloat tol, const int MAXIT, const int verbose)=0;
-
-  virtual ~linearSolver_t(){}
+                    deviceMemory<dfloat> o_x,
+                    deviceMemory<dfloat> o_rhs,
+                    const dfloat tol,
+                    const int MAXIT,
+                    const int verbose)=0;
 };
 
 //Conjugate Gradient
 class cg: public linearSolver_t {
 private:
-  occa::memory o_p, o_Ap;
+  deviceMemory<dfloat> o_p, o_Ap;
 
-  dfloat* tmprdotr;
-  occa::memory h_tmprdotr;
-  occa::memory o_tmprdotr;
+  deviceMemory<dfloat> o_tmprdotr;
+  pinnedMemory<dfloat> h_tmprdotr;
 
-  occa::kernel updateCGKernel1;
-  occa::kernel updateCGKernel2;
+  kernel_t updateCGKernel1;
+  kernel_t updateCGKernel2;
 
-  dfloat UpdateCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r);
+  dfloat UpdateCG(const dfloat alpha,
+                  deviceMemory<dfloat> o_x,
+                  deviceMemory<dfloat> o_r);
 
 public:
   cg(platform_t& _platform, dlong _N, dlong _Nhalo);
-  ~cg();
 
   int Solve(solver_t& solver,
-            occa::memory& o_x, occa::memory& o_rhs,
-            const dfloat tol, const int MAXIT, const int verbose);
+            deviceMemory<dfloat> o_x,
+            deviceMemory<dfloat> o_rhs,
+            const dfloat tol,
+            const int MAXIT,
+            const int verbose);
 };
 
 } //namespace libp
diff --git a/libs/core/linAlg.cpp b/libs/core/linAlg.cpp
index 84ef116..fad2ff6 100644
--- a/libs/core/linAlg.cpp
+++ b/libs/core/linAlg.cpp
@@ -35,37 +35,43 @@ namespace libp {
 /*********************/
 
 // o_x[n] = alpha
-void linAlg_t::set(const dlong N, const dfloat alpha, occa::memory& o_x) {
+void linAlg_t::set(const dlong N, const dfloat alpha,
+                   deviceMemory<dfloat> o_x) {
   setKernel(N, alpha, o_x);
 }
 
 // o_y[n] = beta*o_y[n] + alpha*o_x[n]
-void linAlg_t::axpy(const dlong N, const dfloat alpha, occa::memory& o_x,
-                    const dfloat beta,  occa::memory& o_y) {
+void linAlg_t::axpy(const dlong N,
+                    const dfloat alpha,
+                    deviceMemory<dfloat> o_x,
+                    const dfloat beta,
+                    deviceMemory<dfloat> o_y) {
   axpyKernel(N, alpha, o_x, beta, o_y);
 }
 
 // ||o_a||_2
-dfloat linAlg_t::norm2(const dlong N, occa::memory& o_a, MPI_Comm comm) {
-  //TODO, maybe complete reduction on device with second kernel?
+dfloat linAlg_t::norm2(const dlong N,
+                       deviceMemory<dfloat> o_a, comm_t comm) {
   int Nblock = (N+blocksize-1)/blocksize;
   Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
 
   norm2Kernel1(Nblock, N, o_a, o_scratch);
   norm2Kernel2(Nblock, o_scratch);
 
-  o_scratch.copyTo(scratch, 1*sizeof(dfloat), 0, "async: true");
-  platform->device.finish();
+  h_scratch.copyFrom(o_scratch, 1, 0, "async: true");
+  platform->finish();
 
-  dfloat norm = 0;
-  MPI_Allreduce(scratch, &norm, 1, MPI_DFLOAT, MPI_SUM, comm);
+  dfloat norm = h_scratch[0];
+  comm.Allreduce(norm);
 
   return sqrt(norm);
 }
 
 // o_x.o_y
-dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y,
-                           MPI_Comm comm) {
+dfloat linAlg_t::innerProd(const dlong N,
+                           deviceMemory<dfloat> o_x,
+                           deviceMemory<dfloat> o_y,
+                           comm_t comm) {
 
   int Nblock = (N+blocksize-1)/blocksize;
   Nblock = (Nblock>blocksize) ? blocksize : Nblock; //limit to blocksize entries
@@ -73,11 +79,11 @@ dfloat linAlg_t::innerProd(const dlong N, occa::memory& o_x, occa::memory& o_y,
   innerProdKernel1(Nblock, N, o_x, o_y, o_scratch);
   innerProdKernel2(Nblock, o_scratch);
 
-  o_scratch.copyTo(scratch, 1*sizeof(dfloat), 0, "async: true");
-  platform->device.finish();
+  h_scratch.copyFrom(o_scratch, 1, 0, "async: true");
+  platform->finish();
 
-  dfloat dot = 0;
-  MPI_Allreduce(scratch, &dot, 1, MPI_DFLOAT, MPI_SUM, comm);
+  dfloat dot = h_scratch[0];
+  comm.Allreduce(dot);
 
   return dot;
 }
diff --git a/libs/core/linAlgSetup.cpp b/libs/core/linAlgSetup.cpp
index ed96108..fbfabff 100644
--- a/libs/core/linAlgSetup.cpp
+++ b/libs/core/linAlgSetup.cpp
@@ -31,7 +31,6 @@ SOFTWARE.
 namespace libp {
 
 constexpr int LINALG_BLOCKSIZE = 256;
-constexpr int AXPY_BLOCKSIZE = 1024;
 
 using std::string;
 using std::stringstream;
@@ -45,13 +44,12 @@ linAlg_t::linAlg_t(platform_t *_platform): blocksize(LINALG_BLOCKSIZE) {
   kernelInfo["defines/" "p_blockSize"] = (int)LINALG_BLOCKSIZE;
 
   //pinned scratch buffer
-  scratch = (dfloat*) platform->hostMalloc(LINALG_BLOCKSIZE*sizeof(dfloat),
-                                           NULL, h_scratch);
-  o_scratch = platform->malloc(LINALG_BLOCKSIZE*sizeof(dfloat));
+  h_scratch = platform->hostMalloc<dfloat>(LINALG_BLOCKSIZE);
+  o_scratch = platform->malloc<dfloat>(LINALG_BLOCKSIZE);
 }
 
 //initialize list of kernels
-void linAlg_t::InitKernels(std::vector<string> kernels, MPI_Comm comm) {
+void linAlg_t::InitKernels(std::vector<string> kernels) {
 
   for (size_t i=0;i<kernels.size();i++) {
     string name = kernels[i];
@@ -62,12 +60,11 @@ void linAlg_t::InitKernels(std::vector<string> kernels, MPI_Comm comm) {
                                         "set",
                                         kernelInfo);
     } else if (name=="axpy") {
-      occa::properties axpyKernelInfo = kernelInfo;
       if (axpyKernel.isInitialized()==false)
         axpyKernel = platform->buildKernel(HIPBONE_DIR "/libs/core/okl/"
                                         "linAlgAXPY.okl",
                                         "axpy",
-                                        axpyKernelInfo);
+                                        kernelInfo);
     } else if (name=="norm2") {
       if (norm2Kernel1.isInitialized()==false)
         norm2Kernel1 = platform->buildKernel(HIPBONE_DIR "/libs/core/okl/"
@@ -93,20 +90,9 @@ void linAlg_t::InitKernels(std::vector<string> kernels, MPI_Comm comm) {
                                         "innerProd_2",
                                         kernelInfo);
     } else {
-      stringstream ss;
-      ss << "Requested linAlg routine \"" << name << "\" not found";
-      HIPBONE_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Requested linAlg routine \"" << name << "\" not found");
     }
   }
 }
 
-linAlg_t::~linAlg_t() {
-  setKernel.free();
-  axpyKernel.free();
-  norm2Kernel1.free();
-  norm2Kernel2.free();
-  innerProdKernel1.free();
-  innerProdKernel2.free();
-}
-
 } //namespace libp
diff --git a/libs/core/linearSolverCG.cpp b/libs/core/linearSolverCG.cpp
index e556651..6755883 100644
--- a/libs/core/linearSolverCG.cpp
+++ b/libs/core/linearSolverCG.cpp
@@ -37,18 +37,17 @@ cg::cg(platform_t& _platform, dlong _N, dlong _Nhalo):
   dlong Ntotal = N + Nhalo;
 
   /*aux variables */
-  dfloat *dummy = (dfloat *) calloc(Ntotal,sizeof(dfloat)); //need this to avoid uninitialized memory warnings
-  o_p  = platform.malloc(Ntotal*sizeof(dfloat),dummy);
-  o_Ap = platform.malloc(Ntotal*sizeof(dfloat),dummy);
-  free(dummy);
+  memory<dfloat> dummy(Ntotal,0.0); //need this to avoid uninitialized memory warnings
+  o_p  = platform.malloc<dfloat>(Ntotal,dummy);
+  o_Ap = platform.malloc<dfloat>(Ntotal,dummy);
+  dummy.free();
 
   //pinned tmp buffer for reductions
-  tmprdotr = (dfloat*) platform.hostMalloc(1*sizeof(dfloat),
-                                          NULL, h_tmprdotr);
-  o_tmprdotr = platform.malloc(CG_BLOCKSIZE*sizeof(dfloat));
+  h_tmprdotr = platform.hostMalloc<dfloat>(1);
+  o_tmprdotr = platform.malloc<dfloat>(CG_BLOCKSIZE);
 
   /* build kernels */
-  occa::properties kernelInfo = platform.props(); //copy base properties
+  properties_t kernelInfo = platform.props(); //copy base properties
 
   //add defines
   kernelInfo["defines/" "p_blockSize"] = (int)CG_BLOCKSIZE;
@@ -61,10 +60,13 @@ cg::cg(platform_t& _platform, dlong _N, dlong _Nhalo):
 }
 
 int cg::Solve(solver_t& solver,
-               occa::memory &o_x, occa::memory &o_r,
-               const dfloat tol, const int MAXIT, const int verbose) {
+               deviceMemory<dfloat> o_x,
+               deviceMemory<dfloat> o_r,
+               const dfloat tol,
+               const int MAXIT,
+               const int verbose) {
 
-  int rank = platform.rank;
+  int rank = platform.rank();
   linAlg_t &linAlg = platform.linAlg();
 
   // register scalars
@@ -82,7 +84,7 @@ int cg::Solve(solver_t& solver,
   rdotr = linAlg.norm2(N, o_r, comm);
   rdotr = rdotr*rdotr;
 
-  dfloat TOL = mymax(tol*tol*rdotr,tol*tol);
+  dfloat TOL = std::max(tol*tol*rdotr,tol*tol);
 
   if (verbose&&(rank==0))
     printf("CG: initial res norm %12.12f \n", sqrt(rdotr));
@@ -126,7 +128,9 @@ int cg::Solve(solver_t& solver,
   return iter;
 }
 
-dfloat cg::UpdateCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r){
+dfloat cg::UpdateCG(const dfloat alpha,
+                    deviceMemory<dfloat> o_x,
+                    deviceMemory<dfloat> o_r){
 
   linAlg_t &linAlg = platform.linAlg();
 
@@ -138,24 +142,17 @@ dfloat cg::UpdateCG(const dfloat alpha, occa::memory &o_x, occa::memory &o_r){
   updateCGKernel1(N, Nblocks, o_Ap, alpha, o_r, o_tmprdotr);
   updateCGKernel2(Nblocks, o_tmprdotr);
 
-  o_tmprdotr.copyTo(tmprdotr, 1*sizeof(dfloat), 0, "async: true");
-
-  platform.device.finish();
+  h_tmprdotr.copyFrom(o_tmprdotr, 1, 0, "async: true");
+  platform.finish();
 
   // x <= x + alpha*p
   linAlg.axpy(N, alpha, o_p, 1.f, o_x);
 
   /*Compute all reduce while axpy is running*/
-  dfloat rdotr1 = 0.0;
-  MPI_Allreduce(tmprdotr, &rdotr1, 1, MPI_DFLOAT, MPI_SUM, comm);
+  dfloat rdotr1 = h_tmprdotr[0];
+  comm.Allreduce(rdotr1);
 
   return rdotr1;
 }
 
-cg::~cg() {
-  updateCGKernel1.free();
-  updateCGKernel2.free();
-
-}
-
 } //namespace libp
diff --git a/libs/core/matrixEig.cpp b/libs/core/matrixEig.cpp
index 1219f3b..fdc084d 100644
--- a/libs/core/matrixEig.cpp
+++ b/libs/core/matrixEig.cpp
@@ -24,7 +24,7 @@ SOFTWARE.
 
 */
 
-#include "core.hpp"
+#include "linAlg.hpp"
 
 extern "C" {
   void sgeev_(char *JOBVL, char *JOBVR, int *N, float *A, int *LDA, float *WR, float *WI,
@@ -36,7 +36,7 @@ extern "C" {
 namespace libp {
 
 // compute right eigenvectors
-void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI){
+void linAlg_t::matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI){
 
   char JOBVL = 'N';
   char JOBVR = 'V';
@@ -62,11 +62,8 @@ void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI){
   dgeev_ (&JOBVL, &JOBVR, &N, tmpA, &LDA, WR, WI,
           VL, &LDVL, tmpVR, &LDVR, WORK, &LWORK, &INFO);
 
-  if(INFO) {
-    std::stringstream ss;
-    ss << "dgeev_ reports info = " << INFO;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("dgeev_ reports info = " << INFO,
+                INFO);
 
   for(int n=0;n<N;++n){
     for(int m=0;m<N;++m){
@@ -80,7 +77,7 @@ void matrixEigenVectors(int N, double *A, double *VR, double *WR, double *WI){
 }
 
 // compute right eigenvectors
-void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI){
+void linAlg_t::matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI){
 
   char JOBVL = 'N';
   char JOBVR = 'V';
@@ -106,11 +103,8 @@ void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI){
   sgeev_ (&JOBVL, &JOBVR, &N, tmpA, &LDA, WR, WI,
           VL, &LDVL, tmpVR, &LDVR, WORK, &LWORK, &INFO);
 
-  if(INFO) {
-    std::stringstream ss;
-    ss << "sgeev_ reports info = " << INFO;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("sgeev_ reports info = " << INFO,
+                INFO);
 
   for(int n=0;n<N;++n){
     for(int m=0;m<N;++m){
@@ -124,7 +118,7 @@ void matrixEigenVectors(int N, float *A, float *VR, float *WR, float *WI){
 }
 
 // compute eigenvalues
-void matrixEigenValues(int N, double *A, double *WR, double *WI){
+void linAlg_t::matrixEigenValues(int N, double *A, double *WR, double *WI){
 
   char JOBVL  = 'N';
   char JOBVR  = 'N';
@@ -142,17 +136,14 @@ void matrixEigenValues(int N, double *A, double *WR, double *WI){
   dgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI,
           VL, &LDVL, VR, &LDVR, WORK, &LWORK, &INFO);
 
-  if(INFO) {
-    std::stringstream ss;
-    ss << "dgeev_ reports info = " << INFO;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("dgeev_ reports info = " << INFO,
+                INFO);
 
   free(WORK);
 }
 
 // compute eigenvalues
-void matrixEigenValues(int N, float *A, float *WR, float *WI){
+void linAlg_t::matrixEigenValues(int N, float *A, float *WR, float *WI){
 
   char JOBVL  = 'N';
   char JOBVR  = 'N';
@@ -170,11 +161,8 @@ void matrixEigenValues(int N, float *A, float *WR, float *WI){
   sgeev_ (&JOBVL, &JOBVR, &N, A, &LDA, WR, WI,
           VL, &LDVL, VR, &LDVR, WORK, &LWORK, &INFO);
 
-  if(INFO) {
-    std::stringstream ss;
-    ss << "sgeev_ reports info = " << INFO;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("sgeev_ reports info = " << INFO,
+                INFO);
 
   free(WORK);
 }
diff --git a/libs/core/matrixInverse.cpp b/libs/core/matrixInverse.cpp
index b8a3cab..261d4e9 100644
--- a/libs/core/matrixInverse.cpp
+++ b/libs/core/matrixInverse.cpp
@@ -24,7 +24,7 @@ SOFTWARE.
 
 */
 
-#include "core.hpp"
+#include "linAlg.hpp"
 
 extern "C" {
   void dgetrf_(int* M, int *N, double* A, int* lda, int* IPIV, int* INFO);
@@ -36,7 +36,7 @@ extern "C" {
 
 namespace libp {
 
-void matrixInverse(int N, double *A){
+void linAlg_t::matrixInverse(int N, double *A){
   int lwork = N*N;
   int info;
 
@@ -46,25 +46,19 @@ void matrixInverse(int N, double *A){
 
   dgetrf_ (&N, &N, A, &N, ipiv, &info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "dgetrf_ reports info = " << info;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("dgetrf_ reports info = " << info,
+                info);
 
   dgetri_ (&N, A, &N, ipiv, work, &lwork, &info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "dgetri_ reports info = " << info;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("dgetri_ reports info = " << info,
+                info);
 
   free(work);
   free(ipiv);
 }
 
-void matrixInverse(int N, float *A){
+void linAlg_t::matrixInverse(int N, float *A){
   int lwork = N*N;
   int info;
 
@@ -74,19 +68,13 @@ void matrixInverse(int N, float *A){
 
   sgetrf_ (&N, &N, A, &N, ipiv, &info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "sgetrf_ reports info = " << info;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("sgetrf_ reports info = " << info,
+                info);
 
   sgetri_ (&N, A, &N, ipiv, work, &lwork, &info);
 
-  if(info) {
-    std::stringstream ss;
-    ss << "sgetri_ reports info = " << info;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("sgetri_ reports info = " << info,
+                info);
 
   free(work);
   free(ipiv);
diff --git a/libs/core/matrixRightSolve.cpp b/libs/core/matrixRightSolve.cpp
index f0e4705..01437ee 100644
--- a/libs/core/matrixRightSolve.cpp
+++ b/libs/core/matrixRightSolve.cpp
@@ -24,7 +24,7 @@ SOFTWARE.
 
 */
 
-#include "core.hpp"
+#include "linAlg.hpp"
 
 extern "C" {
   void dgesv_ ( int     *N, int     *NRHS, double  *A,
@@ -46,7 +46,7 @@ namespace libp {
 
 // C = A/B  = trans(trans(B)\trans(A))
 // assume row major
-void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB, double *B, double *C){
+void linAlg_t::matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB, double *B, double *C){
 
   int info;
 
@@ -75,11 +75,8 @@ void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB,
 
   dgesv_(&NrowsX, &NcolsY, tmpX, &NrowsX, ipiv, tmpY, &NrowsY, &info); // ?
 
-  if(info) {
-    std::stringstream ss;
-    ss << "dgesv_ reports info = " << info;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("dgesv_ reports info = " << info,
+                info);
 
   for(int n=0;n<NrowsY*NcolsY;++n){
     C[n] = tmpY[n];
@@ -93,7 +90,7 @@ void matrixRightSolve(int NrowsA, int NcolsA, double *A, int NrowsB, int NcolsB,
 
 // C = A/B  = trans(trans(B)\trans(A))
 // assume row major
-void matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB, float *B, float *C){
+void linAlg_t::matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB, float *B, float *C){
 
   int info;
 
@@ -122,11 +119,8 @@ void matrixRightSolve(int NrowsA, int NcolsA, float *A, int NrowsB, int NcolsB,
 
   sgesv_(&NrowsX, &NcolsY, tmpX, &NrowsX, ipiv, tmpY, &NrowsY, &info); // ?
 
-  if(info) {
-    std::stringstream ss;
-    ss << "sgesv_ reports info = " << info;
-    HIPBONE_ABORT(ss.str());
-  }
+  LIBP_ABORT("sgesv_ reports info = " << info,
+                info);
 
   for(int n=0;n<NrowsY*NcolsY;++n){
     C[n] = tmpY[n];

From 8230b91ad70f72d15a9f9df87312913adc843b8e Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:14:28 -0600
Subject: [PATCH 08/12] Update core libs from libp

---
 include/platform.hpp               | 106 ++++++++++++++++-------------
 include/settings.hpp               |   8 +--
 include/solver.hpp                 |   7 +-
 libs/core/memory.cpp               |  18 +++--
 libs/core/platformBuildKernel.cpp  |  19 +++---
 libs/core/platformDeviceConfig.cpp |  67 ++++++++----------
 libs/core/platformProperties.cpp   |   2 +-
 libs/core/platformSettings.cpp     |   4 +-
 libs/core/settings.cpp             |  53 +++++----------
 9 files changed, 136 insertions(+), 148 deletions(-)

diff --git a/include/platform.hpp b/include/platform.hpp
index 81f9f58..8b6bf90 100644
--- a/include/platform.hpp
+++ b/include/platform.hpp
@@ -34,6 +34,8 @@ SOFTWARE.
 #define HIPBONE_VERSION_STR   "1.0.0"
 
 #include "core.hpp"
+#include "memory.hpp"
+#include "comm.hpp"
 #include "settings.hpp"
 #include "linAlg.hpp"
 
@@ -47,7 +49,7 @@ namespace internal {
 class iplatform_t {
 public:
   settings_t settings;
-  occa::properties props;
+  properties_t props;
 
   iplatform_t(settings_t& _settings):
     settings(_settings) {
@@ -57,14 +59,13 @@ class iplatform_t {
 } //namespace internal
 
 class platform_t {
-public:
-  MPI_Comm comm = MPI_COMM_NULL;
+ private:
   std::shared_ptr<internal::iplatform_t> iplatform;
-
-  occa::device device;
   std::shared_ptr<linAlg_t> ilinAlg;
 
-  int rank=0, size=0;
+ public:
+  comm_t comm;
+  device_t device;
 
   platform_t()=default;
 
@@ -74,9 +75,6 @@ class platform_t {
 
     comm = settings.comm;
 
-    MPI_Comm_rank(comm, &rank);
-    MPI_Comm_size(comm, &size);
-
     DeviceConfig();
     DeviceProperties();
 
@@ -91,64 +89,58 @@ class platform_t {
   }
 
   void assertInitialized() {
-    if(!isInitialized()) {
-      HIPBONE_ABORT("Platform not initialized.");
-    }
+    LIBP_ABORT("Platform not initialized.",
+                  !isInitialized());
   }
 
-  occa::kernel buildKernel(std::string fileName, std::string kernelName,
-                           occa::properties& kernelInfo);
-
-  occa::memory malloc(const size_t bytes,
-                      const void *src = NULL,
-                      const occa::properties &prop = occa::properties()) {
-    assertInitialized();
-    return device.malloc(bytes, src, prop);
-  }
+  kernel_t buildKernel(std::string fileName, std::string kernelName,
+                       properties_t& kernelInfo);
 
-  occa::memory malloc(const size_t bytes,
-                      const occa::memory &src,
-                      const occa::properties &prop = occa::properties()) {
+  template <typename T>
+  deviceMemory<T> malloc(const size_t count,
+                         const properties_t &prop = properties_t()) {
     assertInitialized();
-    return device.malloc(bytes, src, prop);
+    return deviceMemory<T>(device.malloc<T>(count, prop));
   }
 
-  occa::memory malloc(const size_t bytes,
-                      const occa::properties &prop) {
+  template <typename T>
+  deviceMemory<T> malloc(const size_t count,
+                         const memory<T> src,
+                         const properties_t &prop = properties_t()) {
     assertInitialized();
-    return device.malloc(bytes, prop);
+    return deviceMemory<T>(device.malloc<T>(count, src.ptr(), prop));
   }
 
   template <typename T>
-  occa::memory malloc(const size_t count,
-                      const occa::properties &prop = occa::properties()) {
+  deviceMemory<T> malloc(const memory<T> src,
+                         const properties_t &prop = properties_t()) {
     assertInitialized();
-    return device.malloc(count*sizeof(T), prop);
+    return deviceMemory<T>(device.malloc<T>(src.length(), src.ptr(), prop));
   }
 
   template <typename T>
-  occa::memory malloc(const size_t count,
-                      const libp::memory<T> &src,
-                      const occa::properties &prop) {
+  pinnedMemory<T> hostMalloc(const size_t count){
     assertInitialized();
-    return device.malloc(count*sizeof(T), src.ptr(), prop);
+    properties_t hostProp;
+    hostProp["host"] = true;
+    return pinnedMemory<T>(device.malloc<T>(count, nullptr, hostProp));
   }
 
   template <typename T>
-  occa::memory malloc(const libp::memory<T> &src,
-                      const occa::properties &prop = occa::properties()) {
+  pinnedMemory<T> hostMalloc(const size_t count,
+                             const memory<T> src){
     assertInitialized();
-    return device.malloc(src.length()*sizeof(T), src.ptr(), prop);
+    properties_t hostProp;
+    hostProp["host"] = true;
+    return pinnedMemory<T>(device.malloc<T>(count, src.ptr(), hostProp));
   }
 
-  void *hostMalloc(const size_t bytes,
-                   const void *src,
-                   occa::memory &h_mem){
+  template <typename T>
+  pinnedMemory<T> hostMalloc(const memory<T> src){
     assertInitialized();
-    occa::properties hostProp;
+    properties_t hostProp;
     hostProp["host"] = true;
-    h_mem = device.malloc(bytes, src, hostProp);
-    return h_mem.ptr();
+    return pinnedMemory<T>(device.malloc<T>(src.length(), src.ptr(), hostProp));
   }
 
   linAlg_t& linAlg() {
@@ -161,12 +153,32 @@ class platform_t {
     return iplatform->settings;
   }
 
-  occa::properties& props() {
+  properties_t& props() {
     assertInitialized();
     return iplatform->props;
   }
 
-private:
+  void finish() {
+    device.finish();
+  }
+
+  const int rank() const {
+    return comm.rank();
+  }
+
+  const int size() const {
+    return comm.size();
+  }
+
+  int getDeviceCount(const std::string mode) {
+    return occa::getDeviceCount(mode);
+  }
+
+  void setCacheDir(const std::string cacheDir) {
+    occa::env::setOccaCacheDir(cacheDir);
+  }
+
+ private:
   void DeviceConfig();
   void DeviceProperties();
 
@@ -174,4 +186,4 @@ class platform_t {
 
 } //namespace libp
 
-#endif
\ No newline at end of file
+#endif
diff --git a/include/settings.hpp b/include/settings.hpp
index 52dea68..6320a2b 100644
--- a/include/settings.hpp
+++ b/include/settings.hpp
@@ -92,12 +92,12 @@ class settings_t {
   std::vector<string> insertOrder;
 
 public:
-  MPI_Comm comm;
+  comm_t comm;
 
   std::map<string, setting_t> settings;
 
   settings_t() = delete;
-  settings_t(MPI_Comm _comm);
+  settings_t(comm_t _comm);
 
   //copy
   settings_t(const settings_t& other)=default;
@@ -120,9 +120,7 @@ class settings_t {
       const setting_t& val = search->second;
       value = val.getVal<T>();
     } else {
-      stringstream ss;
-      ss << "Unable to find setting: [" << name << "]";
-      HIPBONE_ABORT(ss.str());
+      LIBP_FORCE_ABORT("Unable to find setting: [" << name << "]");
     }
   }
 
diff --git a/include/solver.hpp b/include/solver.hpp
index 3313758..b878a95 100644
--- a/include/solver.hpp
+++ b/include/solver.hpp
@@ -38,17 +38,16 @@ class solver_t {
   solver_t()=default;
   solver_t(platform_t& _platform):
     platform(_platform) {}
-  virtual ~solver_t(){}
 
   settings_t& settings() { return platform.settings(); }
 
   virtual void Run()=0;
 
-  virtual void Operator(occa::memory& o_q, occa::memory& o_Aq) {
-    HIPBONE_ABORT(std::string("Operator not implemented in this solver"))
+  virtual void Operator(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_Aq) {
+    LIBP_FORCE_ABORT("Operator not implemented in this solver");
   }
 };
 
 } //namespace libp
 
-#endif
\ No newline at end of file
+#endif
diff --git a/libs/core/memory.cpp b/libs/core/memory.cpp
index 90ee0f3..3df102e 100644
--- a/libs/core/memory.cpp
+++ b/libs/core/memory.cpp
@@ -24,12 +24,18 @@ SOFTWARE.
 
 */
 
-#include "core.hpp"
+#include "memory.hpp"
 
 namespace libp {
-  /*explicit instantiation of common specializations*/
-  template class memory<int>;
-  template class memory<long long int>;
-  template class memory<float>;
-  template class memory<double>;
+/*explicit instantiation of common specializations*/
+template class memory<int>;
+template class memory<long long int>;
+template class memory<float>;
+template class memory<double>;
+
+/*explicit instantiation of common specializations*/
+template class deviceMemory<int>;
+template class deviceMemory<long long int>;
+template class deviceMemory<float>;
+template class deviceMemory<double>;
 } //namespace libp
diff --git a/libs/core/platformBuildKernel.cpp b/libs/core/platformBuildKernel.cpp
index e54c5a4..e992758 100644
--- a/libs/core/platformBuildKernel.cpp
+++ b/libs/core/platformBuildKernel.cpp
@@ -2,7 +2,7 @@
 
 The MIT License (MIT)
 
-Copyright (c) 2017 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
+Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -28,26 +28,27 @@ SOFTWARE.
 
 namespace libp {
 
-occa::kernel platform_t::buildKernel(std::string fileName, std::string kernelName,
-                                     occa::properties& kernelInfo){
+kernel_t platform_t::buildKernel(std::string fileName,
+                                 std::string kernelName,
+                                 properties_t& kernelInfo){
 
   assertInitialized();
 
-  occa::kernel kernel;
+  kernel_t kernel;
 
   //build on root first
-  if (!rank)
+  if (!rank())
     kernel = device.buildKernel(fileName, kernelName, kernelInfo);
 
-  MPI_Barrier(comm);
+  comm.Barrier();
 
   //remaining ranks find the cached version (ideally)
-  if (rank)
+  if (rank())
     kernel = device.buildKernel(fileName, kernelName, kernelInfo);
 
-  MPI_Barrier(comm);
+  comm.Barrier();
 
   return kernel;
 }
 
-} //namespace libp
\ No newline at end of file
+} //namespace libp
diff --git a/libs/core/platformDeviceConfig.cpp b/libs/core/platformDeviceConfig.cpp
index 8a5cd20..656d313 100644
--- a/libs/core/platformDeviceConfig.cpp
+++ b/libs/core/platformDeviceConfig.cpp
@@ -33,22 +33,20 @@ namespace libp {
 void platform_t::DeviceConfig(){
 
   //find out how many ranks and devices are on this system
-  char* hostnames = (char *) ::malloc(size*sizeof(char)*MPI_MAX_PROCESSOR_NAME);
-  char* hostname = hostnames+rank*MPI_MAX_PROCESSOR_NAME;
+  memory<char> hostnames(size()*MAX_PROCESSOR_NAME);
+  memory<char> hostname = hostnames + rank()*MAX_PROCESSOR_NAME;
 
   int namelen;
-  MPI_Get_processor_name(hostname,&namelen);
-
-  MPI_Allgather(MPI_IN_PLACE , MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
-                hostnames, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, MPI_COMM_WORLD);
+  comm_t::GetProcessorName(hostname.ptr(), namelen);
+  comm.Allgather(hostnames, MAX_PROCESSOR_NAME);
 
   int localRank = 0;
   int localSize = 0;
-  for (int n=0; n<rank; n++){
-    if (!strcmp(hostname, hostnames+n*MPI_MAX_PROCESSOR_NAME)) localRank++;
+  for (int n=0; n<rank(); n++){
+    if (!strcmp(hostname.ptr(), hostnames.ptr()+n*MAX_PROCESSOR_NAME)) localRank++;
   }
-  for (int n=0; n<size; n++){
-    if (!strcmp(hostname, hostnames+n*MPI_MAX_PROCESSOR_NAME)) localSize++;
+  for (int n=0; n<size(); n++){
+    if (!strcmp(hostname.ptr(), hostnames.ptr()+n*MAX_PROCESSOR_NAME)) localSize++;
   }
 
   int plat=0;
@@ -83,18 +81,16 @@ void platform_t::DeviceConfig(){
       ||Settings.compareSetting("THREAD MODEL", "HIP")
       ||Settings.compareSetting("THREAD MODEL", "OpenCL")) {
     //for testing a single device, run with 1 rank and specify DEVICE NUMBER
-    if (size==1) {
+    if (size()==1) {
       Settings.getSetting("DEVICE NUMBER",device_id);
     } else {
 
       device_id = localRank;
 
       //check for over-subscribing devices
-      int deviceCount = occa::getDeviceCount(mode);
+      int deviceCount = getDeviceCount(mode);
       if (deviceCount>0 && localRank>=deviceCount) {
-        std::stringstream ss;
-        ss << "Rank " << rank << " oversubscribing device " << device_id%deviceCount << " on node \"" << hostname<< "\"";
-        HIPBONE_WARNING(ss.str());
+        LIBP_FORCE_WARNING("Rank " << rank() << " oversubscribing device " << device_id%deviceCount << " on node \"" << hostname.ptr()<< "\"");
         device_id = device_id%deviceCount;
       }
     }
@@ -108,19 +104,18 @@ void platform_t::DeviceConfig(){
   /*Use lscpu to determine core and socket counts */
   FILE *pipeCores   = popen("lscpu | grep \"Core(s) per socket\" | awk '{print $4}'", "r");
   FILE *pipeSockets = popen("lscpu | grep \"Socket(s)\" | awk '{print $2}'", "r");
-  if (!pipeCores || !pipeSockets) {
-    HIPBONE_ABORT("popen() failed!");
-  }
+  LIBP_ABORT("popen() failed!",
+                !pipeCores || !pipeSockets);
 
   std::array<char, 128> buffer;
-  if (!fgets(buffer.data(), buffer.size(), pipeCores)) { //read to end of line
-    HIPBONE_ABORT("Error reading core count")
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading core count",
+                !fgets(buffer.data(), buffer.size(), pipeCores));
   int Ncores = std::stoi(buffer.data());
 
-  if (!fgets(buffer.data(), buffer.size(), pipeSockets)) { //read to end of line
-    HIPBONE_ABORT("Error reading core count")
-  }
+  //read to end of line
+  LIBP_ABORT("Error reading core count",
+                !fgets(buffer.data(), buffer.size(), pipeSockets));
   int Nsockets = std::stoi(buffer.data());
 
   pclose(pipeCores);
@@ -130,7 +125,7 @@ void platform_t::DeviceConfig(){
   int NcoresPerNode = Ncores*Nsockets;
   int Nthreads=0;
 
-#if !defined(HIPBONE_DEBUG)
+#if !defined(LIBP_DEBUG)
   /*Check OMP_NUM_THREADS env variable*/
   std::string ompNumThreads;
   char * ompEnvVar = std::getenv("OMP_NUM_THREADS");
@@ -146,11 +141,8 @@ void platform_t::DeviceConfig(){
       Nthreads = std::stoi(ompNumThreads);
     }
   }
-  if (Nthreads*localSize>NcoresPerNode) {
-    std::stringstream ss;
-    ss << "Rank " << rank << " oversubscribing CPU on node \"" << hostname<< "\"";
-    HIPBONE_WARNING(ss.str());
-  }
+  LIBP_WARNING("Rank " << rank() << " oversubscribing CPU on node \"" << hostname.ptr()<< "\"",
+                  Nthreads*localSize>NcoresPerNode);
   omp_set_num_threads(Nthreads);
   // omp_set_num_threads(1);
 
@@ -161,25 +153,24 @@ void platform_t::DeviceConfig(){
 
   device.setup(mode);
 
-  std::string occaCacheDir;
+  std::string cacheDir;
   char * cacheEnvVar = std::getenv("HIPBONE_CACHE_DIR");
   if (cacheEnvVar == nullptr) {
     // Environment variable is not set
-    occaCacheDir = HIPBONE_DIR "/.occa";
+    cacheDir = HIPBONE_DIR "/.occa";
   }
   else {
     // Environmet variable is set, but could be empty string
-    occaCacheDir = cacheEnvVar;
+    cacheDir = cacheEnvVar;
 
-    if (occaCacheDir.size() == 0) {
+    if (cacheDir.size() == 0) {
       // Environment variable is set but equal to empty string
-      occaCacheDir = HIPBONE_DIR "/.occa";
+      cacheDir = HIPBONE_DIR "/.occa";
     }
   }
-  occa::env::setOccaCacheDir(occaCacheDir);
+  setCacheDir(cacheDir);
 
-  MPI_Barrier(MPI_COMM_WORLD);
-  free(hostnames);
+  comm.Barrier();
 }
 
 } //namespace libp
diff --git a/libs/core/platformProperties.cpp b/libs/core/platformProperties.cpp
index a6a41ff..91fe56a 100644
--- a/libs/core/platformProperties.cpp
+++ b/libs/core/platformProperties.cpp
@@ -31,7 +31,7 @@ namespace libp {
 //initialize occa::properties with common props
 void platform_t::DeviceProperties(){
 
-  occa::properties& Props = props();
+  properties_t& Props = props();
 
   Props["defines"].asObject();
   Props["includes"].asArray();
diff --git a/libs/core/platformSettings.cpp b/libs/core/platformSettings.cpp
index 9b63e65..f8da166 100644
--- a/libs/core/platformSettings.cpp
+++ b/libs/core/platformSettings.cpp
@@ -54,9 +54,7 @@ void platformReportSettings(settings_t& settings) {
   if (settings.compareSetting("THREAD MODEL","OpenCL"))
     settings.reportSetting("PLATFORM NUMBER");
 
-  int size;
-  MPI_Comm_size(MPI_COMM_WORLD, &size);
-  if ((size==1)
+  if ((settings.comm.size()==1)
     &&(settings.compareSetting("THREAD MODEL","CUDA")
         ||settings.compareSetting("THREAD MODEL","HIP")
         ||settings.compareSetting("THREAD MODEL","OpenCL") ))
diff --git a/libs/core/settings.cpp b/libs/core/settings.cpp
index 4315c0f..4a6b9b9 100644
--- a/libs/core/settings.cpp
+++ b/libs/core/settings.cpp
@@ -55,7 +55,7 @@ void setting_t::updateVal(const string newVal){
        << "Possible values are: { ";
     for (size_t i=0;i<options.size()-1;i++) ss << options[i] << ", ";
     ss << options[options.size()-1] << " }" << std::endl;
-    HIPBONE_ABORT(ss.str());
+    LIBP_FORCE_ABORT(ss.str());
   }
 }
 
@@ -105,7 +105,7 @@ std::ostream& operator<<(std::ostream& os, const setting_t& setting) {
   return os;
 }
 
-settings_t::settings_t(MPI_Comm _comm):
+settings_t::settings_t(comm_t _comm):
   comm(_comm) {}
 
 void settings_t::newSetting(const string shortkey, const string longkey,
@@ -115,16 +115,10 @@ void settings_t::newSetting(const string shortkey, const string longkey,
 
   for(auto it = settings.begin(); it != settings.end(); ++it) {
     setting_t &setting = it->second;
-    if (!setting.shortkey.compare(shortkey)) {
-      stringstream ss;
-      ss << "Setting with key: [" << shortkey << "] already exists.";
-      HIPBONE_ABORT(ss.str());
-    }
-    if (!setting.longkey.compare(longkey)) {
-      stringstream ss;
-      ss << "Setting with key: [" << longkey << "] already exists.";
-      HIPBONE_ABORT(ss.str());
-    }
+    LIBP_ABORT("Setting with key: [" << shortkey << "] already exists.",
+                  !setting.shortkey.compare(shortkey));
+    LIBP_ABORT("Setting with key: [" << longkey << "] already exists.",
+                  !setting.longkey.compare(longkey));
   }
 
   auto search = settings.find(name);
@@ -132,9 +126,7 @@ void settings_t::newSetting(const string shortkey, const string longkey,
     settings[name] = setting_t(shortkey, longkey, name, val, description, options);
     insertOrder.push_back(name);
   } else {
-    stringstream ss;
-    ss << "Setting with name: [" << name << "] already exists.";
-    HIPBONE_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Setting with name: [" << name << "] already exists.");
   }
 }
 
@@ -144,20 +136,19 @@ void settings_t::changeSetting(const string name, const string newVal) {
     setting_t& val = search->second;
     val.updateVal(newVal);
   } else {
-    stringstream ss;
-    ss << "Setting with name: [" << name << "] does not exist.";
-    HIPBONE_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Setting with name: [" << name << "] does not exist.");
   }
 }
 
 void settings_t::parseSettings(const int argc, char** argv) {
 
   for (int i = 1; i < argc; ) {
-    if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
-    {
-       PrintUsage();
-       MPI_Abort(MPI_COMM_WORLD,HIPBONE_ERROR);
-       return;
+    if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+      if (comm.rank()==0) PrintUsage();
+      comm.Barrier();
+      comm_t::Finalize();
+      std::exit(LIBP_SUCCESS);
+      return;
     }
 
     for(auto it = settings.begin(); it != settings.end(); ++it) {
@@ -165,9 +156,7 @@ void settings_t::parseSettings(const int argc, char** argv) {
       if (strcmp(argv[i], setting.shortkey.c_str()) == 0 ||
           strcmp(argv[i], setting.longkey.c_str()) == 0) {
         if (setting.check!=0) {
-          stringstream ss;
-          ss << "Cannot set setting [" << setting.name << "] twice in run command.";
-          HIPBONE_ABORT(ss.str());
+          LIBP_FORCE_ABORT("Cannot set setting [" << setting.name << "] twice in run command.");
         } else {
           if (strcmp(argv[i], "-v") == 0 ||
               strcmp(argv[i], "--verbose") == 0) {
@@ -191,9 +180,7 @@ string settings_t::getSetting(const string name) const {
     const setting_t& val = search->second;
     return val.getVal<string>();
   } else {
-    stringstream ss;
-    ss << "Unable to find setting: [" << name << "]";
-    HIPBONE_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Unable to find setting: [" << name << "]");
     return string();
   }
 }
@@ -204,9 +191,7 @@ bool settings_t::compareSetting(const string name, const string token) const {
     const setting_t& val = search->second;
     return val.compareVal(token);
   } else {
-    stringstream ss;
-    ss << "Unable to find setting: [" << name.c_str() << "]";
-    HIPBONE_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Unable to find setting: [" << name.c_str() << "]");
     return false;
   }
 }
@@ -226,9 +211,7 @@ void settings_t::reportSetting(const string name) const {
     const setting_t& val = search->second;
     std::cout << val << std::endl;
   } else {
-    stringstream ss;
-    ss << "Unable to find setting: [" << name.c_str() << "]";
-    HIPBONE_ABORT(ss.str());
+    LIBP_FORCE_ABORT("Unable to find setting: [" << name.c_str() << "]");
   }
 }
 

From 3ef8ede9bb8c43624dcff73650ca9c67adace67f Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:14:55 -0600
Subject: [PATCH 09/12] Update mesh lib

---
 include/mesh.hpp                      | 129 ++++++++++----------
 libs/mesh/makefile                    |  84 -------------
 libs/mesh/mesh.cpp                    |   8 +-
 libs/mesh/meshBasis1D.cpp             |  30 ++---
 libs/mesh/meshBasisHex3D.cpp          | 148 ++++++++++++++++++++++-
 libs/mesh/meshConnect.cpp             |  69 +++--------
 libs/mesh/meshConnectFaceNodes.cpp    | 162 +-------------------------
 libs/mesh/meshConnectFaceVertices.cpp |   2 +-
 libs/mesh/meshConnectNodes.cpp        |  26 ++---
 libs/mesh/meshGatherScatterSetup.cpp  |  42 +++----
 libs/mesh/meshHaloSetup.cpp           |  12 +-
 libs/mesh/meshOccaSetup.cpp           |  15 +--
 libs/mesh/meshPhysicalNodes.cpp       |   8 +-
 libs/mesh/meshSetupBox.cpp            |  13 +--
 14 files changed, 310 insertions(+), 438 deletions(-)
 delete mode 100644 libs/mesh/makefile

diff --git a/include/mesh.hpp b/include/mesh.hpp
index f8de21f..65c2ed2 100644
--- a/include/mesh.hpp
+++ b/include/mesh.hpp
@@ -28,6 +28,7 @@ SOFTWARE.
 #define MESH_HPP 1
 
 #include "core.hpp"
+#include "comm.hpp"
 #include "settings.hpp"
 #include "ogs.hpp"
 
@@ -37,90 +38,98 @@ class mesh_t {
 
 public:
   platform_t platform;
-  occa::properties props;
+  properties_t props;
 
-  MPI_Comm comm;
+  comm_t comm;
   int rank, size;
 
+  /*************************/
+  /* Element Data          */
+  /*************************/
   int dim;
   int Nverts, Nfaces, NfaceVertices;
 
-  hlong Nnodes; //global number of element vertices
-  libp::memory<dfloat> EX; // coordinates of vertices for each element
-  libp::memory<dfloat> EY;
-  libp::memory<dfloat> EZ;
+  // indices of vertex nodes
+  memory<int> vertexNodes;
+
+  hlong Nnodes=0; //global number of element vertices
+  memory<dfloat> EX; // coordinates of vertices for each element
+  memory<dfloat> EY;
+  memory<dfloat> EZ;
 
   dlong Nelements;       //local element count
   hlong NelementsGlobal; //global element count
-  libp::memory<hlong> EToV; // element-to-vertex connectivity
-  libp::memory<dlong> EToE; // element-to-element connectivity
-  libp::memory<int>   EToF; // element-to-(local)face connectivity
-  libp::memory<int>   EToP; // element-to-partition/process connectivity
+  memory<hlong> EToV;    // element-to-vertex connectivity
+  memory<dlong> EToE;    // element-to-element connectivity
+  memory<int>   EToF;    // element-to-(local)face connectivity
+  memory<int>   EToP;    // element-to-partition/process connectivity
+
+  memory<dlong> VmapM;  // list of vertices on each face
+  memory<dlong> VmapP;  // list of vertices that are paired with face vertices
+
+  /*************************/
+  /* FEM Space             */
+  /*************************/
+  int N=0, Np=0;             // N = Polynomial order and Np = Nodes per element
+  memory<dfloat> r, s, t;    // coordinates of local nodes
+
+  int Nq=0;            // N = Polynomial order, Nq=N+1
+  memory<dfloat> gllz; // 1D GLL quadrature nodes
+  memory<dfloat> gllw; // 1D GLL quadrature weights
+  memory<dfloat> D;    // 1D differentiation matrix (for tensor-product)
+  deviceMemory<dfloat> o_D;
 
-  libp::memory<dlong> VmapM;  // list of vertices on each face
-  libp::memory<dlong> VmapP;  // list of vertices that are paired with face vertices
+  // face node info
+  int Nfp=0;                // number of nodes per face
+  memory<int> faceNodes;    // list of element reference interpolation nodes on element faces
+  memory<dlong> vmapM;      // list of volume nodes that are face nodes
+  memory<dlong> vmapP;      // list of volume nodes that are paired with face nodes
+  memory<int> faceVertices; // list of mesh vertices on each face
+
+  /*************************/
+  /* Physical Space        */
+  /*************************/
+  // volume node info
+  memory<dfloat> x, y, z;    // coordinates of physical nodes
+  // second order volume geometric factors
+  dlong Nggeo;
+  memory<dfloat> ggeo;
+  deviceMemory<dfloat> o_ggeo;
 
+  /*************************/
+  /* MPI Data              */
+  /*************************/
   // MPI halo exchange info
-  ogs::halo_t halo;            // halo exchange pointer
-  dlong NinternalElements; // number of elements that can update without halo exchange
-  dlong NhaloElements;     // number of elements that cannot update without halo exchange
-  dlong  totalHaloPairs;   // number of elements to be received in halo exchange
-  libp::memory<dlong> internalElementIds;  // list of elements that can update without halo exchange
-  libp::memory<dlong> haloElementIds;      // list of elements to be sent in halo exchange
-  occa::memory o_internalElementIds;  // list of elements that can update without halo exchange
-  occa::memory o_haloElementIds;      // list of elements to be sent in halo exchange
+  ogs::halo_t halo;                         // halo exchange pointer
+  dlong NinternalElements;                  // number of elements that can update without halo exchange
+  dlong NhaloElements;                      // number of elements that cannot update without halo exchange
+  dlong totalHaloPairs;                     // number of elements to be received in halo exchange
+  memory<dlong> internalElementIds;         // list of elements that can update without halo exchange
+  memory<dlong> haloElementIds;             // list of elements to be sent in halo exchange
+  deviceMemory<dlong> o_internalElementIds; // list of elements that can update without halo exchange
+  deviceMemory<dlong> o_haloElementIds;     // list of elements to be sent in halo exchange
 
   // CG gather-scatter info
   ogs::ogs_t ogsMasked;
   ogs::halo_t gHalo;
-  libp::memory<hlong> globalIds, maskedGlobalIds, maskedGlobalNumbering;
+  memory<hlong> globalIds, maskedGlobalIds, maskedGlobalNumbering;
   dlong Nmasked;
 
-  libp::memory<dlong> GlobalToLocal;
-  occa::memory o_GlobalToLocal;
+  memory<dlong> GlobalToLocal;
+  deviceMemory<dlong> o_GlobalToLocal;
 
   // list of elements that are needed for global gather-scatter
   dlong NglobalGatherElements;
-  libp::memory<dlong> globalGatherElementList;
-  occa::memory o_globalGatherElementList;
+  memory<dlong> globalGatherElementList;
+  deviceMemory<dlong> o_globalGatherElementList;
 
   // list of elements that are not needed for global gather-scatter
   dlong NlocalGatherElements;
-  libp::memory<dlong> localGatherElementList;
-  occa::memory o_localGatherElementList;
-
-
-  libp::memory<dfloat> weight, weightG;
-  occa::memory o_weight, o_weightG;
-
-  // second order volume geometric factors
-  dlong Nggeo;
-  libp::memory<dfloat> ggeo;
-
-  // volume node info
-  int N, Nq, Np;
-  libp::memory<dfloat> r, s, t;    // coordinates of local nodes
-  libp::memory<dfloat> x, y, z;    // coordinates of physical nodes
-
-  // indices of vertex nodes
-  libp::memory<int> vertexNodes;
-
-  libp::memory<dfloat> D; // 1D differentiation matrix (for tensor-product)
-  libp::memory<dfloat> gllz; // 1D GLL quadrature nodes
-  libp::memory<dfloat> gllw; // 1D GLL quadrature weights
-
-  // face node info
-  int Nfp;        // number of nodes per face
-  libp::memory<int> faceNodes; // list of element reference interpolation nodes on element faces
-  libp::memory<dlong> vmapM;     // list of volume nodes that are face nodes
-  libp::memory<dlong> vmapP;     // list of volume nodes that are paired with face nodes
-  libp::memory<int> faceVertices; // list of mesh vertices on each face
-
-  // occa stuff
-  occa::stream defaultStream;
+  memory<dlong> localGatherElementList;
+  deviceMemory<dlong> o_localGatherElementList;
 
-  occa::memory o_D; // tensor product differentiation matrix (for Hexes)
-  occa::memory o_ggeo; // second order geometric factors
+  memory<dfloat> weight, weightG;
+  deviceMemory<dfloat> o_weight, o_weightG;
 
   mesh_t()=default;
   mesh_t(platform_t& _platform) {
@@ -186,6 +195,8 @@ class mesh_t {
   void NodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[]);
   void FaceNodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[],  int _faceNodes[]);
   void VertexNodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[], int _vertexNodes[]);
+  void FaceNodeMatchingHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[],
+                             int _faceNodes[], int R[]);
 
   /* offsets for second order geometric factors */
   static constexpr int GWJID=0;
diff --git a/libs/mesh/makefile b/libs/mesh/makefile
deleted file mode 100644
index 2a40fce..0000000
--- a/libs/mesh/makefile
+++ /dev/null
@@ -1,84 +0,0 @@
-#####################################################################################
-#
-#The MIT License (MIT)
-#
-#Copyright (c) 2020 Tim Warburton, Noel Chalmers, Jesse Chan, Ali Karakus
-#
-#Permission is hereby granted, free of charge, to any person obtaining a copy
-#of this software and associated documentation files (the "Software"), to deal
-#in the Software without restriction, including without limitation the rights
-#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#copies of the Software, and to permit persons to whom the Software is
-#furnished to do so, subject to the following conditions:
-#
-#The above copyright notice and this permission notice shall be included in all
-#copies or substantial portions of the Software.
-#
-#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-#SOFTWARE.
-#
-#####################################################################################
-
-ifndef HIPBONE_MAKETOP_LOADED
-ifeq (,$(wildcard ../../make.top))
-$(error cannot locate ${PWD}/../../make.top)
-else
-include ../../make.top
-endif
-endif
-
-LIBMESH_DIR=${HIPBONE_CORE_DIR}/mesh
-
-#includes
-INCLUDES=${HIPBONE_INCLUDES} \
-         -I${LIBMESH_DIR}/include
-
-#defines
-DEFINES =${HIPBONE_DEFINES} \
-         -DHIPBONE_DIR='"${HIPBONE_DIR}"' \
-
-#.cpp compilation flags
-MESH_CXXFLAGS=${HIPBONE_CXXFLAGS} ${DEFINES} ${INCLUDES}
-
-#object dependancies
-DEPS=$(wildcard $(HIPBONE_INCLUDE_DIR)/*.h) \
-     $(wildcard ${LIBMESH_DIR}/include/*.hpp) \
-     $(HIPBONE_INCLUDE_DIR)/mesh.hpp
-
-SRC =$(wildcard *.cpp)
-OBJS=$(SRC:.cpp=.o)
-
-.PHONY: all lib clean realclean silentUpdate
-
-all: lib
-
-lib: ../libmesh.a silentUpdate
-
-../libmesh.a: $(OBJS)
-ifneq (,${verbose})
-	ar -cr ../libmesh.a $(OBJS)
-else
-	@printf "%b" "$(LIB_COLOR)Building library $(@F)$(NO_COLOR)\n";
-	@ar -cr ../libmesh.a $(OBJS)
-endif
-
-silentUpdate:
-	@true
-
-# rule for .cpp files
-%.o: %.cpp $(DEPS)
-ifneq (,${verbose})
-	$(HIPBONE_CXX) -o $*.o -c $*.cpp $(MESH_CXXFLAGS)
-else
-	@printf "%b" "$(OBJ_COLOR)Compiling $(@F)$(NO_COLOR)\n";
-	@$(HIPBONE_CXX) -o $*.o -c $*.cpp $(MESH_CXXFLAGS)
-endif
-
-#cleanup
-clean:
-	rm -f *.o ../libmesh.a
diff --git a/libs/mesh/mesh.cpp b/libs/mesh/mesh.cpp
index 5418b17..8e54a2d 100644
--- a/libs/mesh/mesh.cpp
+++ b/libs/mesh/mesh.cpp
@@ -32,9 +32,9 @@ void mesh_t::Setup(platform_t& _platform) {
 
   platform = _platform;
 
-  comm = platform.comm;
-  MPI_Comm_rank(platform.comm, &rank);
-  MPI_Comm_size(platform.comm, &size);
+  comm = platform.comm.Dup();
+  rank = comm.rank();
+  size = comm.size();
 
   platform.settings().getSetting("POLYNOMIAL DEGREE", N);
 
@@ -83,4 +83,4 @@ void mesh_t::Setup(platform_t& _platform) {
   OccaSetup();
 }
 
-} //namespace libp
\ No newline at end of file
+} //namespace libp
diff --git a/libs/mesh/meshBasis1D.cpp b/libs/mesh/meshBasis1D.cpp
index da8e475..b8eca24 100644
--- a/libs/mesh/meshBasis1D.cpp
+++ b/libs/mesh/meshBasis1D.cpp
@@ -92,21 +92,21 @@ void mesh_t::MassMatrix1D(int _Np, dfloat V[], dfloat _MM[]){
       _MM[n*_Np + m] = res;
     }
   }
-  matrixInverse(_Np, _MM);
+  linAlg_t::matrixInverse(_Np, _MM);
 }
 
 void mesh_t::Dmatrix1D(int _N, int Npoints, dfloat _r[], dfloat _Dr[]){
 
   int _Np = _N+1;
 
-  libp::memory<dfloat> V(Npoints*_Np);
-  libp::memory<dfloat> Vr(Npoints*_Np);
+  memory<dfloat> V(Npoints*_Np);
+  memory<dfloat> Vr(Npoints*_Np);
 
   Vandermonde1D(_N, Npoints, _r, V.ptr());
   GradVandermonde1D(_N, Npoints, _r, Vr.ptr());
 
   //D = Vr/V
-  matrixRightSolve(_Np, _Np, Vr.ptr(), _Np, _Np, V.ptr(), _Dr);
+  linAlg_t::matrixRightSolve(_Np, _Np, Vr.ptr(), _Np, _Np, V.ptr(), _Dr);
 }
 
 // ------------------------------------------------------------------------
@@ -122,7 +122,7 @@ dfloat mesh_t::JacobiP(dfloat a, dfloat alpha, dfloat beta, int _N){
 
   dfloat ax = a;
 
-  libp::memory<dfloat> P(_N+1);
+  memory<dfloat> P(_N+1);
 
   // Zero order
   dfloat gamma0 = pow(2,(alpha+beta+1))/(alpha+beta+1)*mygamma(1+alpha)*mygamma(1+beta)/mygamma(1+alpha+beta);
@@ -172,14 +172,14 @@ void mesh_t::JacobiGLL(int _N, dfloat _x[], dfloat w[]){
   _x[_N] =  1.;
 
   if(_N>1){
-    libp::memory<dfloat> wtmp(_N-1);
+    memory<dfloat> wtmp(_N-1);
     JacobiGQ(1,1, _N-2, _x+1, wtmp.ptr());
   }
 
   if (w!=nullptr) {
     int _Np = _N+1;
-    libp::memory<dfloat> _MM(_Np*_Np);
-    libp::memory<dfloat> V(_Np*_Np);
+    memory<dfloat> _MM(_Np*_Np);
+    memory<dfloat> V(_Np*_Np);
 
     Vandermonde1D(_N, _N+1, _x, V.ptr());
     MassMatrix1D(_N+1, V.ptr(), _MM.ptr());
@@ -210,8 +210,8 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat _x[], dfloat w[]
   }
 
   // Form symmetric matrix from recurrence.
-  libp::memory<dfloat> J((_N+1)*(_N+1), 0);
-  libp::memory<dfloat> h1(_N+1);
+  memory<dfloat> J((_N+1)*(_N+1), 0);
+  memory<dfloat> h1(_N+1);
 
   for(int n=0;n<=_N;++n){
     h1[n] = 2*n+alpha+beta;
@@ -240,12 +240,12 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat _x[], dfloat w[]
   // Compute quadrature by eigenvalue solve
 
   //  [V,D] = eig(J);
-  libp::memory<dfloat> WR(_N+1);
-  libp::memory<dfloat> WI(_N+1);
-  libp::memory<dfloat> VR((_N+1)*(_N+1));
+  memory<dfloat> WR(_N+1);
+  memory<dfloat> WI(_N+1);
+  memory<dfloat> VR((_N+1)*(_N+1));
 
   // _x = diag(D);
-  matrixEigenVectors(_N+1, J.ptr(), VR.ptr(), _x, WI.ptr());
+  linAlg_t::matrixEigenVectors(_N+1, J.ptr(), VR.ptr(), _x, WI.ptr());
 
   //w = (V(1,:)').^2*2^(alpha+beta+1)/(alpha+beta+1)*gamma(alpha+1)*.gamma(beta+1)/gamma(alpha+beta+1);
   for(int n=0;n<=_N;++n){
@@ -267,4 +267,4 @@ void mesh_t::JacobiGQ(dfloat alpha, dfloat beta, int _N, dfloat _x[], dfloat w[]
   }
 }
 
-} //namespace libp
\ No newline at end of file
+} //namespace libp
diff --git a/libs/mesh/meshBasisHex3D.cpp b/libs/mesh/meshBasisHex3D.cpp
index 9001cc4..e24dd95 100644
--- a/libs/mesh/meshBasisHex3D.cpp
+++ b/libs/mesh/meshBasisHex3D.cpp
@@ -34,7 +34,7 @@ namespace libp {
 void mesh_t::NodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[]){
   int _Nq = _N+1;
 
-  libp::memory<dfloat> r1D(_Nq);
+  memory<dfloat> r1D(_Nq);
   JacobiGLL(_N, r1D.ptr()); //Gauss-Legendre-Lobatto nodes
 
   //Tensor product
@@ -109,4 +109,148 @@ void mesh_t::VertexNodesHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[], int
   }
 }
 
-} //namespace libp
\ No newline at end of file
+/*Find a matching array between nodes on matching faces */
+void mesh_t::FaceNodeMatchingHex3D(int _N, dfloat _r[], dfloat _s[], dfloat _t[],
+                                   int _faceNodes[], int R[]){
+
+  int _Nq = _N+1;
+  int _Nfp = _Nq*_Nq;
+
+  const dfloat NODETOL = 1.0e-5;
+
+  dfloat V0[4][2] = {{-1.0,-1.0},{ 1.0,-1.0},{ 1.0, 1.0},{-1.0, 1.0}};
+  dfloat V1[4][2] = {{-1.0,-1.0},{-1.0, 1.0},{ 1.0, 1.0},{ 1.0,-1.0}};
+
+  dfloat EX0[Nverts], EY0[Nverts];
+  dfloat EX1[Nverts], EY1[Nverts];
+
+  memory<dfloat> x0(_Nfp);
+  memory<dfloat> y0(_Nfp);
+
+  memory<dfloat> x1(_Nfp);
+  memory<dfloat> y1(_Nfp);
+
+
+  for (int fM=0;fM<Nfaces;fM++) {
+
+    for (int v=0;v<Nverts;v++) {
+      EX0[v] = 0.0; EY0[v] = 0.0;
+    }
+    //setup top element with face fM on the bottom
+    for (int v=0;v<NfaceVertices;v++) {
+      int fv = faceVertices[fM*NfaceVertices + v];
+      EX0[fv] = V0[v][0]; EY0[fv] = V0[v][1];
+    }
+
+    for(int n=0;n<_Nfp;++n){ /* for each face node */
+      const int fn = _faceNodes[fM*_Nfp+n];
+
+      /* (r,s,t) coordinates of interpolation nodes*/
+      dfloat rn = _r[fn];
+      dfloat sn = _s[fn];
+      dfloat tn = _t[fn];
+
+      /* physical coordinate of interpolation node */
+      x0[n] =
+        +0.125*(1-rn)*(1-sn)*(1-tn)*EX0[0]
+        +0.125*(1+rn)*(1-sn)*(1-tn)*EX0[1]
+        +0.125*(1+rn)*(1+sn)*(1-tn)*EX0[2]
+        +0.125*(1-rn)*(1+sn)*(1-tn)*EX0[3]
+        +0.125*(1-rn)*(1-sn)*(1+tn)*EX0[4]
+        +0.125*(1+rn)*(1-sn)*(1+tn)*EX0[5]
+        +0.125*(1+rn)*(1+sn)*(1+tn)*EX0[6]
+        +0.125*(1-rn)*(1+sn)*(1+tn)*EX0[7];
+
+      y0[n] =
+        +0.125*(1-rn)*(1-sn)*(1-tn)*EY0[0]
+        +0.125*(1+rn)*(1-sn)*(1-tn)*EY0[1]
+        +0.125*(1+rn)*(1+sn)*(1-tn)*EY0[2]
+        +0.125*(1-rn)*(1+sn)*(1-tn)*EY0[3]
+        +0.125*(1-rn)*(1-sn)*(1+tn)*EY0[4]
+        +0.125*(1+rn)*(1-sn)*(1+tn)*EY0[5]
+        +0.125*(1+rn)*(1+sn)*(1+tn)*EY0[6]
+        +0.125*(1-rn)*(1+sn)*(1+tn)*EY0[7];
+    }
+
+    for (int fP=0;fP<Nfaces;fP++) { /*For each neighbor face */
+      for (int rot=0;rot<NfaceVertices;rot++) { /* For each face rotation */
+        // Zero vertices
+        for (int v=0;v<Nverts;v++) {
+          EX1[v] = 0.0; EY1[v] = 0.0;
+        }
+        //setup bottom element with face fP on the top
+        for (int v=0;v<NfaceVertices;v++) {
+          int fv = faceVertices[fP*NfaceVertices + ((v+rot)%NfaceVertices)];
+          EX1[fv] = V1[v][0]; EY1[fv] = V1[v][1];
+        }
+
+        for(int n=0;n<_Nfp;++n){ /* for each node */
+          const int fn = _faceNodes[fP*_Nfp+n];
+
+          /* (r,s,t) coordinates of interpolation nodes*/
+          dfloat rn = _r[fn];
+          dfloat sn = _s[fn];
+          dfloat tn = _t[fn];
+
+          /* physical coordinate of interpolation node */
+          x1[n] =  0.125*(1-rn)*(1-sn)*(1-tn)*EX1[0]
+                  +0.125*(1+rn)*(1-sn)*(1-tn)*EX1[1]
+                  +0.125*(1+rn)*(1+sn)*(1-tn)*EX1[2]
+                  +0.125*(1-rn)*(1+sn)*(1-tn)*EX1[3]
+                  +0.125*(1-rn)*(1-sn)*(1+tn)*EX1[4]
+                  +0.125*(1+rn)*(1-sn)*(1+tn)*EX1[5]
+                  +0.125*(1+rn)*(1+sn)*(1+tn)*EX1[6]
+                  +0.125*(1-rn)*(1+sn)*(1+tn)*EX1[7];
+
+          y1[n] =  0.125*(1-rn)*(1-sn)*(1-tn)*EY1[0]
+                  +0.125*(1+rn)*(1-sn)*(1-tn)*EY1[1]
+                  +0.125*(1+rn)*(1+sn)*(1-tn)*EY1[2]
+                  +0.125*(1-rn)*(1+sn)*(1-tn)*EY1[3]
+                  +0.125*(1-rn)*(1-sn)*(1+tn)*EY1[4]
+                  +0.125*(1+rn)*(1-sn)*(1+tn)*EY1[5]
+                  +0.125*(1+rn)*(1+sn)*(1+tn)*EY1[6]
+                  +0.125*(1-rn)*(1+sn)*(1+tn)*EY1[7];
+        }
+
+        /* for each node on this face find the neighbor node */
+        for(int n=0;n<_Nfp;++n){
+          const dfloat xM = x0[n];
+          const dfloat yM = y0[n];
+
+          int m=0;
+          for(;m<_Nfp;++m){ /* for each neighbor node */
+            const dfloat xP = x1[m];
+            const dfloat yP = y1[m];
+
+            /* distance between target and neighbor node */
+            const dfloat dist = pow(xM-xP,2) + pow(yM-yP,2);
+
+            /* if neighbor node is close to target, match */
+            if(dist<NODETOL){
+              R[fM*Nfaces*NfaceVertices*_Nfp
+                + fP*NfaceVertices*_Nfp
+                + rot*_Nfp + n] = m;
+              break;
+            }
+          }
+
+          /*Check*/
+          const dfloat xP = x1[m];
+          const dfloat yP = y1[m];
+
+          /* distance between target and neighbor node */
+          const dfloat dist = pow(xM-xP,2) + pow(yM-yP,2);
+          //This shouldn't happen
+          LIBP_ABORT("Unable to match face node, face: " << fM
+                     << ", matching face: " << fP
+                     << ", rotation: " << rot
+                     << ", node: " << n
+                     << ". Is the reference node set not symmetric?",
+                     dist>NODETOL);
+        }
+      }
+    }
+  }
+}
+
+} //namespace libp
diff --git a/libs/mesh/meshConnect.cpp b/libs/mesh/meshConnect.cpp
index 9b89fec..ef59673 100644
--- a/libs/mesh/meshConnect.cpp
+++ b/libs/mesh/meshConnect.cpp
@@ -59,7 +59,7 @@ void mesh_t::Connect(){
    **********************/
 
   /* build list of faces */
-  libp::memory<face_t> faces(Nelements*Nfaces);
+  memory<face_t> faces(Nelements*Nfaces);
 
   #pragma omp parallel for collapse(2)
   for(dlong e=0;e<Nelements;++e){
@@ -133,10 +133,10 @@ void mesh_t::Connect(){
 
   // count # of elements to send to each rank based on
   // minimum {vertex id % size}
-  libp::memory<int> Nsend(size, 0);
-  libp::memory<int> Nrecv(size, 0);
-  libp::memory<int> sendOffsets(size, 0);
-  libp::memory<int> recvOffsets(size, 0);
+  memory<int> Nsend(size, 0);
+  memory<int> Nrecv(size, 0);
+  memory<int> sendOffsets(size, 0);
+  memory<int> recvOffsets(size, 0);
 
   // WARNING: In some corner cases, the number of faces to send may overrun int storage
   int allNsend = 0;
@@ -147,8 +147,8 @@ void mesh_t::Connect(){
         hlong maxv = 0;
         for(int n=0;n<NfaceVertices;++n){
           int nid = faceVertices[f*NfaceVertices+n];
-          dlong id = EToV[e*Nverts + nid];
-          maxv = mymax(maxv, id);
+          hlong id = EToV[e*Nverts + nid];
+          maxv = std::max(maxv, id);
         }
         int destRank = (int) (maxv%size);
 
@@ -168,30 +168,7 @@ void mesh_t::Connect(){
     Nsend[rr] = 0;
 
   // buffer for outgoing data
-  libp::memory<face_t> sendFaces(allNsend);
-
-  // Make the MPI_FACE_T data type
-  MPI_Datatype MPI_FACE_T;
-  MPI_Datatype dtype[7] = {MPI_HLONG, MPI_DLONG, MPI_DLONG, MPI_INT,
-                            MPI_INT, MPI_INT, MPI_INT};
-  int blength[7] = {4, 1, 1, 1, 1, 1, 1};
-  MPI_Aint addr[7], displ[7];
-  MPI_Get_address ( &(sendFaces[0]              ), addr+0);
-  MPI_Get_address ( &(sendFaces[0].element      ), addr+1);
-  MPI_Get_address ( &(sendFaces[0].elementN     ), addr+2);
-  MPI_Get_address ( &(sendFaces[0].face         ), addr+3);
-  MPI_Get_address ( &(sendFaces[0].faceN        ), addr+4);
-  MPI_Get_address ( &(sendFaces[0].rank         ), addr+5);
-  MPI_Get_address ( &(sendFaces[0].rankN        ), addr+6);
-  displ[0] = 0;
-  displ[1] = addr[1] - addr[0];
-  displ[2] = addr[2] - addr[0];
-  displ[3] = addr[3] - addr[0];
-  displ[4] = addr[4] - addr[0];
-  displ[5] = addr[5] - addr[0];
-  displ[6] = addr[6] - addr[0];
-  MPI_Type_create_struct (7, blength, displ, dtype, &MPI_FACE_T);
-  MPI_Type_commit (&MPI_FACE_T);
+  memory<face_t> sendFaces(allNsend);
 
   // pack face data
   for(dlong e=0;e<Nelements;++e){
@@ -203,7 +180,7 @@ void mesh_t::Connect(){
         for(int n=0;n<NfaceVertices;++n){
           int nid = faceVertices[f*NfaceVertices+n];
           hlong id = EToV[e*Nverts + nid];
-          maxv = mymax(maxv, id);
+          maxv = std::max(maxv, id);
         }
         int destRank = (int) (maxv%size);
 
@@ -231,10 +208,8 @@ void mesh_t::Connect(){
     }
   }
 
-  // exchange byte counts
-  MPI_Alltoall(Nsend.ptr(), 1, MPI_INT,
-               Nrecv.ptr(), 1, MPI_INT,
-               comm);
+  // exchange counts
+  comm.Alltoall(Nsend, Nrecv);
 
   // count incoming faces
   int allNrecv = 0;
@@ -243,15 +218,14 @@ void mesh_t::Connect(){
 
   // find offsets for recv data
   for(int rr=1;rr<size;++rr)
-    recvOffsets[rr] = recvOffsets[rr-1] + Nrecv[rr-1]; // byte offsets
+    recvOffsets[rr] = recvOffsets[rr-1] + Nrecv[rr-1]; // offsets
 
   // buffer for incoming face data
-  libp::memory<face_t> recvFaces(allNrecv);
+  memory<face_t> recvFaces(allNrecv);
 
   // exchange parallel faces
-  MPI_Alltoallv(sendFaces.ptr(), Nsend.ptr(), sendOffsets.ptr(), MPI_FACE_T,
-                recvFaces.ptr(), Nrecv.ptr(), recvOffsets.ptr(), MPI_FACE_T,
-                comm);
+  comm.Alltoallv(sendFaces, Nsend, sendOffsets,
+                 recvFaces, Nrecv, recvOffsets);
 
   // local sort allNrecv received faces
   sort(recvFaces.ptr(), recvFaces.ptr()+allNrecv,
@@ -289,9 +263,8 @@ void mesh_t::Connect(){
       });
 
   // send faces back from whence they came
-  MPI_Alltoallv(recvFaces.ptr(), Nrecv.ptr(), recvOffsets.ptr(), MPI_FACE_T,
-                sendFaces.ptr(), Nsend.ptr(), sendOffsets.ptr(), MPI_FACE_T,
-                comm);
+  comm.Alltoallv(recvFaces, Nrecv, recvOffsets,
+                 sendFaces, Nsend, sendOffsets);
 
   // extract connectivity info
   #pragma omp parallel for
@@ -313,13 +286,9 @@ void mesh_t::Connect(){
     }
   }
 
-  MPI_Barrier(comm);
-  MPI_Type_free(&MPI_FACE_T);
-
   //record the number of elements in the whole mesh
-  hlong NelementsLocal = (hlong) Nelements;
-  NelementsGlobal = 0;
-  MPI_Allreduce(&NelementsLocal, &NelementsGlobal, 1, MPI_HLONG, MPI_SUM, comm);
+  NelementsGlobal = Nelements;
+  comm.Allreduce(NelementsGlobal);
 }
 
 } //namespace libp
diff --git a/libs/mesh/meshConnectFaceNodes.cpp b/libs/mesh/meshConnectFaceNodes.cpp
index e05014b..2e13d5f 100644
--- a/libs/mesh/meshConnectFaceNodes.cpp
+++ b/libs/mesh/meshConnectFaceNodes.cpp
@@ -31,149 +31,10 @@ namespace libp {
 // serial face-node to face-node connection
 void mesh_t::ConnectFaceNodes(){
 
-  const dfloat NODETOL = 1.0e-5;
-
-  dfloat V0[4][2] = {{-1.0,-1.0},{ 1.0,-1.0},{ 1.0, 1.0},{-1.0, 1.0}};
-  dfloat V1[4][2] = {{-1.0,-1.0},{-1.0, 1.0},{ 1.0, 1.0},{ 1.0,-1.0}};
-
-  dfloat EX0[Nverts], EY0[Nverts];
-  dfloat EX1[Nverts], EY1[Nverts];
-
-  libp::memory<dfloat> x0(Nfp);
-  libp::memory<dfloat> y0(Nfp);
-
-  libp::memory<dfloat> x1(Nfp);
-  libp::memory<dfloat> y1(Nfp);
-
   /* Build the permutation array R */
-  libp::memory<int> R(Nfaces*Nfaces*Nverts*Nfp);
-
-  for (int fM=0;fM<Nfaces;fM++) {
-
-    for (int v=0;v<Nverts;v++) {
-      EX0[v] = 0.0; EY0[v] = 0.0;
-    }
-    //setup top element with face fM on the bottom
-    for (int v=0;v<NfaceVertices;v++) {
-      int fv = faceVertices[fM*NfaceVertices + v];
-      EX0[fv] = V0[v][0]; EY0[fv] = V0[v][1];
-    }
-
-    for(int n=0;n<Nfp;++n){ /* for each face node */
-      const int fn = faceNodes[fM*Nfp+n];
-
-      /* (r,s,t) coordinates of interpolation nodes*/
-      dfloat rn = r[fn];
-      dfloat sn = s[fn];
-      dfloat tn = t[fn];
-
-      /* physical coordinate of interpolation node */
-      x0[n] =
-        +0.125*(1-rn)*(1-sn)*(1-tn)*EX0[0]
-        +0.125*(1+rn)*(1-sn)*(1-tn)*EX0[1]
-        +0.125*(1+rn)*(1+sn)*(1-tn)*EX0[2]
-        +0.125*(1-rn)*(1+sn)*(1-tn)*EX0[3]
-        +0.125*(1-rn)*(1-sn)*(1+tn)*EX0[4]
-        +0.125*(1+rn)*(1-sn)*(1+tn)*EX0[5]
-        +0.125*(1+rn)*(1+sn)*(1+tn)*EX0[6]
-        +0.125*(1-rn)*(1+sn)*(1+tn)*EX0[7];
-
-      y0[n] =
-        +0.125*(1-rn)*(1-sn)*(1-tn)*EY0[0]
-        +0.125*(1+rn)*(1-sn)*(1-tn)*EY0[1]
-        +0.125*(1+rn)*(1+sn)*(1-tn)*EY0[2]
-        +0.125*(1-rn)*(1+sn)*(1-tn)*EY0[3]
-        +0.125*(1-rn)*(1-sn)*(1+tn)*EY0[4]
-        +0.125*(1+rn)*(1-sn)*(1+tn)*EY0[5]
-        +0.125*(1+rn)*(1+sn)*(1+tn)*EY0[6]
-        +0.125*(1-rn)*(1+sn)*(1+tn)*EY0[7];
-    }
-
-    for (int fP=0;fP<Nfaces;fP++) { /*For each neighbor face */
-      for (int rot=0;rot<Nfaces;rot++) { /* For each face rotation */
-        // Zero vertices
-        for (int v=0;v<Nverts;v++) {
-          EX1[v] = 0.0; EY1[v] = 0.0;
-        }
-        //setup bottom element with face fP on the top
-        for (int v=0;v<NfaceVertices;v++) {
-          int fv = faceVertices[fP*NfaceVertices + ((v+rot)%NfaceVertices)];
-          EX1[fv] = V1[v][0]; EY1[fv] = V1[v][1];
-        }
-
-        for(int n=0;n<Nfp;++n){ /* for each node */
-          const int fn = faceNodes[fP*Nfp+n];
-
-          /* (r,s,t) coordinates of interpolation nodes*/
-          dfloat rn = r[fn];
-          dfloat sn = s[fn];
-          dfloat tn = t[fn];
-
-          /* physical coordinate of interpolation node */
-          x1[n] =  0.125*(1-rn)*(1-sn)*(1-tn)*EX1[0]
-                  +0.125*(1+rn)*(1-sn)*(1-tn)*EX1[1]
-                  +0.125*(1+rn)*(1+sn)*(1-tn)*EX1[2]
-                  +0.125*(1-rn)*(1+sn)*(1-tn)*EX1[3]
-                  +0.125*(1-rn)*(1-sn)*(1+tn)*EX1[4]
-                  +0.125*(1+rn)*(1-sn)*(1+tn)*EX1[5]
-                  +0.125*(1+rn)*(1+sn)*(1+tn)*EX1[6]
-                  +0.125*(1-rn)*(1+sn)*(1+tn)*EX1[7];
-
-          y1[n] =  0.125*(1-rn)*(1-sn)*(1-tn)*EY1[0]
-                  +0.125*(1+rn)*(1-sn)*(1-tn)*EY1[1]
-                  +0.125*(1+rn)*(1+sn)*(1-tn)*EY1[2]
-                  +0.125*(1-rn)*(1+sn)*(1-tn)*EY1[3]
-                  +0.125*(1-rn)*(1-sn)*(1+tn)*EY1[4]
-                  +0.125*(1+rn)*(1-sn)*(1+tn)*EY1[5]
-                  +0.125*(1+rn)*(1+sn)*(1+tn)*EY1[6]
-                  +0.125*(1-rn)*(1+sn)*(1+tn)*EY1[7];
-        }
-
-        /* for each node on this face find the neighbor node */
-        for(int n=0;n<Nfp;++n){
-          const dfloat xM = x0[n];
-          const dfloat yM = y0[n];
-
-          int m=0;
-          for(;m<Nfp;++m){ /* for each neighbor node */
-            const dfloat xP = x1[m];
-            const dfloat yP = y1[m];
-
-            /* distance between target and neighbor node */
-            const dfloat dist = pow(xM-xP,2) + pow(yM-yP,2);
-
-            /* if neighbor node is close to target, match */
-            if(dist<NODETOL){
-              R[fM*Nfaces*NfaceVertices*Nfp
-                + fP*NfaceVertices*Nfp
-                + rot*Nfp + n] = m;
-              break;
-            }
-          }
-
-          /*Check*/
-          const dfloat xP = x1[m];
-          const dfloat yP = y1[m];
-
-          /* distance between target and neighbor node */
-          const dfloat dist = pow(xM-xP,2) + pow(yM-yP,2);
-          if(dist>NODETOL){
-            //This shouldn't happen
-            std::stringstream ss;
-            ss << "Unable to match face node, face: " << fM
-               << ", matching face: " << fP
-               << ", rotation: " << rot
-               << ", node: " << n
-               << ". Is the reference node set not symmetric?";
-            HIPBONE_ABORT(ss.str())
-          }
-        }
-      }
-    }
-  }
-
-  x0.free(); y0.free();
-  x1.free(); y1.free();
+  memory<int> R(Nfaces*Nfaces*NfaceVertices*Nfp);
+  FaceNodeMatchingHex3D(N, r.ptr(), s.ptr(), t.ptr(),
+                        faceNodes.ptr(), R.ptr());
 
   /* volume indices of the interior and exterior face nodes for each element */
   vmapM.malloc(Nfp*Nfaces*Nelements);
@@ -213,27 +74,10 @@ void mesh_t::ConnectFaceNodes(){
           const dlong id = eM*Nfaces*Nfp + fM*Nfp + nM;
           vmapM[id] = idM + eM*Np;
           vmapP[id] = idP + eP*Np;
-
-      #if 0
-        /*Sanity check*/
-        dfloat xnM = x[idM + eM*Np];
-        dfloat ynM = y[idM + eM*Np];
-        dfloat znM = z[idM + eM*Np];
-        dfloat xnP = x[idP + eP*Np];
-        dfloat ynP = y[idP + eP*Np];
-        dfloat znP = z[idP + eP*Np];
-        const dfloat dist = pow(xnM-xnP,2) + pow(ynM-ynP,2) + pow(znM-znP,2);
-        if (dist>NODETOL)
-          printf("Mismatch?: Element %d, face %d, node %d, xM = (%f, %f, %f), xP = (%f, %f, %f)\n",
-                  eM, fM, nM, xnM, ynM, znM, xnP, ynP, znP);
-      #endif
         }
       }
     }
   }
-
-//      printf("connecting (%d,%d) to (%d,%d) [ vmapM %d to vmapP %d ]\n",
-//             e,f,eP,fP, vmapM[id], vmapP[id]);
 }
 
 } //namespace libp
diff --git a/libs/mesh/meshConnectFaceVertices.cpp b/libs/mesh/meshConnectFaceVertices.cpp
index bc8a06e..322e1a8 100644
--- a/libs/mesh/meshConnectFaceVertices.cpp
+++ b/libs/mesh/meshConnectFaceVertices.cpp
@@ -33,7 +33,7 @@ void mesh_t::ConnectFaceVertices(){
 
   //allocate and fill a halo region in element-to-vertex mapping
   EToV.realloc((Nelements+totalHaloPairs)*Nverts);
-  halo.Exchange(EToV.ptr(), Nverts, ogs::Hlong);
+  halo.Exchange(EToV, Nverts);
 
   /* volume indices of the interior and exterior face vertices for each element */
   VmapM.malloc(NfaceVertices*Nfaces*Nelements);
diff --git a/libs/mesh/meshConnectNodes.cpp b/libs/mesh/meshConnectNodes.cpp
index 7a3cf02..e4e15df 100644
--- a/libs/mesh/meshConnectNodes.cpp
+++ b/libs/mesh/meshConnectNodes.cpp
@@ -31,18 +31,10 @@ namespace libp {
 // uniquely label each node with a global index, used for gatherScatter
 void mesh_t::ConnectNodes(){
 
-  dlong localNodeCount = Np*Nelements;
-  libp::memory<dlong> allLocalNodeCounts(platform.size);
-
-  MPI_Allgather(&localNodeCount,    1, MPI_DLONG,
-                allLocalNodeCounts.ptr(), 1, MPI_DLONG,
-                comm);
-
-  hlong gatherNodeStart = 0;
-  for(int rr=0;rr<rank;++rr)
-    gatherNodeStart += allLocalNodeCounts[rr];
-
-  allLocalNodeCounts.free();
+  hlong localNnodes = Np*Nelements;
+  hlong gatherNodeStart = localNnodes;
+  comm.Scan(localNnodes, gatherNodeStart);
+  gatherNodeStart -= localNnodes;
 
   // form continuous node numbering (local=>virtual gather)
   globalIds.malloc((totalHaloPairs+Nelements)*Np);
@@ -56,16 +48,16 @@ void mesh_t::ConnectNodes(){
     }
   }
 
-  hlong localChange = 0, gatherChange = 1;
+  hlong gatherChange = 1;
 
   // keep comparing numbers on positive and negative traces until convergence
   while(gatherChange>0){
 
     // reset change counter
-    localChange = 0;
+    gatherChange = 0;
 
     // send halo data and recv into extension of buffer
-    halo.Exchange(globalIds.ptr(), Np, ogs::Hlong);
+    halo.Exchange(globalIds, Np);
 
     // compare trace nodes
     #pragma omp parallel for collapse(2)
@@ -78,14 +70,14 @@ void mesh_t::ConnectNodes(){
         hlong gidP = globalIds[idP];
 
         if(gidP<gidM){
-          ++localChange;
+          ++gatherChange;
           globalIds[idM] = gidP;
         }
       }
     }
 
     // sum up changes
-    MPI_Allreduce(&localChange, &gatherChange, 1, MPI_HLONG, MPI_SUM, comm);
+    comm.Allreduce(gatherChange);
   }
 }
 
diff --git a/libs/mesh/meshGatherScatterSetup.cpp b/libs/mesh/meshGatherScatterSetup.cpp
index a64a68a..35599c4 100644
--- a/libs/mesh/meshGatherScatterSetup.cpp
+++ b/libs/mesh/meshGatherScatterSetup.cpp
@@ -32,25 +32,25 @@ void mesh_t::GatherScatterSetup() {
 
   dlong Ntotal = Nverts*(Nelements+totalHaloPairs);
 
-  libp::memory<int> minRank(Ntotal);
-  libp::memory<int> maxRank(Ntotal);
+  memory<int> minRank(Ntotal);
+  memory<int> maxRank(Ntotal);
 
   for (dlong i=0;i<Ntotal;i++) {
     minRank[i] = rank;
     maxRank[i] = rank;
   }
 
-  hlong localChange = 0, gatherChange = 1;
+  hlong gatherChange = 1;
 
   // keep comparing numbers on positive and negative traces until convergence
   while(gatherChange>0){
 
     // reset change counter
-    localChange = 0;
+    gatherChange = 0;
 
     // send halo data and recv into extension of buffer
-    halo.Exchange(minRank.ptr(), Nverts, ogs::Int32);
-    halo.Exchange(maxRank.ptr(), Nverts, ogs::Int32);
+    halo.Exchange(minRank, Nverts);
+    halo.Exchange(maxRank, Nverts);
 
     // compare trace vertices
     #pragma omp parallel for collapse(2)
@@ -67,19 +67,19 @@ void mesh_t::GatherScatterSetup() {
         int maxRankP = maxRank[idP];
 
         if(minRankP<minRankM){
-          localChange=1;
+          gatherChange=1;
           minRank[idM] = minRankP;
         }
 
         if(maxRankP>maxRankM){
-          localChange=1;
+          gatherChange=1;
           maxRank[idM] = maxRankP;
         }
       }
     }
 
     // sum up changes
-    MPI_Allreduce(&localChange, &gatherChange, 1, MPI_HLONG, MPI_MAX, comm);
+    comm.Allreduce(gatherChange);
   }
 
   // count elements that contribute to global C0 gather-scatter
@@ -152,14 +152,14 @@ void mesh_t::GatherScatterSetup() {
   //use the masked ids to make another gs handle (signed so the gather is defined)
   bool verbose = platform.settings().compareSetting("VERBOSE", "TRUE");
   bool unique = true; //flag a unique node in every gather node
-  ogsMasked.Setup(Nelements*Np, maskedGlobalIds.ptr(),
+  ogsMasked.Setup(Nelements*Np, maskedGlobalIds,
                   comm, ogs::Signed, ogs::Auto,
                   unique, verbose, platform);
 
   gHalo.SetupFromGather(ogsMasked);
 
   GlobalToLocal.malloc(Nelements*Np);
-  ogsMasked.SetupGlobalToLocalMapping(GlobalToLocal.ptr());
+  ogsMasked.SetupGlobalToLocalMapping(GlobalToLocal);
 
   o_GlobalToLocal = platform.malloc(GlobalToLocal);
 
@@ -175,30 +175,30 @@ void mesh_t::GatherScatterSetup() {
   #pragma omp parallel for
   for(dlong n=0;n<Ntotal;++n) weight[n] = 1.0;
 
-  ogsMasked.Gather(weightG.ptr(), weight.ptr(), 1, ogs::Dfloat, ogs::Add, ogs::Trans);
+  ogsMasked.Gather(weightG, weight, 1, ogs::Add, ogs::Trans);
 
   #pragma omp parallel for
   for(dlong n=0;n<ogsMasked.Ngather;++n)
     if (weightG[n]) weightG[n] = 1./weightG[n];
 
-  ogsMasked.Scatter(weight.ptr(), weightG.ptr(), 1, ogs::Dfloat, ogs::Add, ogs::NoTrans);
+  ogsMasked.Scatter(weight, weightG, 1, ogs::NoTrans);
 
   // o_weight  = device.malloc(Ntotal*sizeof(dfloat), weight);
   // o_weightG = device.malloc(ogsMasked.Ngather*sizeof(dfloat), weightG);
 
   // create a global numbering system
-  libp::memory<hlong> newglobalIds(Ngather);
+  memory<hlong> newglobalIds(Ngather);
 
   // every gathered degree of freedom has its own global id
-  libp::memory<hlong> globalStarts(size+1, 0);
-  MPI_Allgather(&Ngather, 1, MPI_HLONG, globalStarts.ptr()+1, 1, MPI_HLONG, comm);
-  for(int rr=0;rr<size;++rr)
-    globalStarts[rr+1] = globalStarts[rr] + globalStarts[rr+1];
+  hlong localNgather = Ngather;
+  hlong gatherOffset = localNgather;
+  comm.Scan(localNgather, gatherOffset);
+  gatherOffset -= localNgather;
 
   //use the offsets to set a consecutive global numbering
   #pragma omp parallel for
   for (dlong n =0;n<ogsMasked.Ngather;n++) {
-    newglobalIds[n] = n + globalStarts[rank];
+    newglobalIds[n] = n + gatherOffset;
   }
 
   //scatter this numbering to the original nodes
@@ -207,7 +207,7 @@ void mesh_t::GatherScatterSetup() {
   #pragma omp parallel for
   for (dlong n=0;n<Ntotal;n++) maskedGlobalNumbering[n] = -1;
 
-  ogsMasked.Scatter(maskedGlobalNumbering.ptr(), newglobalIds.ptr(), 1, ogs::Hlong, ogs::Add, ogs::NoTrans);
+  ogsMasked.Scatter(maskedGlobalNumbering, newglobalIds, 1, ogs::NoTrans);
 }
 
-} //namespace libp
\ No newline at end of file
+} //namespace libp
diff --git a/libs/mesh/meshHaloSetup.cpp b/libs/mesh/meshHaloSetup.cpp
index b828e83..78c91a9 100644
--- a/libs/mesh/meshHaloSetup.cpp
+++ b/libs/mesh/meshHaloSetup.cpp
@@ -32,11 +32,11 @@ namespace libp {
 // exchange of elements or trace nodes
 void mesh_t::HaloSetup(){
 
-  libp::memory<hlong> globalOffset(size+1, 0);
-  hlong localNelements = (hlong) Nelements;
+  memory<hlong> globalOffset(size+1, 0);
 
   //gather number of elements on each rank
-  MPI_Allgather(&localNelements, 1, MPI_HLONG, globalOffset.ptr()+1, 1, MPI_HLONG, comm);
+  hlong localNelements = Nelements;
+  comm.Allgather(localNelements, globalOffset+1);
 
   for(int rr=0;rr<size;++rr)
     globalOffset[rr+1] = globalOffset[rr]+globalOffset[rr+1];
@@ -85,7 +85,7 @@ void mesh_t::HaloSetup(){
   }
 
   //make a list of global element ids taking part in the halo exchange
-  libp::memory<hlong> globalElementId(Nelements+totalHaloPairs);
+  memory<hlong> globalElementId(Nelements+totalHaloPairs);
 
   //outgoing elements
   for(int e=0;e<Nelements;++e)
@@ -110,8 +110,8 @@ void mesh_t::HaloSetup(){
   //make a halo exchange op
   bool verbose = false;
   halo.Setup(Nelements+totalHaloPairs,
-             globalElementId.ptr(), comm,
+             globalElementId, comm,
              ogs::Pairwise, verbose, platform);
 }
 
-} //namespace libp
\ No newline at end of file
+} //namespace libp
diff --git a/libs/mesh/meshOccaSetup.cpp b/libs/mesh/meshOccaSetup.cpp
index 3b316c0..87fb401 100644
--- a/libs/mesh/meshOccaSetup.cpp
+++ b/libs/mesh/meshOccaSetup.cpp
@@ -30,12 +30,10 @@ namespace libp {
 
 void mesh_t::OccaSetup(){
 
-  o_internalElementIds = platform.malloc(internalElementIds);
-  o_haloElementIds = platform.malloc(haloElementIds);
-  o_globalGatherElementList = platform.malloc(globalGatherElementList);
-  o_localGatherElementList = platform.malloc(localGatherElementList);
-
-  defaultStream = platform.device.getStream();
+  o_internalElementIds = platform.malloc<dlong>(internalElementIds);
+  o_haloElementIds = platform.malloc<dlong>(haloElementIds);
+  o_globalGatherElementList = platform.malloc<dlong>(globalGatherElementList);
+  o_localGatherElementList = platform.malloc<dlong>(localGatherElementList);
 
   props = platform.props(); //copy platform props
 
@@ -56,9 +54,8 @@ void mesh_t::OccaSetup(){
   props["defines/" "p_G22ID"]= G22ID;
   props["defines/" "p_GWJID"]= GWJID;
 
-  o_D = platform.malloc(D);
-
-  o_ggeo = platform.malloc(ggeo);
+  o_D = platform.malloc<dfloat>(D);
+  o_ggeo = platform.malloc<dfloat>(ggeo);
 }
 
 } //namespace libp
diff --git a/libs/mesh/meshPhysicalNodes.cpp b/libs/mesh/meshPhysicalNodes.cpp
index e87f2ff..2d52da4 100644
--- a/libs/mesh/meshPhysicalNodes.cpp
+++ b/libs/mesh/meshPhysicalNodes.cpp
@@ -108,9 +108,9 @@ void mesh_t::PhysicalNodes(){
     }
   }
 
-  halo.Exchange(x.ptr(), Np, ogs::Dfloat);
-  halo.Exchange(y.ptr(), Np, ogs::Dfloat);
-  halo.Exchange(z.ptr(), Np, ogs::Dfloat);
+  halo.Exchange(x, Np);
+  halo.Exchange(y, Np);
+  halo.Exchange(z, Np);
 }
 
-} //namespace libp
\ No newline at end of file
+} //namespace libp
diff --git a/libs/mesh/meshSetupBox.cpp b/libs/mesh/meshSetupBox.cpp
index 6b2b8ca..c8c9a4c 100644
--- a/libs/mesh/meshSetupBox.cpp
+++ b/libs/mesh/meshSetupBox.cpp
@@ -60,19 +60,18 @@ void mesh_t::SetupBox(){
 
     // size is total number of ranks and populated by the mpi communicator
     size_x = std::cbrt(size); //number of ranks in each dimension
-    if (size_x*size_x*size_x != size)
-      HIPBONE_ABORT(std::string("3D BOX mesh requires a cubic number of MPI ranks since px,py,pz have not been provided."))
+    LIBP_ABORT("3D BOX mesh requires a cubic number of MPI ranks since px,py,pz have not been provided.",
+               size_x*size_x*size_x != size);
 
     size_y = size_x;
     size_z = size_x;
 
   }
-  else if (size_x * size_y * size_z != size) {
-    // User provided only *some* of px, py, pz, so check if they multiply to
-    // the right thing.
 
-    HIPBONE_ABORT(std::string("3D BOX mesh requires the user specifies all of px, py, pz, or none of px, py, pz.  If all are provided, their product must equal the total number of MPI ranks"))
-  }
+  // User provided only *some* of px, py, pz, so check if they multiply to
+  // the right thing.
+  LIBP_ABORT("3D BOX mesh requires the user specifies all of px, py, pz, or none of px, py, pz.  If all are provided, their product must equal the total number of MPI ranks",
+             size_x * size_y * size_z != size);
 
   //local grid physical sizes
   dfloat dimx = DIMX/size_x;

From db1e354e1abaac75af8096d5030f3380b03eb032 Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Tue, 8 Mar 2022 16:15:10 -0600
Subject: [PATCH 10/12] Update hipBone to use latest libs

---
 README.md        |  5 -----
 hipBone.cpp      | 10 ++++------
 hipBone.hpp      | 12 +++++-------
 make.top         |  2 +-
 src/operator.cpp | 12 ++++++------
 src/run.cpp      | 27 ++++++++++-----------------
 src/settings.cpp | 11 ++++-------
 src/setup.cpp    |  7 +++----
 8 files changed, 33 insertions(+), 53 deletions(-)

diff --git a/README.md b/README.md
index 956c5ba..9ef7f13 100644
--- a/README.md
+++ b/README.md
@@ -89,9 +89,4 @@ To clean the `hipBone` build objects:
     $ cd /path/to/hipBone/repo
     $ make realclean
 
-To clean JIT kernel objects:
-
-    $ cd /path/to/hipBone/repo
-    $ rm -r .occa
-
 Please invoke `make help` for more supported options.
diff --git a/hipBone.cpp b/hipBone.cpp
index 0028514..e1ac9ee 100644
--- a/hipBone.cpp
+++ b/hipBone.cpp
@@ -29,11 +29,10 @@ SOFTWARE.
 int main(int argc, char **argv){
 
   // start up MPI
-  MPI_Init(&argc, &argv);
+  comm_t::Init(argc, argv);
 
   { /*Scope so everything is destructed before MPI_Finalize */
-
-    MPI_Comm comm = MPI_COMM_WORLD;
+    comm_t comm(comm_t::world().Dup());
 
     hipBoneSettings_t settings(argc, argv, comm);
     if (settings.compareSetting("VERBOSE", "TRUE"))
@@ -50,10 +49,9 @@ int main(int argc, char **argv){
 
     // run
     hb.Run();
-
   }
 
   // close down MPI
-  MPI_Finalize();
-  return HIPBONE_SUCCESS;
+  comm_t::Finalize();
+  return LIBP_SUCCESS;
 }
diff --git a/hipBone.hpp b/hipBone.hpp
index 5b77a5c..d39c803 100644
--- a/hipBone.hpp
+++ b/hipBone.hpp
@@ -39,7 +39,7 @@ using namespace libp;
 
 class hipBoneSettings_t: public settings_t {
 public:
-  hipBoneSettings_t(const int argc, char** argv, MPI_Comm& _comm);
+  hipBoneSettings_t(const int argc, char** argv, comm_t _comm);
   void report();
 };
 
@@ -50,10 +50,10 @@ class hipBone_t: public solver_t {
 
   dfloat lambda;
 
-  occa::memory o_AqL;
+  deviceMemory<dfloat> o_AqL;
 
-  occa::kernel operatorKernel;
-  occa::kernel forcingKernel;
+  kernel_t operatorKernel;
+  kernel_t forcingKernel;
 
   hipBone_t() = default;
   hipBone_t(platform_t& _platform, mesh_t &_mesh) {
@@ -65,9 +65,7 @@ class hipBone_t: public solver_t {
 
   void Run();
 
-  void PlotFields(dfloat* Q, char *fileName);
-
-  void Operator(occa::memory& o_q, occa::memory& o_Aq);
+  void Operator(deviceMemory<dfloat>& o_q, deviceMemory<dfloat>& o_Aq);
 };
 
 
diff --git a/make.top b/make.top
index fa882d5..530f9bb 100644
--- a/make.top
+++ b/make.top
@@ -51,7 +51,7 @@ export HIPBONE_LIBS= ${HIPBONE_BLAS_LIB}  \
 ifneq (,${debug})
   export HIPBONE_CFLAGS=-O0 -g -Wall -Wshadow -Wno-unused-function -Wno-unknown-pragmas
   export HIPBONE_CXXFLAGS=-O0 -g -Wall -Wshadow -Wno-unused-function -Wno-unknown-pragmas -std=c++17
-  export HIPBONE_DEFINES=-DHIPBONE_DEBUG
+  export HIPBONE_DEFINES=-DLIBP_DEBUG
 else
   export HIPBONE_CFLAGS=-fopenmp -O3 -Wall -Wshadow -Wno-unused-function
   export HIPBONE_CXXFLAGS=-fopenmp -O3 -Wall -Wshadow -Wno-unused-function -std=c++17
diff --git a/src/operator.cpp b/src/operator.cpp
index a3b6246..00d5af3 100644
--- a/src/operator.cpp
+++ b/src/operator.cpp
@@ -26,9 +26,9 @@
 
 #include "hipBone.hpp"
 
-void hipBone_t::Operator(occa::memory &o_q, occa::memory &o_Aq){
+void hipBone_t::Operator(deviceMemory<dfloat> &o_q, deviceMemory<dfloat> &o_Aq){
 
-  mesh.gHalo.ExchangeStart(o_q, 1, ogs::Dfloat);
+  mesh.gHalo.ExchangeStart(o_q, 1);
 
   if(mesh.NlocalGatherElements/2){
     operatorKernel(mesh.NlocalGatherElements/2,
@@ -40,7 +40,7 @@ void hipBone_t::Operator(occa::memory &o_q, occa::memory &o_Aq){
   }
 
   // finalize halo exchange
-  mesh.gHalo.ExchangeFinish(o_q, 1, ogs::Dfloat);
+  mesh.gHalo.ExchangeFinish(o_q, 1);
 
   if(mesh.NglobalGatherElements) {
     operatorKernel(mesh.NglobalGatherElements,
@@ -52,17 +52,17 @@ void hipBone_t::Operator(occa::memory &o_q, occa::memory &o_Aq){
   }
 
   //gather result to Aq
-  mesh.ogsMasked.GatherStart(o_Aq, o_AqL, 1, ogs::Dfloat, ogs::Add, ogs::Trans);
+  mesh.ogsMasked.GatherStart(o_Aq, o_AqL, 1, ogs::Add, ogs::Trans);
 
   if((mesh.NlocalGatherElements+1)/2){
     operatorKernel((mesh.NlocalGatherElements+1)/2,
-                   mesh.o_localGatherElementList+(mesh.NlocalGatherElements/2)*sizeof(dlong),
+                   mesh.o_localGatherElementList+(mesh.NlocalGatherElements/2),
                    mesh.o_GlobalToLocal,
                    mesh.o_ggeo,
                    mesh.o_D,
                    lambda, o_q, o_AqL);
   }
 
-  mesh.ogsMasked.GatherFinish(o_Aq, o_AqL, 1, ogs::Dfloat, ogs::Add, ogs::Trans);
+  mesh.ogsMasked.GatherFinish(o_Aq, o_AqL, 1, ogs::Add, ogs::Trans);
 }
 
diff --git a/src/run.cpp b/src/run.cpp
index ff0d07e..5bba158 100644
--- a/src/run.cpp
+++ b/src/run.cpp
@@ -25,21 +25,22 @@ SOFTWARE.
 */
 
 #include "hipBone.hpp"
+#include "timer.hpp"
 
 void hipBone_t::Run(){
 
   //setup linear solver
   dlong N = mesh.ogsMasked.Ngather;
   dlong Nhalo = mesh.gHalo.Nhalo;
-  linearSolver_t *linearSolver = new cg(platform, N, Nhalo);
+  cg linearSolver(platform, N, Nhalo);
 
   hlong NGlobal = mesh.ogsMasked.NgatherGlobal;
   dlong NLocal = mesh.Np*mesh.Nelements;
 
   //create occa buffers
   dlong Nall = N+Nhalo;
-  occa::memory o_r = platform.malloc(Nall*sizeof(dfloat));
-  occa::memory o_x = platform.malloc(Nall*sizeof(dfloat));
+  deviceMemory<dfloat> o_r = platform.malloc<dfloat>(Nall);
+  deviceMemory<dfloat> o_x = platform.malloc<dfloat>(Nall);
 
   //set x =0
   platform.linAlg().set(Nall, 0.0, o_x);
@@ -50,24 +51,19 @@ void hipBone_t::Run(){
   int maxIter = 100;
   int verbose = platform.settings().compareSetting("VERBOSE", "TRUE") ? 1 : 0;
 
-  platform.device.finish();
-  MPI_Barrier(mesh.comm);
-  double startTime = MPI_Wtime();
+  timePoint_t startTime = GlobalPlatformTime(platform);
 
   //call the solver
   dfloat tol = 0.0;
-  int Niter = linearSolver->Solve(*this, o_x, o_r, tol, maxIter, verbose);
+  int Niter = linearSolver.Solve(*this, o_x, o_r, tol, maxIter, verbose);
 
-  platform.device.finish();
-  MPI_Barrier(mesh.comm);
-  double endTime = MPI_Wtime();
-  double elapsedTime = endTime - startTime;
+  timePoint_t endTime = GlobalPlatformTime(platform);
+  double elapsedTime = ElapsedTime(startTime, endTime);
 
   int Np = mesh.Np, Nq = mesh.Nq;
 
-  hlong NunMasked = NLocal - mesh.Nmasked;
-  hlong NunMaskedGlobal;
-  MPI_Allreduce(&NunMasked, &NunMaskedGlobal, 1, MPI_HLONG, MPI_SUM, mesh.comm);
+  hlong NunMaskedGlobal = NLocal - mesh.Nmasked;
+  mesh.comm.Allreduce(NunMaskedGlobal);
 
   hlong Ndofs = NGlobal;
 
@@ -109,7 +105,4 @@ void hipBone_t::Run(){
 
     printf("hipBone: NekBone FOM = %4.1f GFLOPs. \n", NflopsNekbone/(1.0e9 * elapsedTime));
   }
-
-  o_r.free(); o_x.free();
-  delete linearSolver;
 }
diff --git a/src/settings.cpp b/src/settings.cpp
index 905a89b..0772bb3 100644
--- a/src/settings.cpp
+++ b/src/settings.cpp
@@ -27,11 +27,11 @@ SOFTWARE.
 #include "hipBone.hpp"
 
 //settings for hipBone solver
-hipBoneSettings_t::hipBoneSettings_t(const int argc, char** argv, MPI_Comm &_comm):
+hipBoneSettings_t::hipBoneSettings_t(const int argc, char** argv, comm_t _comm):
   settings_t(_comm) {
 
   platformAddSettings(*this);
-  libp::meshAddSettings(*this);
+  meshAddSettings(*this);
 
   newSetting("-v", "--verbose",
              "VERBOSE",
@@ -44,12 +44,9 @@ hipBoneSettings_t::hipBoneSettings_t(const int argc, char** argv, MPI_Comm &_com
 
 void hipBoneSettings_t::report() {
 
-  int rank;
-  MPI_Comm_rank(comm, &rank);
-
-  if (rank==0) {
+  if (comm.rank()==0) {
     std::cout << "Settings:\n\n";
     platformReportSettings(*this);
-    libp::meshReportSettings(*this);
+    meshReportSettings(*this);
   }
 }
diff --git a/src/setup.cpp b/src/setup.cpp
index fb10579..cef0998 100644
--- a/src/setup.cpp
+++ b/src/setup.cpp
@@ -34,17 +34,16 @@ void hipBone_t::Setup(platform_t& _platform, mesh_t& _mesh){
   lambda = 0.1; //hard code
 
   //setup linear algebra module
-  platform.linAlg().InitKernels({"set", "axpy", "innerProd", "norm2"},
-                                mesh.comm);
+  platform.linAlg().InitKernels({"set", "axpy", "innerProd", "norm2"});
 
   //Trigger JIT kernel builds
   ogs::InitializeKernels(platform, ogs::Dfloat, ogs::Add);
 
   //tmp local storage buffer for Ax op
-  o_AqL = platform.malloc(mesh.Np*mesh.Nelements*sizeof(dfloat));
+  o_AqL = platform.malloc<dfloat>(mesh.Np*mesh.Nelements);
 
   // OCCA build stuff
-  occa::properties kernelInfo = mesh.props; //copy mesh occa properties
+  properties_t kernelInfo = mesh.props; //copy mesh occa properties
 
   // Ax kernel
   operatorKernel = platform.buildKernel(DHIPBONE "/okl/hipBoneAx.okl",

From 21cbf97ec69c690c77d7d58b5df44f06d572419c Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Thu, 10 Mar 2022 09:03:12 -0600
Subject: [PATCH 11/12] Small tweak to block size

---
 include/ogs/ogsOperator.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ogs/ogsOperator.hpp b/include/ogs/ogsOperator.hpp
index bd33c6f..811a1af 100644
--- a/include/ogs/ogsOperator.hpp
+++ b/include/ogs/ogsOperator.hpp
@@ -120,7 +120,7 @@ class ogsOperator_t {
 
   //NC: Hard code these for now. Should be sufficient for GPU devices, but needs attention for CPU
   static constexpr int blockSize = 256;
-  static constexpr int gatherNodesPerBlock = 1024; //should be a multiple of blockSize for good unrolling
+  static constexpr int gatherNodesPerBlock = 512; //should be a multiple of blockSize for good unrolling
 
   //4 types - Float, Double, Int32, Int64
   //4 ops - Add, Mul, Max, Min

From 65fd9da7ecfa76466a90e36182ce6dd8bc3bc1b2 Mon Sep 17 00:00:00 2001
From: Noel Chalmers <noel.chalmers@gmail.com>
Date: Mon, 14 Mar 2022 18:04:04 -0500
Subject: [PATCH 12/12] Remove ambiguous scan overload

---
 include/comm.hpp | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/include/comm.hpp b/include/comm.hpp
index 7781da0..3d942c7 100644
--- a/include/comm.hpp
+++ b/include/comm.hpp
@@ -382,13 +382,6 @@ class comm_t {
     MPI_Scan(&snd, &rcv, 1, type, op, comm());
     mpiType<T>::freeMpiType(type);
   }
-  template <typename T>
-  void Scan(T& snd,
-            const op_t op = Sum) const {
-    T rcv=snd;
-    Scan(snd, rcv, op);
-    snd = rcv;
-  }
 
   /*libp::memory gather*/
   template <template<typename> class mem, typename T>