diff --git a/src/all_pairs.h b/src/all_pairs.h
index 68d7cfa..edc5962 100644
--- a/src/all_pairs.h
+++ b/src/all_pairs.h
@@ -16,10 +16,11 @@ void all_pairs_force(System<T, N>& system) {
   auto it = counting_iterator<uint64_t>(0);
   std::for_each_n(par_unseq, it, system.size, [s = system.state()](auto i) {
     auto ai = vec<T, N>::splat(0);
-    auto pi = s.x[i];
+    auto pi = s.p[i].x();
     for (typename System<T, N>::index_t j = 0; j < s.sz; j++) {
-      auto pj = s.x[j];
-      ai += s.m[j] * (pj - pi) / dist3(pi, pj);
+      if (i == j) continue;
+      auto [mj, pj] = s.p[j];
+      ai += mj * (pj - pi) / dist3(pi, pj);
     }
     s.a[i] = s.c * ai;
   });
@@ -38,9 +39,11 @@ void all_pairs_collapsed_force(System<T, N>& system) {
       return;
     }
 
-    auto pi = s.x[i];
-    auto pj = s.x[j];
-    auto a  = s.c * s.m[j] * (pj - pi) / dist3(pi, pj);
+    // TODO: we should exploit the symmetry of the force pairs to do
+    // (N^2)/2 computations here by taking this "a" and doing "s.a[j] -= a / mj * mi".
+    auto [mi, pi] = s.p[i];
+    auto [mj, pj] = s.p[j];
+    auto a  = s.c * mj * (pj - pi) / dist3(pi, pj);
     atomic_ref<T>{s.a[i][0]}.fetch_add(a[0], memory_order_relaxed);
     atomic_ref<T>{s.a[i][1]}.fetch_add(a[1], memory_order_relaxed);
   });
diff --git a/src/alloc.h b/src/alloc.h
index 1469047..9ab4602 100644
--- a/src/alloc.h
+++ b/src/alloc.h
@@ -2,6 +2,7 @@
 #include <cstdint>
 #include <cstdlib>
 #include <memory>
+#include <vector>
 
 template <typename T>
 void alloc(T*& ptr, std::size_t n) {
@@ -50,3 +51,6 @@ template <typename T, typename U>
 constexpr bool operator!=(allocator<T> const &, allocator<U> const &) noexcept {
   return false;
 }
+
+template <typename T>
+using vector = std::vector<T, allocator<T>>;
diff --git a/src/bvh.h b/src/bvh.h
index 1f87632..8071670 100644
--- a/src/bvh.h
+++ b/src/bvh.h
@@ -15,10 +15,10 @@ using clock_timer = std::chrono::steady_clock;
 
 /// Computes the bounding box of the grid.
 template <typename T, dim_t N>
-aabb<T, N> bounding_box(std::span<vec<T, N>> xs) {
+aabb<T, N> bounding_box(std::span<monopole<T, N>> ms) {
   return std::transform_reduce(
-   par_unseq, xs.begin(), xs.end(), aabb<T, N>(from_points, vec<T, N>::splat(0.)),
-   [](auto a, auto b) { return merge(a, b); }, [](auto a) { return aabb<T, N>(from_points, a); });
+   par_unseq, ms.begin(), ms.end(), aabb<T, N>(from_points, vec<T, N>::splat(0.)),
+   [](auto a, auto b) { return merge(a, b); }, [](auto a) { return aabb<T, N>(from_points, a.x()); });
 }
 
 /// Sorts bodies along the Hilbert curve.
@@ -35,11 +35,11 @@ void hilbert_sort(System<T, N>& system, aabb<T, N> bbox) {
 
   // Compute the Hilbert index for each body in the Cartesian grid
   auto bids = system.body_indices();
-  static std::vector<uint64_t> hilbert_ids(system.size);
+  static vector<uint64_t> hilbert_ids(system.size);
   std::for_each(par_unseq, bids.begin(), bids.end(),
-                [hids = hilbert_ids.data(), x = system.x.data(), mins = bbox.xmin, grid_cell_size](auto idx) {
+                [hids = hilbert_ids.data(), p = system.p.data(), mins = bbox.xmin, grid_cell_size](auto idx) {
                   // Bucket the body into a Cartesian grid cell:
-                  vec<uint32_t, N> cell_idx = cast<uint32_t>((x[idx] - mins) / grid_cell_size);
+                  vec<uint32_t, N> cell_idx = cast<uint32_t>((p[idx].x() - mins) / grid_cell_size);
                   // Compute the Hilber index of the cell and assign it to the body:
                   hids[idx] = hilbert(cell_idx);
                 });
@@ -48,9 +48,9 @@ void hilbert_sort(System<T, N>& system, aabb<T, N> bbox) {
 #if defined(__NVCOMPILER)
   // Workaround for nvc++: we can use Thrust zip_iterator, which predates zip_view, but provides the same functionality,
   // and works just fine:
-  auto b = thrust::make_zip_iterator(hilbert_ids.begin(), system.x.begin(), system.m.begin(), system.v.begin(),
+  auto b = thrust::make_zip_iterator(hilbert_ids.begin(), system.p.begin(), system.v.begin(),
                                      system.a.begin(), system.ao.begin());
-  auto e = thrust::make_zip_iterator(hilbert_ids.end(), system.x.end(), system.m.end(), system.v.end(), system.a.end(),
+  auto e = thrust::make_zip_iterator(hilbert_ids.end(), system.p.end(), system.v.end(), system.a.end(),
                                      system.ao.end());
   std::sort(par_unseq, b, e, [](auto a, auto b) { return thrust::get<0>(a) < thrust::get<0>(b); });
 #elif defined(__clang__) || (__cplusplus < 202302L)
@@ -59,7 +59,7 @@ void hilbert_sort(System<T, N>& system, aabb<T, N> bbox) {
   // TODO: sort an array of keys and then apply a permutation in O(N) time and O(1) storage
   // (instead of O(N) time and O(N) storage).
 
-  static std::vector<std::pair<uint64_t, std::size_t>> hilbert_index_map(system.size);
+  static vector<std::pair<uint64_t, std::size_t>> hilbert_index_map(system.size);
   std::for_each_n(par_unseq, counting_iterator<std::size_t>(0), system.size,
                   [hmap = hilbert_index_map.data(), hids = hilbert_ids.data()](std::size_t idx) {
                     hmap[idx] = std::make_pair(hids[idx], idx);
@@ -70,27 +70,26 @@ void hilbert_sort(System<T, N>& system, aabb<T, N> bbox) {
 
   // create temp copy of system so that we don't get race conditions when
   // rearranging values in the next step
-  static std::vector<std::tuple<vec<T, N>, T, vec<T, N>, vec<T, N>, vec<T, N>>> tmp_system(system.size);
+  static vector<std::tuple<monopole<T, N>, vec<T, N>, vec<T, N>, vec<T, N>>> tmp_system(system.size);
   std::for_each_n(par_unseq, counting_iterator<std::size_t>(0), system.size,
-                  [tmp_sys = tmp_system.data(), x = system.x.data(), m = system.m.data(), v = system.v.data(),
+                  [tmp_sys = tmp_system.data(), p = system.p.data(), v = system.v.data(),
                    a = system.a.data(), ao = system.ao.data()](std::size_t idx) {
-                    tmp_sys[idx] = std::make_tuple(x[idx], m[idx], v[idx], a[idx], ao[idx]);
+                    tmp_sys[idx] = std::make_tuple(p[idx], v[idx], a[idx], ao[idx]);
                   });
 
   // copy back
   std::for_each_n(par_unseq, counting_iterator<std::size_t>(0), system.size,
-                  [tmp_sys = tmp_system.data(), hmap = hilbert_index_map.data(), x = system.x.data(),
-                   m = system.m.data(), v = system.v.data(), a = system.a.data(), ao = system.ao.data()](auto idx) {
+                  [tmp_sys = tmp_system.data(), hmap = hilbert_index_map.data(), p = system.p.data(),
+		   v = system.v.data(), a = system.a.data(), ao = system.ao.data()](auto idx) {
                     std::size_t original_index = hmap[idx].second;
                     auto e                     = tmp_sys[original_index];
-                    x[idx]                     = std::get<0>(e);
-                    m[idx]                     = std::get<1>(e);
-                    v[idx]                     = std::get<2>(e);
-                    a[idx]                     = std::get<3>(e);
-                    ao[idx]                    = std::get<4>(e);
+                    p[idx]                     = std::get<0>(e);
+                    v[idx]                     = std::get<1>(e);
+                    a[idx]                     = std::get<2>(e);
+                    ao[idx]                    = std::get<3>(e);
                   });
 #else
-  auto r = std::views::zip(hilbert_ids, system.x, system.m, system.v, system.a, system.ao);
+  auto r = std::views::zip(hilbert_ids, system.p, system.v, system.a, system.ao);
   std::sort(par_unseq, r.begin(), r.end(), [](auto a, auto b) { return std::get<0>(a) < std::get<0>(b); });
 #endif
 }
@@ -188,18 +187,21 @@ struct bvh {
         }
 
         if (br >= nbodies) {
-          m[i]    = monopole(s.m[bl], s.x[bl]);
-          auto bb = aabb<T, N>(from_points, s.x[bl]);
+	  auto [ml, xl] = s.p[bl];
+          m[i]    = monopole(ml, xl);
+          auto bb = aabb<T, N>(from_points, xl);
           b[i]    = bb;
           bw[i]   = node_width(bb);
         } else {
-          T mass = s.m[bl] + s.m[br];
+	  auto [ml, xl] = s.p[bl];
+	  auto [mr, xr] = s.p[br];
+          T mass = ml + mr;
 
-          vec<T, N> center_of_mass = s.m[bl] * s.x[bl] + s.m[br] * s.x[br];
+          vec<T, N> center_of_mass = ml * xl + mr * xr;
           center_of_mass /= mass;
           m[i] = monopole(mass, center_of_mass);
 
-          auto bb = aabb<T, N>(from_points, s.x[bl], s.x[br]);
+          auto bb = aabb<T, N>(from_points, xl, xr);
           b[i]    = bb;
           bw[i]   = node_width(bb);
         }
@@ -216,24 +218,24 @@ struct bvh {
         auto bl = (li * 2) + first + count;
         auto br = bl + 1;
 
-        auto ml = m[bl];
-        auto mr = m[br];
+        auto [ml, xl] = m[bl];
+        auto [mr, xr] = m[br];
 
-        auto ibl = ml.mass() != 0.;
-        auto ibr = mr.mass() != 0.;
+        auto ibl = ml != 0.;
+        auto ibr = mr != 0.;
 
         if (!ibl) {
-          m[i] = ml;
+          m[i] = m[bl];
           return;
         }
 
         if (!ibr) {
-          m[i]  = ml;
+          m[i]  = m[bl];
           b[i]  = b[bl];
           bw[i] = bw[bl];
         } else {
-          T mass  = ml.mass() + mr.mass();
-          auto x  = (ml.mass() * ml.x() + mr.mass() * mr.x()) / mass;
+          T mass  = ml + mr;
+          auto x  = (ml * xl + mr * xr) / mass;
           m[i]    = monopole(mass, x);
           auto bb = merge(b[bl], b[br]);
           b[i]    = bb;
@@ -253,7 +255,7 @@ struct bvh {
     node_t nbodies  = system.size;
     auto ids        = system.body_indices();
     std::for_each(par_unseq, ids.begin(), ids.end(), [=, s = system.state(), *this](node_t i) {
-      auto xs = s.x[i];
+      auto [mi, xi] = s.p[i];
 
       node_t tree_index  = 0;
       auto a             = vec<T, N>::splat(0);
@@ -291,10 +293,8 @@ struct bvh {
 
           for (int k = 0; k < 2; ++k) {
             if (bidx < nbodies && bidx != i) {
-              vec<T, N> xj = s.x[bidx];
-              T mj         = s.m[bidx];
-
-              a += mj * (xj - xs) / dist3(xs, xj);
+              auto [mj, xj] = s.p[bidx];
+              a += mj * (xj - xi) / dist3(xi, xj);
             }
             ++bidx;
           }
@@ -303,9 +303,9 @@ struct bvh {
           force_ascend_right();
         } else {
           auto [mj, xj] = m[tree_index];
-          if (can_approximate(xs, xj, bw[tree_index], theta_squared)) {
+          if (can_approximate(xi, xj, bw[tree_index], theta_squared)) {
             // below threshold
-            a += mj * (xj - xs) / dist3(xs, xj);
+            a += mj * (xj - xi) / dist3(xi, xj);
             num_covered_particles += ncontained_leaves_at_level(level, nlevels);
 
             ascend_right();
@@ -359,7 +359,7 @@ void run_bvh(System<T, N>& system, Arguments arguments) {
         dt_force += time([&] {
           // Bounding box
           aabb<T, N> bbox;
-          dt_bbox += time([&] { bbox = bounding_box(std::span{system.x}); });
+          dt_bbox += time([&] { bbox = bounding_box(std::span{system.p}); });
 
           // Sort bodies along Hilbert curve:
           dt_sort += time([&] { hilbert_sort(system, bbox); });
@@ -381,7 +381,7 @@ void run_bvh(System<T, N>& system, Arguments arguments) {
   } else {
     auto kernels = [&] {
       // Bounding box
-      aabb<T, N> bbox = bounding_box(std::span{system.x});
+      aabb<T, N> bbox = bounding_box(std::span{system.p});
 
       // Sort bodies along Hilbert curve:
       hilbert_sort(system, bbox);
diff --git a/src/monopole.h b/src/monopole.h
index 56f980c..25de603 100644
--- a/src/monopole.h
+++ b/src/monopole.h
@@ -6,7 +6,8 @@
 
 template <typename T, dim_t N>
 struct monopole {
-  vec<T, N + 1> data{};
+  alignas(N % 2 == 0? alignof(vec<T, N>) : alignof(T) * (N+1)) vec<T, N> x_;
+  T m_;
 
   monopole()                            = default;
   monopole(monopole const &)            = default;
@@ -14,17 +15,14 @@ struct monopole {
   monopole &operator=(monopole const &) = default;
   monopole &operator=(monopole &&)      = default;
 
-  monopole(T mass, vec<T, N> x) {
-    for (dim_t i = 0; i < N; ++i) data[i] = x[i];
-    data[N] = mass;
-  }
-
-  T mass() { return data[N]; }
-  vec<T, N> x() {
-    vec<T, N> x;
-    for (dim_t i = 0; i < N; ++i) x[i] = data[i];
-    return x;
+  monopole(T m, vec<T, N> x) {
+    for (dim_t i = 0; i < N; ++i) x_[i] = x[i];
+    m_ = m;
   }
+  T& mass() { return m_; }
+  T const& mass() const { return m_; }
+  vec<T, N>& x() { return x_; }
+  vec<T, N> const& x() const { return x_; }
 };
 
 namespace std {
diff --git a/src/octree.h b/src/octree.h
index 630f138..9a9f0c1 100644
--- a/src/octree.h
+++ b/src/octree.h
@@ -27,8 +27,7 @@ struct octree {
   mutable atomic<Index>* next_free_child_group;
 
   // Node data:
-  mutable T* total_masses;
-  mutable vec<T, N>* centre_masses;
+  mutable monopole<T, N>* m;
 
   // Helper latch per node for the last thread to proceed to the parent during
   // the tree computation of tree node centroids and masses:
@@ -46,10 +45,7 @@ struct octree {
     ::alloc(qt.first_child, capacity);
     ::alloc(qt.parent, 1 + capacity / child_count<N>);
     ::alloc(qt.next_free_child_group, 1);
-
-    ::alloc(qt.total_masses, capacity);
-    ::alloc(qt.centre_masses, capacity);
-
+    ::alloc(qt.m, capacity);
     ::alloc(qt.child_mass_complete, capacity);
     return qt;
   }
@@ -59,10 +55,7 @@ struct octree {
     dealloc(qt->first_child, qt->capacity);
     dealloc(qt->parent, 1 + qt->capacity / child_count<N>);
     dealloc(qt->next_free_child_group, 1);
-
-    dealloc(qt->total_masses, qt->capacity);
-    dealloc(qt->centre_masses, qt->capacity);
-
+    dealloc(qt->m, qt->capacity);
     dealloc(qt->child_mass_complete, qt->capacity);
   }
 
@@ -83,10 +76,9 @@ struct octree {
   // Resets tree node `i`
   void clear(Index i) const {
     if (i == 0) next_free_child_group->store(1, memory_order_relaxed);
-    first_child[i]   = empty;
-    parent[sg(i)]    = empty;
-    total_masses[i]  = T(0);
-    centre_masses[i] = vec<T, N>::splat(0);
+    first_child[i] = empty;
+    parent[sg(i)]  = empty;
+    m[i]           = monopole(T(0), vec<T, N>::splat(0));
     child_mass_complete[i].store(0, memory_order_relaxed);
   }
 
@@ -105,7 +97,7 @@ struct octree {
      [](auto lhs, auto rhs) -> std::tuple<T, T> {
        return {gmin(std::get<0>(lhs), std::get<0>(rhs)), gmax(std::get<1>(lhs), std::get<1>(rhs))};
      },
-     [s = system.state()](auto i) -> std::tuple<T, T> { return {min(s.x[i]), max(s.x[i])}; });
+     [s = system.state()](auto i) -> std::tuple<T, T> { return {min(s.p[i].x()), max(s.p[i].x())}; });
 
     // adjust boundary
     max_size += 1;
@@ -120,7 +112,8 @@ struct octree {
   }
 
   // Inserts body with `mass` at position `pos`:
-  void insert(T mass, vec<T, N> pos) const {
+  void insert(monopole<T, N> p) const {
+    auto pos = p.x();
     Index tree_index = 0;  // insert into root
     vec<T, N> divide = root_x;
     T side_length    = root_side_length;
@@ -148,8 +141,7 @@ struct octree {
       } else if (status == empty && fc.compare_exchange_weak(status, locked, memory_order_acquire)) {
         // If the node is empty and we locked it: insert body, unlock it, and done.
         // compare_exchange_weak suffices: if it fails spuriously, we'll retry again
-        total_masses[tree_index]  = mass;
-        centre_masses[tree_index] = pos;
+        m[tree_index] = p;
         fc.store(body, memory_order_release);
         break;
       } else if (status == body && fc.compare_exchange_weak(status, locked, memory_order_acquire)) {
@@ -162,17 +154,16 @@ struct octree {
         parent[sg(first_child_index)] = tree_index;
 
         // evict body at current index and insert into children keeping node locked
-        auto p_x        = centre_masses[tree_index];
+        auto [p_m, p_x] = m[tree_index];
         Index child_pos = 0;
         Index level     = 1;
         for (dim_t i = 0; i < N; i++) {
           child_pos += level * (p_x[i] > divide[i]);
           level *= 2;
         }
-        Index evicted_index          = first_child_index + child_pos;
-        centre_masses[evicted_index] = p_x;
-        total_masses[evicted_index]  = total_masses[tree_index];
-        first_child[evicted_index]   = body;
+        Index evicted_index        = first_child_index + child_pos;
+        m[evicted_index]           = monopole(p_m, p_x);
+        first_child[evicted_index] = body;
 
         // release node and continue to try to insert body
         fc.store(first_child_index, memory_order_release);
@@ -187,7 +178,7 @@ struct octree {
   void insert(System<T, N>& system) {
     auto r = system.body_indices();
     std::for_each(par, r.begin(), r.end(),
-                  [s = system.state(), tree = *this](Index i) { tree.insert(s.m[i], s.x[i]); });
+                  [s = system.state(), tree = *this](Index i) { tree.insert(s.p[i]); });
   }
 
   // Computes tree node centroids and masses that depend on the body at the leaf node `i`:
@@ -216,15 +207,14 @@ struct octree {
       T m    = 0;
       auto x = vec<T, N>::splat(0);
       for (dim_t i = 0; i < child_count<N>; i++) {
-        auto child_index      = first_child[tree_index] + i;
-        auto child_total_mass = total_masses[child_index];
-        m += child_total_mass;
-        x += child_total_mass * centre_masses[child_index];
+        auto child_index        = first_child[tree_index] + i;
+        auto [child_m, child_x] = this->m[child_index];
+        m += child_m;
+        x += child_m * child_x;
       }
       x /= m;
 
-      total_masses[tree_index]  = m;
-      centre_masses[tree_index] = x;
+      this->m[tree_index] = monopole(m, x);
     } while (true);
   }
 
@@ -245,12 +235,11 @@ struct octree {
     while (tree_index != empty) {
       Index next_node_index = next_node(tree_index);
       if (came_forwards) {  // child or sibling node
-        vec<T, N> xj = centre_masses[tree_index];
+        auto [mj, xj] = m[tree_index];
         // check if below threshold
         auto fc = first_child[tree_index];
         auto dx = dist(x, xj);
         if (fc == empty || fc == body || side_length / dx < theta) {
-          T mj = total_masses[tree_index];
           a += mj * (xj - x) / (dx * dx * dx);
         } else {  // visit children
           next_node_index = first_child[tree_index];
@@ -270,7 +259,7 @@ struct octree {
   void compute_force(System<T, N>& system, T const theta) {
     auto r = system.body_indices();
     std::for_each(par_unseq, r.begin(), r.end(), [s = system.state(), theta, tree = *this](Index i) {
-      s.a[i] = s.c * tree.compute_force(s.x[i], theta);
+      s.a[i] = s.c * tree.compute_force(s.p[i].x(), theta);
     });
   }
 };
@@ -324,7 +313,7 @@ void run_octree(System<T, N>& system, Arguments arguments) {
 
         if (arguments.print_info) {
           std::cout << std::format("Tree size: {}\n", tree.next_free_child_group->load());
-          std::cout << std::format("Total mass: {: .5f}\n", tree.total_masses[0]);
+          std::cout << std::format("Total mass: {: .5f}\n", tree.m[0].mass());
         }
         saver.save_all(system);
       }
diff --git a/src/saving.h b/src/saving.h
index e7c16de..018762d 100644
--- a/src/saving.h
+++ b/src/saving.h
@@ -17,7 +17,7 @@ class Saver {
     if (is_save_energy) setup_energy_file();
   }
 
-  auto save_all(System<T, N> const &system) -> void {
+  auto save_all(System<T, N>& system) -> void {
     save_points(system);
     save_energy(system);
   }
@@ -107,10 +107,16 @@ class Saver {
     energy_out_file.write(reinterpret_cast<char const *>(&data_size), sizeof(data_size));
   }
 
-  auto save_points(System<T, N> const &system) -> void {
+  auto save_points(System<T, N>& system) -> void {
     if (!is_save_pos) return;
 
-    pos_out_file.write(reinterpret_cast<char const *>(system.x.data()), simulation_size * data_size * std::uint32_t(N));
+    static vector<vec<T, N>> xs(simulation_size, vec<T, N>::splat(T(0)));
+    auto it = counting_iterator<uint32_t>(0);
+    std::for_each_n(par_unseq, it, simulation_size, [s = system.state(), xs = xs.data()](uint32_t i) {
+      xs[i] = s.p[i].x();
+    });
+
+    pos_out_file.write(reinterpret_cast<char const *>(xs.data()), simulation_size * data_size * std::uint32_t(N));
   }
 
   auto save_energy(System<T, N> const &system) -> void {
diff --git a/src/system.h b/src/system.h
index f432d1b..ef7c8c2 100644
--- a/src/system.h
+++ b/src/system.h
@@ -2,10 +2,10 @@
 #include <cmath>
 #include <functional>
 #include <random>
-#include <vector>
 
 #include "alloc.h"
 #include "execution.h"
+#include "monopole.h"
 #include "vec.h"
 
 template <typename T, dim_t N>
@@ -16,8 +16,8 @@ class System {
   index_t const max_tree_node_size;
   T const dt;
   T const constant;
-  std::vector<T, allocator<T>> m;
-  std::vector<vec<T, N>, allocator<vec<T, N>>> x, v, a, ao;
+  vector<monopole<T, N>> p;
+  vector<vec<T, N>> v, a, ao;
 
   // random generation
   std::mt19937 gen{42};  // fix random generation
@@ -30,8 +30,7 @@ class System {
      max_tree_node_size(std::max<std::size_t>(child_count<N> * size, 1000)),
      dt(time_step),
      constant(constant),
-     m(size),
-     x(size),
+     p(size),
      v(size),
      a(size),
      ao(size) {}
@@ -40,21 +39,21 @@ class System {
 
   // Helper to make it easier to access all state from parallel algorithms
   struct state_t {
-    T* m;
-    vec<T, N>*x, *v, *a, *ao;
+    monopole<T, N>* p;
+    vec<T, N>*v, *a, *ao;
     T dt, c;
     index_t sz;
   };
   state_t state() {
     return {
-     .m = m.data(), .x = x.data(), .v = v.data(), .a = a.data(), .ao = ao.data(), .dt = dt, .c = constant, .sz = size};
+     .p = p.data(), .v = v.data(), .a = a.data(), .ao = ao.data(), .dt = dt, .c = constant, .sz = size};
   }
 
   void accelerate_step() {
     // performs leap frog integration
     auto r = body_indices();
     std::for_each(par_unseq, r.begin(), r.end(), [s = state()](auto i) {
-      s.x[i] += s.dt * s.v[i] + T(0.5) * s.dt * s.dt * s.ao[i];
+      s.p[i].x() += s.dt * s.v[i] + T(0.5) * s.dt * s.dt * s.ao[i];
       s.v[i] += T(0.5) * s.dt * (s.a[i] + s.ao[i]);
       s.ao[i] = s.a[i];
     });
@@ -64,15 +63,16 @@ class System {
     auto r           = body_indices();
     T kinetic_energy = static_cast<T>(0.5)
                        * std::transform_reduce(par_unseq, r.begin(), r.end(), static_cast<T>(0), std::plus<T>{},
-                                               [m = m.data(), v = v.data()](auto i) { return m[i] * l2norm2(v[i]); });
+                                               [p = p.data(), v = v.data()](auto i) { return p[i].mass() * l2norm2(v[i]); });
     T gravitational_energy = -static_cast<T>(0.5) * constant
                              * std::transform_reduce(par_unseq, r.begin(), r.end(), static_cast<T>(0), std::plus<T>{},
-                                                     [m = m.data(), x = x.data(), sz = size](auto i) {
+                                                     [p = p.data(), sz = size](auto i) {
                                                        T total = 0;
-                                                       T mi    = m[i];
-                                                       auto xi = x[i];
-                                                       for (index_t j = 0; j < sz; j++)
-                                                         if (j != i) total += mi * m[j] / dist(xi, x[j]);
+                                                       auto [mi, xi] = p[i];
+                                                       for (index_t j = 0; j < sz; j++) {
+							 auto [mj, xj] = p[j];
+                                                         if (j != i) total += mi * mj / dist(xi, xj);
+						       }
                                                        return total;
                                                      });
 
@@ -83,8 +83,7 @@ class System {
     auto i = next_point;
     next_point += 1;
 
-    m[i] = mass;
-    x[i] = pos;
+    p[i] = monopole(mass, pos);
     v[i] = vel;
   }
 
@@ -92,7 +91,7 @@ class System {
     for (size_t i = 0; i < size; i++) {
       std::cout << std::format("{:02}: m={: .3e}, p=({: .3e}, {: .3e}), v=({: "
                                ".3e}, {: .3e}), f=({: .3e}, {: .3e})",
-                               i, m[i], x[i][0], x[i][1], v[i][0], v[i][1], a[i][0], a[i][1])
+                               i, p[i].mass(), p[i].x()[0], p[i].x()[1], v[i][0], v[i][1], a[i][0], a[i][1])
                 << std::endl;
     }
   }