diff --git a/src/all_pairs.h b/src/all_pairs.h index 68d7cfa..edc5962 100644 --- a/src/all_pairs.h +++ b/src/all_pairs.h @@ -16,10 +16,11 @@ void all_pairs_force(System& system) { auto it = counting_iterator(0); std::for_each_n(par_unseq, it, system.size, [s = system.state()](auto i) { auto ai = vec::splat(0); - auto pi = s.x[i]; + auto pi = s.p[i].x(); for (typename System::index_t j = 0; j < s.sz; j++) { - auto pj = s.x[j]; - ai += s.m[j] * (pj - pi) / dist3(pi, pj); + if (i == j) continue; + auto [mj, pj] = s.p[j]; + ai += mj * (pj - pi) / dist3(pi, pj); } s.a[i] = s.c * ai; }); @@ -38,9 +39,11 @@ void all_pairs_collapsed_force(System& system) { return; } - auto pi = s.x[i]; - auto pj = s.x[j]; - auto a = s.c * s.m[j] * (pj - pi) / dist3(pi, pj); + // TODO: we should exploit the symmetry of the force pairs to do + // (N^2)/2 computations here by taking this "a" and doing "s.a[j] -= a / mj * mi". + auto [mi, pi] = s.p[i]; + auto [mj, pj] = s.p[j]; + auto a = s.c * mj * (pj - pi) / dist3(pi, pj); atomic_ref{s.a[i][0]}.fetch_add(a[0], memory_order_relaxed); atomic_ref{s.a[i][1]}.fetch_add(a[1], memory_order_relaxed); }); diff --git a/src/alloc.h b/src/alloc.h index 1469047..9ab4602 100644 --- a/src/alloc.h +++ b/src/alloc.h @@ -2,6 +2,7 @@ #include #include #include +#include template void alloc(T*& ptr, std::size_t n) { @@ -50,3 +51,6 @@ template constexpr bool operator!=(allocator const &, allocator const &) noexcept { return false; } + +template +using vector = std::vector>; diff --git a/src/bvh.h b/src/bvh.h index 1f87632..8071670 100644 --- a/src/bvh.h +++ b/src/bvh.h @@ -15,10 +15,10 @@ using clock_timer = std::chrono::steady_clock; /// Computes the bounding box of the grid. template -aabb bounding_box(std::span> xs) { +aabb bounding_box(std::span> ms) { return std::transform_reduce( - par_unseq, xs.begin(), xs.end(), aabb(from_points, vec::splat(0.)), - [](auto a, auto b) { return merge(a, b); }, [](auto a) { return aabb(from_points, a); }); + par_unseq, ms.begin(), ms.end(), aabb(from_points, vec::splat(0.)), + [](auto a, auto b) { return merge(a, b); }, [](auto a) { return aabb(from_points, a.x()); }); } /// Sorts bodies along the Hilbert curve. @@ -35,11 +35,11 @@ void hilbert_sort(System& system, aabb bbox) { // Compute the Hilbert index for each body in the Cartesian grid auto bids = system.body_indices(); - static std::vector hilbert_ids(system.size); + static vector hilbert_ids(system.size); std::for_each(par_unseq, bids.begin(), bids.end(), - [hids = hilbert_ids.data(), x = system.x.data(), mins = bbox.xmin, grid_cell_size](auto idx) { + [hids = hilbert_ids.data(), p = system.p.data(), mins = bbox.xmin, grid_cell_size](auto idx) { // Bucket the body into a Cartesian grid cell: - vec cell_idx = cast((x[idx] - mins) / grid_cell_size); + vec cell_idx = cast((p[idx].x() - mins) / grid_cell_size); // Compute the Hilber index of the cell and assign it to the body: hids[idx] = hilbert(cell_idx); }); @@ -48,9 +48,9 @@ void hilbert_sort(System& system, aabb bbox) { #if defined(__NVCOMPILER) // Workaround for nvc++: we can use Thrust zip_iterator, which predates zip_view, but provides the same functionality, // and works just fine: - auto b = thrust::make_zip_iterator(hilbert_ids.begin(), system.x.begin(), system.m.begin(), system.v.begin(), + auto b = thrust::make_zip_iterator(hilbert_ids.begin(), system.p.begin(), system.v.begin(), system.a.begin(), system.ao.begin()); - auto e = thrust::make_zip_iterator(hilbert_ids.end(), system.x.end(), system.m.end(), system.v.end(), system.a.end(), + auto e = thrust::make_zip_iterator(hilbert_ids.end(), system.p.end(), system.v.end(), system.a.end(), system.ao.end()); std::sort(par_unseq, b, e, [](auto a, auto b) { return thrust::get<0>(a) < thrust::get<0>(b); }); #elif defined(__clang__) || (__cplusplus < 202302L) @@ -59,7 +59,7 @@ void hilbert_sort(System& system, aabb bbox) { // TODO: sort an array of keys and then apply a permutation in O(N) time and O(1) storage // (instead of O(N) time and O(N) storage). - static std::vector> hilbert_index_map(system.size); + static vector> hilbert_index_map(system.size); std::for_each_n(par_unseq, counting_iterator(0), system.size, [hmap = hilbert_index_map.data(), hids = hilbert_ids.data()](std::size_t idx) { hmap[idx] = std::make_pair(hids[idx], idx); @@ -70,27 +70,26 @@ void hilbert_sort(System& system, aabb bbox) { // create temp copy of system so that we don't get race conditions when // rearranging values in the next step - static std::vector, T, vec, vec, vec>> tmp_system(system.size); + static vector, vec, vec, vec>> tmp_system(system.size); std::for_each_n(par_unseq, counting_iterator(0), system.size, - [tmp_sys = tmp_system.data(), x = system.x.data(), m = system.m.data(), v = system.v.data(), + [tmp_sys = tmp_system.data(), p = system.p.data(), v = system.v.data(), a = system.a.data(), ao = system.ao.data()](std::size_t idx) { - tmp_sys[idx] = std::make_tuple(x[idx], m[idx], v[idx], a[idx], ao[idx]); + tmp_sys[idx] = std::make_tuple(p[idx], v[idx], a[idx], ao[idx]); }); // copy back std::for_each_n(par_unseq, counting_iterator(0), system.size, - [tmp_sys = tmp_system.data(), hmap = hilbert_index_map.data(), x = system.x.data(), - m = system.m.data(), v = system.v.data(), a = system.a.data(), ao = system.ao.data()](auto idx) { + [tmp_sys = tmp_system.data(), hmap = hilbert_index_map.data(), p = system.p.data(), + v = system.v.data(), a = system.a.data(), ao = system.ao.data()](auto idx) { std::size_t original_index = hmap[idx].second; auto e = tmp_sys[original_index]; - x[idx] = std::get<0>(e); - m[idx] = std::get<1>(e); - v[idx] = std::get<2>(e); - a[idx] = std::get<3>(e); - ao[idx] = std::get<4>(e); + p[idx] = std::get<0>(e); + v[idx] = std::get<1>(e); + a[idx] = std::get<2>(e); + ao[idx] = std::get<3>(e); }); #else - auto r = std::views::zip(hilbert_ids, system.x, system.m, system.v, system.a, system.ao); + auto r = std::views::zip(hilbert_ids, system.p, system.v, system.a, system.ao); std::sort(par_unseq, r.begin(), r.end(), [](auto a, auto b) { return std::get<0>(a) < std::get<0>(b); }); #endif } @@ -188,18 +187,21 @@ struct bvh { } if (br >= nbodies) { - m[i] = monopole(s.m[bl], s.x[bl]); - auto bb = aabb(from_points, s.x[bl]); + auto [ml, xl] = s.p[bl]; + m[i] = monopole(ml, xl); + auto bb = aabb(from_points, xl); b[i] = bb; bw[i] = node_width(bb); } else { - T mass = s.m[bl] + s.m[br]; + auto [ml, xl] = s.p[bl]; + auto [mr, xr] = s.p[br]; + T mass = ml + mr; - vec center_of_mass = s.m[bl] * s.x[bl] + s.m[br] * s.x[br]; + vec center_of_mass = ml * xl + mr * xr; center_of_mass /= mass; m[i] = monopole(mass, center_of_mass); - auto bb = aabb(from_points, s.x[bl], s.x[br]); + auto bb = aabb(from_points, xl, xr); b[i] = bb; bw[i] = node_width(bb); } @@ -216,24 +218,24 @@ struct bvh { auto bl = (li * 2) + first + count; auto br = bl + 1; - auto ml = m[bl]; - auto mr = m[br]; + auto [ml, xl] = m[bl]; + auto [mr, xr] = m[br]; - auto ibl = ml.mass() != 0.; - auto ibr = mr.mass() != 0.; + auto ibl = ml != 0.; + auto ibr = mr != 0.; if (!ibl) { - m[i] = ml; + m[i] = m[bl]; return; } if (!ibr) { - m[i] = ml; + m[i] = m[bl]; b[i] = b[bl]; bw[i] = bw[bl]; } else { - T mass = ml.mass() + mr.mass(); - auto x = (ml.mass() * ml.x() + mr.mass() * mr.x()) / mass; + T mass = ml + mr; + auto x = (ml * xl + mr * xr) / mass; m[i] = monopole(mass, x); auto bb = merge(b[bl], b[br]); b[i] = bb; @@ -253,7 +255,7 @@ struct bvh { node_t nbodies = system.size; auto ids = system.body_indices(); std::for_each(par_unseq, ids.begin(), ids.end(), [=, s = system.state(), *this](node_t i) { - auto xs = s.x[i]; + auto [mi, xi] = s.p[i]; node_t tree_index = 0; auto a = vec::splat(0); @@ -291,10 +293,8 @@ struct bvh { for (int k = 0; k < 2; ++k) { if (bidx < nbodies && bidx != i) { - vec xj = s.x[bidx]; - T mj = s.m[bidx]; - - a += mj * (xj - xs) / dist3(xs, xj); + auto [mj, xj] = s.p[bidx]; + a += mj * (xj - xi) / dist3(xi, xj); } ++bidx; } @@ -303,9 +303,9 @@ struct bvh { force_ascend_right(); } else { auto [mj, xj] = m[tree_index]; - if (can_approximate(xs, xj, bw[tree_index], theta_squared)) { + if (can_approximate(xi, xj, bw[tree_index], theta_squared)) { // below threshold - a += mj * (xj - xs) / dist3(xs, xj); + a += mj * (xj - xi) / dist3(xi, xj); num_covered_particles += ncontained_leaves_at_level(level, nlevels); ascend_right(); @@ -359,7 +359,7 @@ void run_bvh(System& system, Arguments arguments) { dt_force += time([&] { // Bounding box aabb bbox; - dt_bbox += time([&] { bbox = bounding_box(std::span{system.x}); }); + dt_bbox += time([&] { bbox = bounding_box(std::span{system.p}); }); // Sort bodies along Hilbert curve: dt_sort += time([&] { hilbert_sort(system, bbox); }); @@ -381,7 +381,7 @@ void run_bvh(System& system, Arguments arguments) { } else { auto kernels = [&] { // Bounding box - aabb bbox = bounding_box(std::span{system.x}); + aabb bbox = bounding_box(std::span{system.p}); // Sort bodies along Hilbert curve: hilbert_sort(system, bbox); diff --git a/src/monopole.h b/src/monopole.h index 56f980c..25de603 100644 --- a/src/monopole.h +++ b/src/monopole.h @@ -6,7 +6,8 @@ template struct monopole { - vec data{}; + alignas(N % 2 == 0? alignof(vec) : alignof(T) * (N+1)) vec x_; + T m_; monopole() = default; monopole(monopole const &) = default; @@ -14,17 +15,14 @@ struct monopole { monopole &operator=(monopole const &) = default; monopole &operator=(monopole &&) = default; - monopole(T mass, vec x) { - for (dim_t i = 0; i < N; ++i) data[i] = x[i]; - data[N] = mass; - } - - T mass() { return data[N]; } - vec x() { - vec x; - for (dim_t i = 0; i < N; ++i) x[i] = data[i]; - return x; + monopole(T m, vec x) { + for (dim_t i = 0; i < N; ++i) x_[i] = x[i]; + m_ = m; } + T& mass() { return m_; } + T const& mass() const { return m_; } + vec& x() { return x_; } + vec const& x() const { return x_; } }; namespace std { diff --git a/src/octree.h b/src/octree.h index 630f138..9a9f0c1 100644 --- a/src/octree.h +++ b/src/octree.h @@ -27,8 +27,7 @@ struct octree { mutable atomic* next_free_child_group; // Node data: - mutable T* total_masses; - mutable vec* centre_masses; + mutable monopole* m; // Helper latch per node for the last thread to proceed to the parent during // the tree computation of tree node centroids and masses: @@ -46,10 +45,7 @@ struct octree { ::alloc(qt.first_child, capacity); ::alloc(qt.parent, 1 + capacity / child_count); ::alloc(qt.next_free_child_group, 1); - - ::alloc(qt.total_masses, capacity); - ::alloc(qt.centre_masses, capacity); - + ::alloc(qt.m, capacity); ::alloc(qt.child_mass_complete, capacity); return qt; } @@ -59,10 +55,7 @@ struct octree { dealloc(qt->first_child, qt->capacity); dealloc(qt->parent, 1 + qt->capacity / child_count); dealloc(qt->next_free_child_group, 1); - - dealloc(qt->total_masses, qt->capacity); - dealloc(qt->centre_masses, qt->capacity); - + dealloc(qt->m, qt->capacity); dealloc(qt->child_mass_complete, qt->capacity); } @@ -83,10 +76,9 @@ struct octree { // Resets tree node `i` void clear(Index i) const { if (i == 0) next_free_child_group->store(1, memory_order_relaxed); - first_child[i] = empty; - parent[sg(i)] = empty; - total_masses[i] = T(0); - centre_masses[i] = vec::splat(0); + first_child[i] = empty; + parent[sg(i)] = empty; + m[i] = monopole(T(0), vec::splat(0)); child_mass_complete[i].store(0, memory_order_relaxed); } @@ -105,7 +97,7 @@ struct octree { [](auto lhs, auto rhs) -> std::tuple { return {gmin(std::get<0>(lhs), std::get<0>(rhs)), gmax(std::get<1>(lhs), std::get<1>(rhs))}; }, - [s = system.state()](auto i) -> std::tuple { return {min(s.x[i]), max(s.x[i])}; }); + [s = system.state()](auto i) -> std::tuple { return {min(s.p[i].x()), max(s.p[i].x())}; }); // adjust boundary max_size += 1; @@ -120,7 +112,8 @@ struct octree { } // Inserts body with `mass` at position `pos`: - void insert(T mass, vec pos) const { + void insert(monopole p) const { + auto pos = p.x(); Index tree_index = 0; // insert into root vec divide = root_x; T side_length = root_side_length; @@ -148,8 +141,7 @@ struct octree { } else if (status == empty && fc.compare_exchange_weak(status, locked, memory_order_acquire)) { // If the node is empty and we locked it: insert body, unlock it, and done. // compare_exchange_weak suffices: if it fails spuriously, we'll retry again - total_masses[tree_index] = mass; - centre_masses[tree_index] = pos; + m[tree_index] = p; fc.store(body, memory_order_release); break; } else if (status == body && fc.compare_exchange_weak(status, locked, memory_order_acquire)) { @@ -162,17 +154,16 @@ struct octree { parent[sg(first_child_index)] = tree_index; // evict body at current index and insert into children keeping node locked - auto p_x = centre_masses[tree_index]; + auto [p_m, p_x] = m[tree_index]; Index child_pos = 0; Index level = 1; for (dim_t i = 0; i < N; i++) { child_pos += level * (p_x[i] > divide[i]); level *= 2; } - Index evicted_index = first_child_index + child_pos; - centre_masses[evicted_index] = p_x; - total_masses[evicted_index] = total_masses[tree_index]; - first_child[evicted_index] = body; + Index evicted_index = first_child_index + child_pos; + m[evicted_index] = monopole(p_m, p_x); + first_child[evicted_index] = body; // release node and continue to try to insert body fc.store(first_child_index, memory_order_release); @@ -187,7 +178,7 @@ struct octree { void insert(System& system) { auto r = system.body_indices(); std::for_each(par, r.begin(), r.end(), - [s = system.state(), tree = *this](Index i) { tree.insert(s.m[i], s.x[i]); }); + [s = system.state(), tree = *this](Index i) { tree.insert(s.p[i]); }); } // Computes tree node centroids and masses that depend on the body at the leaf node `i`: @@ -216,15 +207,14 @@ struct octree { T m = 0; auto x = vec::splat(0); for (dim_t i = 0; i < child_count; i++) { - auto child_index = first_child[tree_index] + i; - auto child_total_mass = total_masses[child_index]; - m += child_total_mass; - x += child_total_mass * centre_masses[child_index]; + auto child_index = first_child[tree_index] + i; + auto [child_m, child_x] = this->m[child_index]; + m += child_m; + x += child_m * child_x; } x /= m; - total_masses[tree_index] = m; - centre_masses[tree_index] = x; + this->m[tree_index] = monopole(m, x); } while (true); } @@ -245,12 +235,11 @@ struct octree { while (tree_index != empty) { Index next_node_index = next_node(tree_index); if (came_forwards) { // child or sibling node - vec xj = centre_masses[tree_index]; + auto [mj, xj] = m[tree_index]; // check if below threshold auto fc = first_child[tree_index]; auto dx = dist(x, xj); if (fc == empty || fc == body || side_length / dx < theta) { - T mj = total_masses[tree_index]; a += mj * (xj - x) / (dx * dx * dx); } else { // visit children next_node_index = first_child[tree_index]; @@ -270,7 +259,7 @@ struct octree { void compute_force(System& system, T const theta) { auto r = system.body_indices(); std::for_each(par_unseq, r.begin(), r.end(), [s = system.state(), theta, tree = *this](Index i) { - s.a[i] = s.c * tree.compute_force(s.x[i], theta); + s.a[i] = s.c * tree.compute_force(s.p[i].x(), theta); }); } }; @@ -324,7 +313,7 @@ void run_octree(System& system, Arguments arguments) { if (arguments.print_info) { std::cout << std::format("Tree size: {}\n", tree.next_free_child_group->load()); - std::cout << std::format("Total mass: {: .5f}\n", tree.total_masses[0]); + std::cout << std::format("Total mass: {: .5f}\n", tree.m[0].mass()); } saver.save_all(system); } diff --git a/src/saving.h b/src/saving.h index e7c16de..018762d 100644 --- a/src/saving.h +++ b/src/saving.h @@ -17,7 +17,7 @@ class Saver { if (is_save_energy) setup_energy_file(); } - auto save_all(System const &system) -> void { + auto save_all(System& system) -> void { save_points(system); save_energy(system); } @@ -107,10 +107,16 @@ class Saver { energy_out_file.write(reinterpret_cast(&data_size), sizeof(data_size)); } - auto save_points(System const &system) -> void { + auto save_points(System& system) -> void { if (!is_save_pos) return; - pos_out_file.write(reinterpret_cast(system.x.data()), simulation_size * data_size * std::uint32_t(N)); + static vector> xs(simulation_size, vec::splat(T(0))); + auto it = counting_iterator(0); + std::for_each_n(par_unseq, it, simulation_size, [s = system.state(), xs = xs.data()](uint32_t i) { + xs[i] = s.p[i].x(); + }); + + pos_out_file.write(reinterpret_cast(xs.data()), simulation_size * data_size * std::uint32_t(N)); } auto save_energy(System const &system) -> void { diff --git a/src/system.h b/src/system.h index f432d1b..ef7c8c2 100644 --- a/src/system.h +++ b/src/system.h @@ -2,10 +2,10 @@ #include #include #include -#include #include "alloc.h" #include "execution.h" +#include "monopole.h" #include "vec.h" template @@ -16,8 +16,8 @@ class System { index_t const max_tree_node_size; T const dt; T const constant; - std::vector> m; - std::vector, allocator>> x, v, a, ao; + vector> p; + vector> v, a, ao; // random generation std::mt19937 gen{42}; // fix random generation @@ -30,8 +30,7 @@ class System { max_tree_node_size(std::max(child_count * size, 1000)), dt(time_step), constant(constant), - m(size), - x(size), + p(size), v(size), a(size), ao(size) {} @@ -40,21 +39,21 @@ class System { // Helper to make it easier to access all state from parallel algorithms struct state_t { - T* m; - vec*x, *v, *a, *ao; + monopole* p; + vec*v, *a, *ao; T dt, c; index_t sz; }; state_t state() { return { - .m = m.data(), .x = x.data(), .v = v.data(), .a = a.data(), .ao = ao.data(), .dt = dt, .c = constant, .sz = size}; + .p = p.data(), .v = v.data(), .a = a.data(), .ao = ao.data(), .dt = dt, .c = constant, .sz = size}; } void accelerate_step() { // performs leap frog integration auto r = body_indices(); std::for_each(par_unseq, r.begin(), r.end(), [s = state()](auto i) { - s.x[i] += s.dt * s.v[i] + T(0.5) * s.dt * s.dt * s.ao[i]; + s.p[i].x() += s.dt * s.v[i] + T(0.5) * s.dt * s.dt * s.ao[i]; s.v[i] += T(0.5) * s.dt * (s.a[i] + s.ao[i]); s.ao[i] = s.a[i]; }); @@ -64,15 +63,16 @@ class System { auto r = body_indices(); T kinetic_energy = static_cast(0.5) * std::transform_reduce(par_unseq, r.begin(), r.end(), static_cast(0), std::plus{}, - [m = m.data(), v = v.data()](auto i) { return m[i] * l2norm2(v[i]); }); + [p = p.data(), v = v.data()](auto i) { return p[i].mass() * l2norm2(v[i]); }); T gravitational_energy = -static_cast(0.5) * constant * std::transform_reduce(par_unseq, r.begin(), r.end(), static_cast(0), std::plus{}, - [m = m.data(), x = x.data(), sz = size](auto i) { + [p = p.data(), sz = size](auto i) { T total = 0; - T mi = m[i]; - auto xi = x[i]; - for (index_t j = 0; j < sz; j++) - if (j != i) total += mi * m[j] / dist(xi, x[j]); + auto [mi, xi] = p[i]; + for (index_t j = 0; j < sz; j++) { + auto [mj, xj] = p[j]; + if (j != i) total += mi * mj / dist(xi, xj); + } return total; }); @@ -83,8 +83,7 @@ class System { auto i = next_point; next_point += 1; - m[i] = mass; - x[i] = pos; + p[i] = monopole(mass, pos); v[i] = vel; } @@ -92,7 +91,7 @@ class System { for (size_t i = 0; i < size; i++) { std::cout << std::format("{:02}: m={: .3e}, p=({: .3e}, {: .3e}), v=({: " ".3e}, {: .3e}), f=({: .3e}, {: .3e})", - i, m[i], x[i][0], x[i][1], v[i][0], v[i][1], a[i][0], a[i][1]) + i, p[i].mass(), p[i].x()[0], p[i].x()[1], v[i][0], v[i][1], a[i][0], a[i][1]) << std::endl; } }