diff --git a/Cargo.toml b/Cargo.toml index e089311..63eb3ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,9 @@ battleship = [] name = "bempp-octree" version = "0.0.1-dev" edition = "2021" -authors = ["Srinath Kailasa , Timo Betcke "] +authors = [ + "Srinath Kailasa , Timo Betcke ", +] description = "A library to create Octrees" license = "BSD-3-Clause" homepage = "https://github.com/bempp/octree" @@ -22,14 +24,15 @@ crate-type = ["cdylib", "lib"] [dependencies] itertools = "0.13.*" -rand = "0.8.5" -bytemuck = "1.*" +rand = { version = "0.8.5", features = ["alloc"] } +rand_chacha = "0.3.*" +num = "0.4.*" vtkio = "0.6.*" -mpi = {version = "0.8.*", features = ["derive", "user-operations"] } +mpi = { version = "0.8.*", features = ["derive", "user-operations"] } [profile.release] debug = 1 - + [dev-dependencies] rand_distr = "0.4.3" #criterion = { version = "0.5.*", features = ["html_reports"]} diff --git a/examples/battleship.rs b/examples/battleship.rs index 97f7c81..50587d9 100644 --- a/examples/battleship.rs +++ b/examples/battleship.rs @@ -4,7 +4,7 @@ use std::time::Instant; #[cfg(feature = "battleship")] -use bempp_octree::octree::Octree; +use bempp_octree::serial::Octree; #[cfg(feature = "battleship")] use vtkio::model::*; diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs new file mode 100644 index 0000000..68633e1 --- /dev/null +++ b/examples/mpi_complete_tree.rs @@ -0,0 +1,95 @@ +//! Test the computation of a complete octree. + +use bempp_octree::{ + morton::MortonKey, + octree::{is_complete_linear_and_balanced, KeyType, Octree}, + tools::{gather_to_all, generate_random_points}, +}; +use itertools::Itertools; +use mpi::traits::Communicator; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. + let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); + + // Create `npoints` per rank. + let npoints = 10000; + + // Generate random points. + + let points = generate_random_points(npoints, &mut rng, &comm); + + let tree = Octree::new(&points, 15, 50, &comm); + + // We now check that each node of the tree has all its neighbors available. + + let leaf_tree = tree.leaf_tree(); + let all_keys = tree.all_keys(); + + assert!(is_complete_linear_and_balanced(leaf_tree, &comm)); + for &key in leaf_tree { + // We only check interior keys. Leaf keys may not have a neighbor + // on the same level. + let mut parent = key.parent(); + while parent.level() > 0 { + // Check that the key itself is there. + assert!(all_keys.contains_key(&key)); + // Check that all its neighbours are there. + for neighbor in parent.neighbours().iter().filter(|&key| key.is_valid()) { + assert!(all_keys.contains_key(neighbor)); + } + parent = parent.parent(); + // Check that the parent is there. + assert!(all_keys.contains_key(&parent)); + } + } + + // At the end check that the root of the tree is also contained. + assert!(all_keys.contains_key(&MortonKey::root())); + + // Count the number of ghosts on each rank + // Count the number of global keys on each rank. + + // Assert that all ghosts are from a different rank and count them. 
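+    // Note (added for clarity): a `Ghost(rank)` entry marks a key that lives on
+    // another rank but is replicated here so that neighbour lookups can be
+    // resolved locally.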
+
+    let nghosts = all_keys
+        .iter()
+        .filter_map(|(_, &value)| {
+            if let KeyType::Ghost(rank) = value {
+                assert!(rank != comm.rank() as usize);
+                Some(rank)
+            } else {
+                None
+            }
+        })
+        .count();
+
+    if comm.size() == 1 {
+        assert_eq!(nghosts, 0);
+    } else {
+        assert!(nghosts > 0);
+    }
+
+    let nglobal = all_keys
+        .iter()
+        .filter(|(_, &value)| matches!(value, KeyType::Global))
+        .count();
+
+    // Assert that the count of global keys is the same across all ranks.
+
+    let nglobals = gather_to_all(std::slice::from_ref(&nglobal), &comm);
+
+    assert_eq!(nglobals.iter().unique().count(), 1);
+
+    if comm.rank() == 0 {
+        println!("Distributed tree is complete and linear.");
+    }
+}
diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs
new file mode 100644
index 0000000..1b3fcc3
--- /dev/null
+++ b/examples/mpi_cumsum.rs
@@ -0,0 +1,62 @@
+//! Test the computation of a global inclusive cumulative sum across MPI ranks.
+
+use bempp_octree::tools::{gather_to_root, global_inclusive_cumsum};
+use itertools::{izip, Itertools};
+use mpi::traits::Communicator;
+use rand::prelude::*;
+use rand_chacha::ChaCha8Rng;
+
+pub fn main() {
+    // Initialise MPI
+    let universe = mpi::initialize().unwrap();
+
+    // Get the world communicator
+    let comm = universe.world();
+
+    // Initialise a seeded Rng.
+    let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64);
+
+    // Create `nelems` elements per rank.
+    let nelems = 10;
+
+    // Generate random numbers.
+
+    let mut elems = Vec::<usize>::with_capacity(nelems);
+
+    for _ in 0..nelems {
+        elems.push(rng.gen_range(0..100));
+    }
+
+    // Compute the cumulative sum.
+
+    let global_cum_sum = global_inclusive_cumsum(&elems, &comm);
+
+    // Copy array to root and compare with inclusive scan there.
+
+    if let (Some(cum_sum_root), Some(original_array)) = (
+        gather_to_root(&global_cum_sum, &comm),
+        gather_to_root(&elems, &comm),
+    ) {
+        // Scan on root.
+
+        let expected_cum_sum = original_array
+            .iter()
+            .scan(0, |state, x| {
+                *state += *x;
+                Some(*state)
+            })
+            .collect_vec();
+
+        // Check that the first element is not modified (inclusive cumsum).
+        assert_eq!(
+            original_array.first().unwrap(),
+            cum_sum_root.first().unwrap()
+        );
+
+        for (actual, expected) in izip!(cum_sum_root.iter(), expected_cum_sum.iter()) {
+            assert_eq!(*actual, *expected);
+        }
+
+        println!("Cumulative sum computed.");
+    }
+}
diff --git a/examples/mpi_global_bounding_box.rs b/examples/mpi_global_bounding_box.rs
new file mode 100644
index 0000000..144d0e5
--- /dev/null
+++ b/examples/mpi_global_bounding_box.rs
@@ -0,0 +1,40 @@
+//! Test the computation of a global bounding box across MPI ranks.
+
+use bempp_octree::{
+    geometry::PhysicalBox,
+    octree::compute_global_bounding_box,
+    tools::{gather_to_root, generate_random_points},
+};
+use rand::prelude::*;
+use rand_chacha::ChaCha8Rng;
+
+pub fn main() {
+    // Initialise MPI
+    let universe = mpi::initialize().unwrap();
+
+    // Get the world communicator
+    let comm = universe.world();
+
+    // Initialise a seeded Rng.
+    let mut rng = ChaCha8Rng::seed_from_u64(2);
+
+    // Create `npoints` per rank.
+    let npoints = 10;
+
+    // Generate random points.
+
+    let points = generate_random_points(npoints, &mut rng, &comm);
+
+    // Compute the distributed bounding box.
+
+    let bounding_box = compute_global_bounding_box(&points, &comm);
+
+    // Copy all points to root and compare local bounding box there.
+
+    if let Some(points_root) = gather_to_root(&points, &comm) {
+        // Compute the bounding box on root.
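+        // `PhysicalBox::from_points` computes the box serially from the gathered
+        // points, so it serves as the reference result for the comparison below.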
+
+        let expected = PhysicalBox::from_points(&points_root);
+        assert_eq!(expected.coordinates(), bounding_box.coordinates());
+    }
+}
diff --git a/examples/parsort.rs b/examples/parsort.rs
index 6361f6c..5de9c07 100644
--- a/examples/parsort.rs
+++ b/examples/parsort.rs
@@ -1,37 +1,25 @@
 //! Testing the hyksort component.
-use bempp_octree::parsort::{array_to_root, parsort};
-use itertools::Itertools;
+use bempp_octree::{
+    parsort::parsort,
+    tools::{generate_random_keys, is_sorted_array},
+};
 use mpi::traits::Communicator;
 use rand::prelude::*;
 
 pub fn main() {
     let universe = mpi::initialize().unwrap();
     let world = universe.world();
-    let rank = world.rank() as u64;
 
     let n_per_rank = 1000;
 
     let mut rng = rand::rngs::StdRng::seed_from_u64(0);
 
-    let mut arr = Vec::<u64>::new();
+    let keys = generate_random_keys(n_per_rank, &mut rng);
 
-    for _ in 0..n_per_rank {
-        arr.push(rng.gen());
-    }
-
-    // let splitters = get_splitters(&arr, &world, &mut rng);
-
-    // let bin_displs = get_bin_displacements(&arr, &splitters);
-
-    let arr = parsort(&arr, &world, &mut rng);
-    let arr = array_to_root(&arr, &world);
+    let arr = parsort(&keys, &world, &mut rng);
 
-    if rank == 0 {
-        let arr = arr.unwrap();
+    assert!(is_sorted_array(&arr, &world));
 
-        for (elem1, elem2) in arr.iter().tuple_windows() {
-            assert!(elem1 <= elem2);
-        }
-        println!("Sorted {} elements.", arr.len());
-        println!("Finished.");
+    if world.rank() == 0 {
+        println!("Array is sorted.");
     }
 }
diff --git a/src/geometry.rs b/src/geometry.rs
index 303fdf3..62b9753 100644
--- a/src/geometry.rs
+++ b/src/geometry.rs
@@ -1,9 +1,33 @@
 //! Geometry information
-use bytemuck;
+use mpi::traits::Equivalence;
 
 use crate::constants::DEEPEST_LEVEL;
 
+/// Definition of a point.
+#[derive(Clone, Copy, Equivalence)]
+pub struct Point {
+    coords: [f64; 3],
+    global_id: usize,
+}
+
+impl Point {
+    /// Create a new point from coordinates and global id.
+    pub fn new(coords: [f64; 3], global_id: usize) -> Self {
+        Self { coords, global_id }
+    }
+
+    /// Return the coordinates of a point.
+    pub fn coords(&self) -> [f64; 3] {
+        self.coords
+    }
+
+    /// Return the global id of the point.
+    pub fn global_id(&self) -> usize {
+        self.global_id
+    }
+}
+
 /// A bounding box describes geometry in which an Octree lives.
 pub struct PhysicalBox {
     coords: [f64; 6],
@@ -18,11 +42,7 @@ impl PhysicalBox {
     }
 
     /// Given a slice of points, compute an associated bounding box.
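+    /// # Example
+    ///
+    /// A minimal sketch with made-up values (marked `ignore` since it is
+    /// illustrative only):
+    ///
+    /// ```ignore
+    /// let points = [Point::new([0.0, 0.0, 0.0], 0), Point::new([1.0, 2.0, 3.0], 1)];
+    /// let bbox = PhysicalBox::from_points(&points);
+    /// println!("{:?}", bbox.coordinates());
+    /// ```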
-    pub fn from_points(points: &[f64]) -> PhysicalBox {
-        assert_eq!(points.len() % 3, 0);
-
-        let points: &[[f64; 3]] = bytemuck::cast_slice(points);
-
+    pub fn from_points(points: &[Point]) -> PhysicalBox {
         let mut xmin = f64::MAX;
         let mut xmax = f64::MIN;
 
@@ -33,9 +53,9 @@ impl PhysicalBox {
         let mut zmax = f64::MIN;
 
         for point in points {
-            let x = point[0];
-            let y = point[1];
-            let z = point[2];
+            let x = point.coords()[0];
+            let y = point.coords()[1];
+            let z = point.coords()[2];
 
             xmin = f64::min(xmin, x);
             xmax = f64::max(xmax, x);
diff --git a/src/lib.rs b/src/lib.rs
index 3f5fbcc..c94ad07 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,4 +7,6 @@ pub mod geometry;
 pub mod morton;
 pub mod octree;
 pub mod parsort;
+pub mod serial;
+pub mod tools;
 pub mod types;
diff --git a/src/morton.rs b/src/morton.rs
index 6701ead..5fee9a3 100644
--- a/src/morton.rs
+++ b/src/morton.rs
@@ -5,16 +5,17 @@ use crate::constants::{
     LEVEL_SIZE, NINE_BIT_MASK, NSIBLINGS, X_LOOKUP_DECODE, X_LOOKUP_ENCODE, Y_LOOKUP_DECODE,
     Y_LOOKUP_ENCODE, Z_LOOKUP_DECODE, Z_LOOKUP_ENCODE,
 };
-use crate::geometry::PhysicalBox;
+use crate::geometry::{PhysicalBox, Point};
 use itertools::izip;
 use itertools::Itertools;
+use mpi::traits::Equivalence;
 use std::collections::HashSet;
 
 /// A morton key
 ///
 /// This is a distinct type to distinguish from u64
 /// numbers.
-#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Equivalence)]
 pub struct MortonKey {
     value: u64,
 }
@@ -34,6 +35,13 @@ impl MortonKey {
         key
     }
 
+    /// A key that is not valid or well-formed but is guaranteed to compare greater than any valid key.
+    ///
+    /// This is useful when a guaranteed upper bound is needed.
+    pub fn upper_bound() -> Self {
+        Self { value: u64::MAX }
+    }
+
     /// Check if a key is invalid.
     pub fn invalid_key() -> Self {
         Self { value: 1 << 63 }
@@ -147,9 +155,9 @@ impl MortonKey {
 
     /// Map a physical point within a bounding box to a Morton key on a given level.
     /// It is assumed that points are strictly contained within the bounding box.
-    pub fn from_physical_point(point: [f64; 3], bounding_box: &PhysicalBox, level: usize) -> Self {
+    pub fn from_physical_point(point: Point, bounding_box: &PhysicalBox, level: usize) -> Self {
         let level_size = 1 << level;
-        let reference = bounding_box.physical_to_reference(point);
+        let reference = bounding_box.physical_to_reference(point.coords());
         let x = (reference[0] * level_size as f64) as usize;
         let y = (reference[1] * level_size as f64) as usize;
         let z = (reference[2] * level_size as f64) as usize;
@@ -381,6 +389,10 @@ impl MortonKey {
         let mut result = [MortonKey::default(); 26];
 
         let (level, [x, y, z]) = self.decode();
+
+        if level == 0 {
+            return result;
+        }
         let level_size = 1 << level;
 
         for (direction, res) in izip!(DIRECTIONS, result.iter_mut()) {
@@ -409,6 +421,69 @@ impl MortonKey {
         result
     }
 
+    /// Return the index of the key as a child of the parent, i.e. 0, 1, ..., 7.
+    #[inline(always)]
+    pub fn child_index(&self) -> usize {
+        if *self == MortonKey::root() {
+            return 0;
+        }
+        let level = self.level() as u64;
+
+        let shift = LEVEL_DISPLACEMENT + 3 * (DEEPEST_LEVEL - level);
+
+        ((self.value >> shift) % 8) as usize
+    }
+
+    /// Return the finest descendent that sits in the corner of `self` opposite
+    /// to the corner shared with its siblings.
+    pub fn finest_outer_descendent(&self) -> MortonKey {
+        // First find out which child the current key is.
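+        // The same child index is then re-appended at every finer level below,
+        // which drives the key into the outermost corner of the subtree.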
+
+        let level = self.level() as u64;
+
+        if level == DEEPEST_LEVEL {
+            return *self;
+        }
+
+        let mut child_level = 1 + level;
+        let mut key = *self;
+        let outer_index = self.child_index() as u64;
+
+        while child_level <= DEEPEST_LEVEL {
+            let shift = LEVEL_DISPLACEMENT + 3 * (DEEPEST_LEVEL - child_level);
+            key = MortonKey::new(1 + (key.value | outer_index << shift));
+            child_level += 1;
+        }
+
+        key
+    }
+
+    /// Return the next possible Morton key on the deepest level that is not a descendent of the current key.
+    ///
+    /// If the key is already the last possible key then return None.
+    pub fn next_non_descendent_key(&self) -> Option<MortonKey> {
+        // If we are an ancestor of deepest_last we return None as then there
+        // is no next key.
+
+        if self.is_ancestor(MortonKey::deepest_last()) {
+            return None;
+        }
+
+        let level = self.level() as u64;
+
+        let level_diff = DEEPEST_LEVEL - level;
+        let shift = LEVEL_DISPLACEMENT + 3 * level_diff;
+
+        // Need to know which sibling we are.
+        let child_index = ((self.value >> shift) % 8) as usize;
+        // If we are between 0 and 6 take the next sibling and go to deepest level.
+        if child_index < 7 {
+            Some(MortonKey::new(self.value + (1 << shift) + level_diff))
+        } else {
+            // If we are the last child go to the parent and take the next key from there.
+            self.parent().next_non_descendent_key()
+        }
+    }
+
     /// Linearize by sorting and removing overlaps.
     pub fn linearize(keys: &[MortonKey]) -> Vec<MortonKey> {
         let mut new_keys = Vec::<MortonKey>::new();
@@ -476,8 +551,10 @@ impl MortonKey {
         result
     }
 
-    /// Complete a region ensuring that the given keys are part of the leafs.
-    pub fn complete_region(keys: &[MortonKey]) -> Vec<MortonKey> {
+    /// Complete a tree ensuring that the given keys are part of the leafs.
+    ///
+    /// The given keys must not overlap.
+    pub fn complete_tree(keys: &[MortonKey]) -> Vec<MortonKey> {
         // First make sure that the input sequence is sorted.
         let mut keys = keys.to_vec();
         keys.sort_unstable();
@@ -490,20 +567,13 @@ impl MortonKey {
             return result;
         }
 
-        // If a single element is given then just return the result if it is the root of the tree.
-        if keys.len() == 1 && result[0] == MortonKey::from_index_and_level([0, 0, 0], 0) {
-            return result;
+        // If just the root is given return that.
+        if keys.len() == 1 && *keys.first().unwrap() == MortonKey::root() {
+            return keys.to_vec();
         }
 
-        let deepest_first = MortonKey::from_index_and_level([0, 0, 0], DEEPEST_LEVEL as usize);
-        let deepest_last = MortonKey::from_index_and_level(
-            [
-                LEVEL_SIZE as usize - 1,
-                LEVEL_SIZE as usize - 1,
-                LEVEL_SIZE as usize - 1,
-            ],
-            DEEPEST_LEVEL as usize,
-        );
+        let deepest_first = MortonKey::deepest_first();
+        let deepest_last = MortonKey::deepest_last();
 
         // If the first key is not an ancestor of the deepest possible first element in the
         // tree get the finest ancestor between the two and use the first child of that.
@@ -646,13 +716,7 @@
                     );
                 }
             }
-            new_work_list.extend_from_slice(
-                keys.iter()
-                    .copied()
-                    .filter(|&key| key.level() == level - 1)
-                    .collect_vec()
-                    .as_slice(),
-            );
+            new_work_list.extend(keys.iter().copied().filter(|&key| key.level() == level - 1));
 
             work_list = new_work_list;
             // Now extend the work list with the
@@ -961,8 +1025,6 @@ mod test {
 
         let keys = children[1].fill_between_keys(children[2]);
         assert!(keys.is_empty());
-
-        // Correct result for two keys at deepest level
     }
 
     #[test]
@@ -1011,13 +1073,13 @@ mod test {
 
         let keys = [key1, key2, key3];
 
-        let complete_region = MortonKey::complete_region(keys.as_slice());
+        let complete_region = MortonKey::complete_tree(keys.as_slice());
 
         sanity_checks(keys.as_slice(), complete_region.as_slice());
 
-        // For an empty slice the complete region method should just add the root of the tree.
+        // For an empty slice the complete tree method should just add the root of the tree.
 
         let keys = Vec::<MortonKey>::new();
-        let complete_region = MortonKey::complete_region(keys.as_slice());
+        let complete_region = MortonKey::complete_tree(keys.as_slice());
 
         assert_eq!(complete_region.len(), 1);
         sanity_checks(keys.as_slice(), complete_region.as_slice());
@@ -1026,7 +1088,7 @@ mod test {
 
         let keys = [MortonKey::deepest_first(), MortonKey::deepest_last()];
 
-        let complete_region = MortonKey::complete_region(keys.as_slice());
+        let complete_region = MortonKey::complete_tree(keys.as_slice());
 
         sanity_checks(keys.as_slice(), complete_region.as_slice());
     }
@@ -1222,7 +1284,7 @@ mod test {
     pub fn test_from_physical_point() {
         let bounding_box = PhysicalBox::new([-2.0, -3.0, -1.0, 4.0, 5.0, 6.0]);
 
-        let point = [1.5, -2.5, 5.0];
+        let point = Point::new([1.5, -2.5, 5.0], 0);
 
         let level = 10;
         let key = MortonKey::from_physical_point(point, &bounding_box, level);
@@ -1231,10 +1293,79 @@ mod test {
 
         let coords = physical_box.coordinates();
 
-        assert!(coords[0] <= point[0] && point[0] < coords[3]);
-        assert!(coords[1] <= point[1] && point[1] < coords[4]);
-        assert!(coords[2] <= point[2] && point[2] < coords[5]);
+        assert!(coords[0] <= point.coords()[0] && point.coords()[0] < coords[3]);
+        assert!(coords[1] <= point.coords()[1] && point.coords()[1] < coords[4]);
+        assert!(coords[2] <= point.coords()[2] && point.coords()[2] < coords[5]);
 
         // Now compute the box.
     }
+
+    #[test]
+    pub fn test_child_index() {
+        let key = MortonKey::from_index_and_level([1, 501, 718], 10);
+
+        let children = key.children();
+
+        for (index, child) in children.iter().enumerate() {
+            assert_eq!(index, child.child_index());
+        }
+    }
+
+    #[test]
+    pub fn test_finest_outer_descendent() {
+        let key = MortonKey::from_index_and_level([0, 0, 0], 1);
+
+        let finest_outer_descendent = key.finest_outer_descendent();
+
+        assert_eq!(
+            finest_outer_descendent,
+            MortonKey::from_index_and_level([0, 0, 0], DEEPEST_LEVEL as usize)
+        );
+
+        let key = MortonKey::from_index_and_level([1, 1, 0], 1);
+        let finest_outer_descendent = key.finest_outer_descendent();
+
+        assert_eq!(
+            finest_outer_descendent,
+            MortonKey::from_index_and_level(
+                [LEVEL_SIZE as usize - 1, LEVEL_SIZE as usize - 1, 0],
+                DEEPEST_LEVEL as usize
+            )
+        );
+    }
+
+    #[test]
+    pub fn test_next_nondescendent_key() {
+        let key = MortonKey::from_index_and_level([25, 17, 6], 5);
+
+        let children = key.children();
+
+        // Check the next nondescendent key for the first seven children.
+
+        for (child, next_child) in children.iter().tuple_windows() {
+            let next_key = child.next_non_descendent_key().unwrap();
+            assert_eq!(next_key.level(), DEEPEST_LEVEL as usize);
+            assert!(!child.is_ancestor(next_key));
+            assert!(next_child.is_ancestor(next_key));
+        }
+
+        // Now check the next nondescendent key from the last child.
+
+        let next_child = children.last().unwrap().next_non_descendent_key();
+
+        // Check that the next nondescendent key from the parent is the same as that of the last child.
+
+        assert_eq!(key.next_non_descendent_key(), next_child);
+
+        // Check that it is not a descendent of the parent and that its level is correct.
+
+        assert_eq!(next_child.unwrap().level(), DEEPEST_LEVEL as usize);
+        assert!(!key.is_ancestor(next_child.unwrap()));
+
+        // Finally make sure that an ancestor of deepest last returns None.
+
+        assert!(MortonKey::deepest_last()
+            .next_non_descendent_key()
+            .is_none());
+    }
 }
diff --git a/src/octree.rs b/src/octree.rs
index e68297d..5dbd076 100644
--- a/src/octree.rs
+++ b/src/octree.rs
@@ -1,325 +1,168 @@
-//! Definition of a linear octree
+//! Definition of Octree.
 
+pub mod parallel;
+use std::collections::HashMap;
+
+use mpi::traits::CommunicatorCollectives;
+pub use parallel::*;
+use rand::SeedableRng;
+use rand_chacha::ChaCha8Rng;
 use crate::{
-    constants::{DEEPEST_LEVEL, NLEVELS},
-    geometry::PhysicalBox,
+    constants::DEEPEST_LEVEL,
+    geometry::{PhysicalBox, Point},
     morton::MortonKey,
 };
-use bytemuck;
-use std::collections::HashMap;
-use vtkio;
-
-/// A neighbour
-pub struct Neighbour {
-    /// Direction
-    pub direction: [i64; 3],
-    /// Level
-    pub level: usize,
-    /// Morton key
-    pub key: MortonKey,
+
+/// Stores what type of key it is.
+#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
+pub enum KeyType {
+    /// A local leaf.
+    LocalLeaf,
+    /// A local interior key.
+    LocalInterior,
+    /// A global key.
+    Global,
+    /// A ghost key from a specific process.
+    Ghost(usize),
 }
 
-/// An octree
-pub struct Octree {
-    leaf_keys: Vec<MortonKey>,
-    points: Vec<[f64; 3]>,
-    point_to_level_keys: [Vec<MortonKey>; NLEVELS],
+/// A general structure for octrees.
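+///
+/// A usage sketch, mirroring the `mpi_complete_tree` example in this patch
+/// (`generate_random_points` is the helper from the `tools` module; marked
+/// `ignore` since it needs a running MPI environment):
+///
+/// ```ignore
+/// let universe = mpi::initialize().unwrap();
+/// let comm = universe.world();
+/// let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64);
+/// let points = generate_random_points(10000, &mut rng, &comm);
+/// // Refine to at most level 15 with at most 50 points per leaf.
+/// let tree = Octree::new(&points, 15, 50, &comm);
+/// ```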
+pub struct Octree<'o, C> {
+    points: Vec<Point>,
+    point_keys: Vec<MortonKey>,
+    coarse_tree: Vec<MortonKey>,
+    leaf_tree: Vec<MortonKey>,
+    coarse_tree_bounds: Vec<MortonKey>,
+    all_keys: HashMap<MortonKey, KeyType>,
     bounding_box: PhysicalBox,
-    key_counts: HashMap<MortonKey, usize>,
-    max_leaf_level: usize,
-    max_points_in_leaf: usize,
+    comm: &'o C,
 }
 
-impl Octree {
-    /// Create octress from points
-    pub fn from_points(points: &[f64], max_level: usize, max_points_per_box: usize) -> Self {
-        // Make sure that the points array is a multiple of 3.
-        assert_eq!(points.len() % 3, 0);
-
-        // Make sure that max level never exceeds DEEPEST_LEVEL
-        let max_level = if max_level > DEEPEST_LEVEL as usize {
-            DEEPEST_LEVEL as usize
-        } else {
-            max_level
-        };
+impl<'o, C: CommunicatorCollectives> Octree<'o, C> {
+    /// Create a new distributed Octree.
+    pub fn new(points: &[Point], max_level: usize, max_leaf_points: usize, comm: &'o C) -> Self {
+        // We need a random number generator for sorting. For simplicity we use a ChaCha8 random number generator
+        // seeded with the rank of the process.
+        let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64);
 
-        // Compute the physical bounding box.
+        // First compute the Morton keys of the points.
+        let (point_keys, bounding_box) = points_to_morton(points, DEEPEST_LEVEL as usize, comm);
 
-        let bounding_box = PhysicalBox::from_points(points);
+        // Generate the coarse tree.
 
-        // Bunch the points in arrays of 3.
+        let (coarse_tree, leaf_tree) = {
+            // Linearize the keys.
+            let linear_keys = linearize(&point_keys, &mut rng, comm);
 
-        let points: &[[f64; 3]] = bytemuck::cast_slice(points);
-        let npoints = points.len();
+            // Compute the first version of the coarse tree without load balancing.
+            // We want to ensure that it is 2:1 balanced.
+            let coarse_tree = compute_coarse_tree(&linear_keys, comm);
 
-        // We create a vector of keys for each point on each level. We compute the
-        // keys on the deepest level and fill the other levels by going from
-        // parent to parent.
+            let coarse_tree = balance(&coarse_tree, &mut rng, comm);
+            debug_assert!(is_complete_linear_tree(&coarse_tree, comm));
 
-        let mut point_to_level_keys: [Vec<MortonKey>; NLEVELS] = Default::default();
-        point_to_level_keys[DEEPEST_LEVEL as usize] = points
-            .iter()
-            .map(|&point| {
-                MortonKey::from_physical_point(point, &bounding_box, DEEPEST_LEVEL as usize)
-            })
-            .collect::<Vec<_>>();
+            // We now compute the weights for the initial coarse tree.
 
-        for index in (1..=DEEPEST_LEVEL as usize).rev() {
-            let mut new_vec = Vec::<MortonKey>::with_capacity(npoints);
-            for &key in &point_to_level_keys[index] {
-                new_vec.push(key.parent());
-            }
-            point_to_level_keys[index - 1] = new_vec;
-        }
+            let weights = compute_coarse_tree_weights(&linear_keys, &coarse_tree, comm);
 
-        // We now have to create level keys. We are starting at the root and recursing
-        // down until each box has fewer than max_points_per_box keys.
+            // We now load balance the initial coarse tree. This forms our final coarse tree
+            // that is used from now on.
 
-        // First we compute the counts of each key on each level. For that we create
-        // for each level a Hashmap for the keys and then add up.
+            let coarse_tree = load_balance(&coarse_tree, &weights, comm);
+            // We also want to redistribute the fine keys with respect to the load balanced coarse tree.
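+            // This keeps every fine key on the rank that owns its coarse ancestor.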
-        let mut key_counts: HashMap<MortonKey, usize> = Default::default();
+            let fine_keys =
+                redistribute_with_respect_to_coarse_tree(&linear_keys, &coarse_tree, comm);
 
-        for keys in &point_to_level_keys {
-            for key in keys {
-                *key_counts.entry(*key).or_default() += 1;
-            }
-        }
+            // We now create the refined tree by recursing the coarse tree until we are at max level
+            // or the fine tree keys per coarse tree box is small enough.
+            let refined_tree =
+                create_local_tree(&fine_keys, &coarse_tree, max_level, max_leaf_points);
 
-        // We can now easily create an adaptive tree by subdividing. We do this by
-        // a recursive function.
-
-        let mut leaf_keys = Vec::<MortonKey>::new();
-
-        fn recurse_keys(
-            key: MortonKey,
-            key_counts: &HashMap<MortonKey, usize>,
-            leaf_keys: &mut Vec<MortonKey>,
-            max_points_per_box: usize,
-            max_level: usize,
-        ) {
-            let level = key.level();
-            // A key may have not be associated with points. This happens if one of the children on
-            // the previous level has no points in its physical box. However, we want to create a
-            // complete tree. So we still add this one empty child.
-            if let Some(&count) = key_counts.get(&key) {
-                if count > max_points_per_box && level < max_level {
-                    for child in key.children() {
-                        recurse_keys(child, key_counts, leaf_keys, max_points_per_box, max_level);
-                    }
-                } else {
-                    leaf_keys.push(key)
-                }
-            } else {
-                leaf_keys.push(key)
-            }
-        }
+            // We now need to 2:1 balance the refined tree and then redistribute again with respect to the coarse tree.
+
+            let refined_tree = redistribute_with_respect_to_coarse_tree(
+                &balance(&refined_tree, &mut rng, comm),
+                &coarse_tree,
+                comm,
+            );
+
+            (coarse_tree, refined_tree)
 
-        // Now execute the recursion starting from root
+            // redistribute the balanced tree according to coarse tree
+        };
 
-        recurse_keys(
-            MortonKey::root(),
-            &key_counts,
-            &mut leaf_keys,
-            max_points_per_box,
-            max_level,
+        let (points, point_keys) = redistribute_points_with_respect_to_coarse_tree(
+            points,
+            &point_keys,
+            &coarse_tree,
+            comm,
         );
 
-        // The leaf keys are now a complete linear tree. But they are not yet balanced.
-        // In the final step we balance the leafs.
+        let coarse_tree_bounds = get_tree_bins(&coarse_tree, comm);
 
-        let leaf_keys = MortonKey::balance(&leaf_keys, MortonKey::root());
+        // Duplicate the coarse tree across all nodes
 
-        let mut max_leaf_level = 0;
-        let mut max_points_in_leaf = 0;
+        // let coarse_tree = gather_to_all(&coarse_tree, comm);
 
-        for key in &leaf_keys {
-            max_leaf_level = max_leaf_level.max(key.level());
-            max_points_in_leaf =
-                max_points_in_leaf.max(if let Some(&count) = key_counts.get(key) {
-                    count
-                } else {
-                    0
-                });
-        }
+        let all_keys = generate_all_keys(&leaf_tree, &coarse_tree, &coarse_tree_bounds, comm);
 
         Self {
-            leaf_keys,
             points: points.to_vec(),
-            point_to_level_keys,
+            point_keys,
+            coarse_tree,
+            leaf_tree,
+            coarse_tree_bounds,
+            all_keys,
             bounding_box,
-            key_counts,
-            max_leaf_level,
-            max_points_in_leaf,
+            comm,
         }
     }
 
-    /// Leaf keys
-    pub fn leaf_keys(&self) -> &Vec<MortonKey> {
-        &self.leaf_keys
-    }
-
-    /// Points
-    pub fn points(&self) -> &Vec<[f64; 3]> {
-        &self.points
-    }
-
-    /// Get level keys for each point
-    pub fn point_to_level_keys(&self) -> &[Vec<MortonKey>; NLEVELS] {
-        &self.point_to_level_keys
+    /// Return the keys associated with the redistributed points.
+    pub fn point_keys(&self) -> &Vec<MortonKey> {
+        &self.point_keys
     }
 
-    /// Bounding box
+    /// Return the bounding box.
     pub fn bounding_box(&self) -> &PhysicalBox {
         &self.bounding_box
     }
 
-    /// Maximum leaf level
-    pub fn maximum_leaf_level(&self) -> usize {
-        self.max_leaf_level
+    /// Return the associated coarse tree.
+    pub fn coarse_tree(&self) -> &Vec<MortonKey> {
+        &self.coarse_tree
     }
 
-    /// Maximum number of points in a leaf box
-    pub fn max_points_in_leaf_box(&self) -> usize {
-        self.max_points_in_leaf
-    }
-
-    /// Number of points in the box indexed by a key
-    pub fn number_of_points_in_key(&self, key: MortonKey) -> usize {
-        if let Some(&count) = self.key_counts.get(&key) {
-            count
-        } else {
-            0
-        }
+    /// Return the points.
+    ///
+    /// Points are distributed across the nodes as part of the tree generation.
+    /// This function returns the redistributed points.
+    pub fn points(&self) -> &Vec<Point> {
+        &self.points
     }
 
-    /// Export the tree to vtk
-    pub fn export_to_vtk(&self, file_path: &str) {
-        use vtkio::model::{
-            Attributes, ByteOrder, CellType, Cells, DataSet, IOBuffer, UnstructuredGridPiece,
-            Version, VertexNumbers,
-        };
-
-        // Each box has 8 corners with 3 coordinates each, hence 24 floats per key.
-        let mut points = Vec::<f64>::new();
-        // 8 coords per box, hence 8 * nkeys values in connectivity.
-        let mut connectivity = Vec::<u64>::new();
-        // Store the vtk offset for each box.
-        let mut offsets = Vec::<u64>::new();
-
-        let bounding_box = self.bounding_box();
-
-        // Go through the keys and add coordinates and connectivity.
-        // Box coordinates are already in the right order, so connectivity
-        // just counts up. We don't mind doubly counted vertices from two boxes.
-        let mut point_count = 0;
-        let mut key_count = 0;
-
-        for key in self.leaf_keys().iter() {
-            // We only want to export non-empty boxes.
-            if self.number_of_points_in_key(*key) == 0 {
-                continue;
-            }
-            let coords = key.physical_box(bounding_box).corners();
-
-            key_count += 1;
-            offsets.push(8 * key_count);
-
-            for coord in &coords {
-                points.push(coord[0]);
-                points.push(coord[1]);
-                points.push(coord[2]);
-
-                connectivity.push(point_count);
-                point_count += 1;
-            }
-        }
-
-        let vtk_file = vtkio::Vtk {
-            version: Version::new((1, 0)),
-            title: String::new(),
-            byte_order: ByteOrder::LittleEndian,
-            file_path: None,
-            data: DataSet::inline(UnstructuredGridPiece {
-                points: IOBuffer::F64(points),
-                cells: Cells {
-                    cell_verts: VertexNumbers::XML {
-                        connectivity,
-                        offsets,
-                    },
-                    types: vec![CellType::Hexahedron; key_count as usize],
-                },
-                data: Attributes {
-                    point: vec![],
-                    cell: vec![],
-                },
-            }),
-        };
-
-        vtk_file.export_ascii(file_path).unwrap();
+    /// Return the leaf tree.
+    pub fn leaf_tree(&self) -> &Vec<MortonKey> {
+        &self.leaf_tree
     }
 
-    // We can now create the vtk object.
-}
-
-#[cfg(test)]
-mod test {
-    use super::Octree;
-    use rand::prelude::*;
-
-    fn get_points_on_sphere(npoints: usize) -> Vec<f64> {
-        let mut rng = rand::rngs::StdRng::seed_from_u64(0);
-        let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
-
-        let mut points = Vec::<f64>::with_capacity(3 * npoints);
-        for _ in 0..(npoints) {
-            let x: f64 = normal.sample(&mut rng);
-            let y: f64 = normal.sample(&mut rng);
-            let z: f64 = normal.sample(&mut rng);
-
-            let norm = (x * x + y * y + z * z).sqrt();
-
-            points.push(x / norm);
-            points.push(y / norm);
-            points.push(z / norm);
-        }
-
-        points
+    /// Get the coarse tree bounds.
+    ///
+    /// This returns an array whose length is the number of ranks,
+    /// where each element is the smallest Morton key on
+    /// the corresponding rank.
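+    ///
+    /// Together with `get_key_index` this can be used to look up the rank
+    /// that owns a given key.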
+    pub fn coarse_tree_bounds(&self) -> &Vec<MortonKey> {
+        &self.coarse_tree_bounds
     }
 
-    #[test]
-    fn test_octree() {
-        use std::time::Instant;
-
-        let npoints = 1000000;
-        let points = get_points_on_sphere(npoints);
-        let max_level = 7;
-        let max_points_per_box = 100;
-
-        let start = Instant::now();
-        let octree = Octree::from_points(&points, max_level, max_points_per_box);
-        let duration = start.elapsed();
-
-        println!("Creation time: {}", duration.as_millis());
-        println!("Number of leaf keys: {}", octree.leaf_keys().len());
-        println!("Bounding box: {}", octree.bounding_box());
+    /// Return the communicator.
+    pub fn comm(&self) -> &C {
+        self.comm
     }
 
-    #[test]
-    fn test_export() {
-        let fname = "_test_sphere.vtk";
-        let npoints = 1000000;
-        let points = get_points_on_sphere(npoints);
-        let max_level = 7;
-        let max_points_per_box = 100;
-
-        let octree = Octree::from_points(&points, max_level, max_points_per_box);
-
-        octree.export_to_vtk(fname);
-        println!("Maximum leaf level: {}", octree.maximum_leaf_level());
-        println!(
-            "Maximum number of points in leaf box: {}",
-            octree.max_points_in_leaf_box()
-        );
+    /// Return a map of all keys.
+    pub fn all_keys(&self) -> &HashMap<MortonKey, KeyType> {
+        &self.all_keys
     }
 }
diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs
new file mode 100644
index 0000000..4a8a80f
--- /dev/null
+++ b/src/octree/parallel.rs
@@ -0,0 +1,1033 @@
+//! Parallel Octree structure
+
+use std::collections::{HashMap, HashSet};
+
+use crate::{
+    constants::{DEEPEST_LEVEL, NSIBLINGS},
+    geometry::{PhysicalBox, Point},
+    morton::MortonKey,
+    parsort::parsort,
+    tools::{
+        communicate_back, gather_to_all, gather_to_root, global_inclusive_cumsum, redistribute,
+        sort_to_bins,
+    },
+};
+
+use mpi::traits::{Equivalence, Root};
+
+use itertools::{izip, Itertools};
+use mpi::{collective::SystemOperation, traits::CommunicatorCollectives};
+use rand::Rng;
+
+use super::KeyType;
+
+/// Compute the global bounding box across all points on all processes.
+pub fn compute_global_bounding_box<C: CommunicatorCollectives>(
+    points: &[Point],
+    comm: &C,
+) -> PhysicalBox {
+    // Now compute the minimum and maximum across each dimension.
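+    // The local extrema computed here are combined into global extrema below
+    // via `all_reduce_into` with min/max operations.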
+
+    let mut xmin = f64::MAX;
+    let mut xmax = f64::MIN;
+
+    let mut ymin = f64::MAX;
+    let mut ymax = f64::MIN;
+
+    let mut zmin = f64::MAX;
+    let mut zmax = f64::MIN;
+
+    for point in points {
+        let x = point.coords()[0];
+        let y = point.coords()[1];
+        let z = point.coords()[2];
+
+        xmin = f64::min(xmin, x);
+        xmax = f64::max(xmax, x);
+
+        ymin = f64::min(ymin, y);
+        ymax = f64::max(ymax, y);
+
+        zmin = f64::min(zmin, z);
+        zmax = f64::max(zmax, z);
+    }
+
+    let mut global_xmin = 0.0;
+    let mut global_xmax = 0.0;
+
+    let mut global_ymin = 0.0;
+    let mut global_ymax = 0.0;
+
+    let mut global_zmin = 0.0;
+    let mut global_zmax = 0.0;
+
+    comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min());
+    comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max());
+
+    comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min());
+    comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max());
+
+    comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min());
+    comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max());
+
+    let xdiam = global_xmax - global_xmin;
+    let ydiam = global_ymax - global_ymin;
+    let zdiam = global_zmax - global_zmin;
+
+    let xmean = global_xmin + 0.5 * xdiam;
+    let ymean = global_ymin + 0.5 * ydiam;
+    let zmean = global_zmin + 0.5 * zdiam;
+
+    // We increase diameters by box size on deepest level
+    // and use the maximum diameter to compute a
+    // cubic bounding box.
+
+    let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64;
+
+    let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap();
+
+    let max_diam = max_diam * (1.0 + deepest_box_diam);
+
+    PhysicalBox::new([
+        xmean - 0.5 * max_diam,
+        ymean - 0.5 * max_diam,
+        zmean - 0.5 * max_diam,
+        xmean + 0.5 * max_diam,
+        ymean + 0.5 * max_diam,
+        zmean + 0.5 * max_diam,
+    ])
+}
+
+/// Convert points to Morton keys on specified level.
+pub fn points_to_morton<C: CommunicatorCollectives>(
+    points: &[Point],
+    max_level: usize,
+    comm: &C,
+) -> (Vec<MortonKey>, PhysicalBox) {
+    // Make sure that max level never exceeds DEEPEST_LEVEL.
+    let max_level = if max_level > DEEPEST_LEVEL as usize {
+        DEEPEST_LEVEL as usize
+    } else {
+        max_level
+    };
+
+    // Compute the physical bounding box.
+
+    let bounding_box = compute_global_bounding_box(points, comm);
+
+    // Map each point to its Morton key on the chosen level.
+
+    let keys = points
+        .iter()
+        .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level))
+        .collect_vec();
+
+    (keys, bounding_box)
+}
+
+/// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree.
+/// The result can subsequently be load balanced according to the number of linear keys in each coarse block.
+pub fn compute_coarse_tree<C: CommunicatorCollectives>(
+    linear_keys: &[MortonKey],
+    comm: &C,
+) -> Vec<MortonKey> {
+    let size = comm.size();
+
+    debug_assert!(is_linear_tree(linear_keys, comm));
+
+    // On a single node a complete coarse tree is simply the root.
+    if size == 1 {
+        return vec![MortonKey::root()];
+    }
+
+    let mut completed_region = linear_keys
+        .first()
+        .unwrap()
+        .fill_between_keys(*linear_keys.last().unwrap());
+
+    completed_region.insert(0, *linear_keys.first().unwrap());
+    completed_region.push(*linear_keys.last().unwrap());
+
+    // Get the smallest level members of the completed region.
+
+    let min_level = completed_region
+        .iter()
+        .map(|elem| elem.level())
+        .min()
+        .unwrap();
+
+    // Each process selects its largest boxes. These are used to create
+    // a coarse tree.
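+    // These coarsest boxes act as seeds; `complete_tree` below fills the gaps
+    // between the seeds across all ranks.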
+
+    let largest_boxes = completed_region
+        .iter()
+        .filter(|elem| elem.level() == min_level)
+        .copied()
+        .collect_vec();
+
+    debug_assert!(is_linear_tree(&largest_boxes, comm));
+
+    complete_tree(&largest_boxes, comm)
+}
+
+/// Compute the weights of each coarse tree block as the number of linear keys associated with each coarse block.
+pub fn compute_coarse_tree_weights<C: CommunicatorCollectives>(
+    linear_keys: &[MortonKey],
+    coarse_tree: &[MortonKey],
+    comm: &C,
+) -> Vec<usize> {
+    let rank = comm.rank();
+    // We want to partition the coarse tree. But we need the correct weights. The idea
+    // is that we use the number of original leafs that intersect with each coarse tree
+    // key as its weight. In order to compute this we send the coarse tree around to all processes
+    // so that each process computes how many of its keys intersect with each node of the
+    // coarse tree. We then sum up the local weight for each
+    // coarse tree node across all nodes to get the weight.
+
+    let global_coarse_tree = gather_to_all(coarse_tree, comm);
+
+    // We also want to send around a corresponding array of ranks so that for each global coarse tree key
+    // we have the rank of where it originates from.
+
+    let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm);
+
+    // We now compute the local weights.
+    let mut local_weight_contribution = vec![0; global_coarse_tree.len()];
+
+    // In the following loop we want to be a bit smart. We do not iterate through all the local elements.
+    // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region
+    // of our sorted keys that overlaps with the coarse tree region.
+
+    // Let's find the start of our region. The start of our region is a coarse key that is an ancestor
+    // of our current key. This works because the coarse tree has levels at most as high as the sorted keys.
+
+    let first_key = *linear_keys.first().unwrap();
+
+    let first_coarse_index = global_coarse_tree
+        .iter()
+        .take_while(|coarse_key| !coarse_key.is_ancestor(first_key))
+        .count();
+
+    // Now we need to find the end index of our region. For this again we find the index of our coarse tree that
+    // is an ancestor of our last key.
+    let last_key = *linear_keys.last().unwrap();
+
+    let last_coarse_index = global_coarse_tree
+        .iter()
+        .take_while(|coarse_key| !coarse_key.is_ancestor(last_key))
+        .count();
+
+    // We now only need to iterate between the first and last coarse index in the coarse tree.
+    // In the way we have computed the indices, the last coarse index is inclusive (it is the ancestor of our last key).
+
+    for (w, &global_coarse_key) in izip!(
+        local_weight_contribution[first_coarse_index..=last_coarse_index].iter_mut(),
+        global_coarse_tree[first_coarse_index..=last_coarse_index].iter()
+    ) {
+        *w += linear_keys
+            .iter()
+            .filter(|&&key| global_coarse_key.is_ancestor(key))
+            .count();
+    }
+
+    // We now need to sum up the weights across all processes.
+
+    let mut global_weights = vec![0; global_coarse_tree.len()];
+
+    comm.all_reduce_into(
+        &local_weight_contribution,
+        &mut global_weights,
+        SystemOperation::sum(),
+    );
+
+    // Each process now has all weights. However, we only need the ones for the current process.
+    // So we just filter the rest out.
+
+    izip!(coarse_tree_ranks, global_weights)
+        .filter_map(|(r, weight)| {
+            if r == rank as usize {
+                Some(weight)
+            } else {
+                None
+            }
+        })
+        .collect_vec()
+}
+
+/// Redistribute sorted keys with respect to a linear coarse tree.
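+///
+/// Afterwards each rank holds exactly those keys that are descendents of its
+/// local coarse tree keys.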
+pub fn redistribute_with_respect_to_coarse_tree<C: CommunicatorCollectives>(
+    linear_keys: &[MortonKey],
+    coarse_tree: &[MortonKey],
+    comm: &C,
+) -> Vec<MortonKey> {
+    let size = comm.size();
+
+    if size == 1 {
+        return linear_keys.to_vec();
+    }
+
+    // We want to globally redistribute keys so that the keys on each process are descendents
+    // of the local coarse tree keys.
+
+    // We are using here the fact that the coarse tree is complete and sorted.
+    // We are sending around to each process the first local index. This
+    // defines bins in which we sort our keys. The keys are then sent around to the correct
+    // processes via an alltoallv operation.
+
+    let my_first = coarse_tree.first().unwrap();
+
+    let global_bins = gather_to_all(std::slice::from_ref(my_first), comm);
+
+    // We now have our bins. We go through our keys and store how
+    // many keys are assigned to each rank. We are using here that
+    // our keys and the coarse tree are both sorted.
+
+    // This will store for each rank how many keys will be assigned to it.
+
+    let rank_counts = sort_to_bins(linear_keys, &global_bins)
+        .iter()
+        .map(|&elem| elem as i32)
+        .collect_vec();
+
+    // We now have the counts for each rank. Let's redistribute accordingly and return.
+
+    let result = redistribute(linear_keys, &rank_counts, comm);
+
+    #[cfg(debug_assertions)]
+    {
+        // Check that the first and last key of the result are descendents
+        // of the first and last coarse block.
+        debug_assert!(coarse_tree
+            .first()
+            .unwrap()
+            .is_ancestor(*result.first().unwrap()));
+        debug_assert!(coarse_tree
+            .last()
+            .unwrap()
+            .is_ancestor(*result.last().unwrap()));
+    }
+
+    result
+}
+
+/// Return a complete tree generated from local keys and associated coarse keys.
+///
+/// The coarse keys are refined until the maximum level is reached or until each coarse key
+/// is the ancestor of at most `max_keys` fine keys.
+/// It is assumed that the level of the fine keys is at least as large as `max_level`.
+pub fn create_local_tree(
+    sorted_fine_keys: &[MortonKey],
+    coarse_keys: &[MortonKey],
+    mut max_level: usize,
+    max_keys: usize,
+) -> Vec<MortonKey> {
+    if max_level > DEEPEST_LEVEL as usize {
+        max_level = DEEPEST_LEVEL as usize;
+    }
+
+    // We split the sorted fine keys into subslices so that each subslice
+    // is associated with a coarse slice.
+
+    let bins = coarse_keys.to_vec();
+
+    let counts = sort_to_bins(sorted_fine_keys, &bins);
+
+    // We now know how many fine keys are associated with each coarse block. We iterate
+    // through and locally refine for each block that requires it.
+
+    let mut remainder = sorted_fine_keys;
+    let mut refined_keys = Vec::<MortonKey>::new();
+
+    for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) {
+        let current;
+        (current, remainder) = remainder.split_at(count);
+        if coarse_key.level() < max_level && current.len() > max_keys {
+            // We need to refine the current split.
+            refined_keys.extend_from_slice(
+                create_local_tree(
+                    current,
+                    coarse_key.children().as_slice(),
+                    max_level,
+                    max_keys,
+                )
+                .as_slice(),
+            );
+        } else {
+            refined_keys.push(coarse_key)
+        }
+    }
+
+    refined_keys
+}
+
+/// Linearize a distributed set of Morton keys.
+pub fn linearize<C: CommunicatorCollectives, R: Rng + ?Sized>(
+    keys: &[MortonKey],
+    rng: &mut R,
+    comm: &C,
+) -> Vec<MortonKey> {
+    let size = comm.size();
+    let rank = comm.rank();
+
+    // If we only have one process we use the standard serial linearization.
+
+    if size == 1 {
+        return MortonKey::linearize(keys);
+    }
+
+    // We are first sorting the keys. Then in a linear process across all processors we
+    // go through the arrays and delete ancestors of nodes.
+
+    let sorted_keys = parsort(keys, comm, rng);
+
+    // Each process needs to send its first element to the previous process. Each process
+    // then goes through its own list and retains elements that are not ancestors of the
+    // next element.
+
+    let mut result = Vec::<MortonKey>::new();
+
+    let next_key = communicate_back(&sorted_keys, comm);
+
+    // Treat the local keys.
+    for (&m1, &m2) in sorted_keys.iter().tuple_windows() {
+        // m1 is also an ancestor of m2 if they are identical.
+        if m1.is_ancestor(m2) {
+            continue;
+        } else {
+            result.push(m1);
+        }
+    }
+
+    // If we are at the last process simply push the last key.
+    // Otherwise check whether it might be the ancestor of `next_key`,
+    // the first key on the next process. If yes, don't push it. Otherwise do.
+
+    if rank == size - 1 {
+        result.push(*sorted_keys.last().unwrap());
+    } else {
+        let last = *sorted_keys.last().unwrap();
+        if !last.is_ancestor(next_key.unwrap()) {
+            result.push(last);
+        }
+    }
+
+    debug_assert!(is_linear_tree(&result, comm));
+
+    result
+}
+
+/// Balance a sorted list of Morton keys across processors given an array of corresponding weights.
+pub fn load_balance<C: CommunicatorCollectives>(
+    sorted_keys: &[MortonKey],
+    weights: &[usize],
+    comm: &C,
+) -> Vec<MortonKey> {
+    assert_eq!(sorted_keys.len(), weights.len());
+
+    let size = comm.size();
+    let rank = comm.rank();
+
+    // If we only have one process we simply return.
+
+    if size == 1 {
+        return sorted_keys.to_vec();
+    }
+
+    // First scan the weights.
+    // We scan the local arrays, then use a global scan operation on the last element
+    // of each array to get the global sums and then we update the array of each rank
+    // with the sum from the previous ranks.
+
+    let scan = global_inclusive_cumsum(weights, comm);
+
+    // Now broadcast the total weight to all processes.
+
+    let mut total_weight = if rank == size - 1 {
+        *scan.last().unwrap()
+    } else {
+        0
+    };
+
+    comm.process_at_rank(size - 1)
+        .broadcast_into(&mut total_weight);
+
+    let w = total_weight / (size as usize);
+    let k = total_weight % (size as usize);
+
+    // Sort the elements into bins according to which process they should be sent.
+    // We do not need to sort the Morton keys themselves into bins but the scanned weights.
+    // The corresponding counts are the right counts for the Morton keys.
+
+    let mut bins = Vec::<usize>::with_capacity(size as usize);
+
+    for p in 1..=size as usize {
+        if p <= k {
+            bins.push((p - 1) * (1 + w));
+        } else {
+            bins.push((p - 1) * w + k);
+        }
+    }
+
+    let counts = sort_to_bins(&scan, &bins)
+        .iter()
+        .map(|elem| *elem as i32)
+        .collect_vec();
+
+    // Now distribute the data with an all to all v.
+    // We create a vector of how many elements to send to each process and
+    // then send the actual data.
+
+    let mut recvbuffer = redistribute(sorted_keys, &counts, comm);
+
+    recvbuffer.sort_unstable();
+    recvbuffer
+}
+
+/// Given a distributed set of linear keys, generate a complete tree.
+pub fn complete_tree<C: CommunicatorCollectives>(
+    linear_keys: &[MortonKey],
+    comm: &C,
+) -> Vec<MortonKey> {
+    let mut linear_keys = linear_keys.to_vec();
+
+    debug_assert!(is_linear_tree(&linear_keys, comm));
+
+    let size = comm.size();
+    let rank = comm.rank();
+
+    if size == 1 {
+        return MortonKey::complete_tree(linear_keys.as_slice());
+    }
+
+    // Now insert on the first and last process the first and last child of the
+    // finest ancestor of first/last box on deepest level.
+
+    // Send first element to previous rank and insert into local keys.
+    // On the first process we also need to insert the first child of the finest
+    // ancestor of the deepest first key and first element. Correspondingly on the last process
+    // we need to insert the last child of the finest ancestor of the deepest last key and last element.
+
+    let next_key = communicate_back(&linear_keys, comm);
+
+    if rank < size - 1 {
+        linear_keys.push(next_key.unwrap());
+    }
+
+    // Now fix the first key on the first rank.
+
+    if rank == 0 {
+        let first_key = linear_keys.first().unwrap();
+        let deepest_first = MortonKey::deepest_first();
+        if !first_key.is_ancestor(deepest_first) {
+            let ancestor = deepest_first.finest_common_ancestor(*first_key);
+            linear_keys.insert(0, ancestor.children()[0]);
+        }
+    }
+
+    if rank == size - 1 {
+        let last_key = linear_keys.last().unwrap();
+        let deepest_last = MortonKey::deepest_last();
+        if !last_key.is_ancestor(deepest_last) {
+            let ancestor = deepest_last.finest_common_ancestor(*last_key);
+            linear_keys.push(ancestor.children()[NSIBLINGS - 1]);
+        }
+    }
+
+    // Now complete the regions defined by the keys on each process.
+
+    let mut result = Vec::<MortonKey>::new();
+
+    for (&key1, &key2) in linear_keys.iter().tuple_windows() {
+        result.push(key1);
+        result.extend_from_slice(key1.fill_between_keys(key2).as_slice());
+    }
+
+    if rank == size - 1 {
+        result.push(*linear_keys.last().unwrap());
+    }
+
+    debug_assert!(is_complete_linear_tree(&result, comm));
+
+    result
+}
+
+/// Balance a distributed tree.
+pub fn balance<C: CommunicatorCollectives, R: Rng + ?Sized>(
+    linear_keys: &[MortonKey],
+    rng: &mut R,
+    comm: &C,
+) -> Vec<MortonKey> {
+    // Treat the case that the length of the keys is one and is only the root.
+    // This would lead to an empty output below as we only iterate up to level 1.
+
+    if linear_keys.len() == 1 && *linear_keys.first().unwrap() == MortonKey::root() {
+        return vec![MortonKey::root()];
+    }
+
+    let deepest_level = deepest_level(linear_keys, comm);
+
+    // Start with keys at deepest level.
+    let mut work_list = linear_keys
+        .iter()
+        .copied()
+        .filter(|&key| key.level() == deepest_level)
+        .collect_vec();
+
+    let mut result = Vec::<MortonKey>::new();
+
+    // Now go through and make sure that for each key siblings and neighbours of parents are added.
+
+    for level in (1..=deepest_level).rev() {
+        let mut parents = HashSet::<MortonKey>::new();
+        let mut new_work_list = Vec::<MortonKey>::new();
+        // We filter the work list by level and also make sure that
+        // the sibling group of each parent is only added once to
+        // our current level list.
+        for key in work_list.iter() {
+            let parent = key.parent();
+            if !parents.contains(&parent) {
+                parents.insert(parent);
+                result.extend_from_slice(key.siblings().as_slice());
+                new_work_list.extend_from_slice(
+                    parent
+                        .neighbours()
+                        .iter()
+                        .copied()
+                        .filter(|&key| key.is_valid())
+                        .collect_vec()
+                        .as_slice(),
+                );
+            }
+        }
+        new_work_list.extend(
+            linear_keys
+                .iter()
+                .copied()
+                .filter(|&key| key.level() == level - 1),
+        );
+
+        work_list = new_work_list;
+    }
+
+    let result = linearize(&result, rng, comm);
+
+    debug_assert!(is_complete_linear_and_balanced(&result, comm));
+    result
+}
+
+/// Return true if the keys are linear.
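+///
+/// Linear means: globally sorted in ascending order, with no duplicates and
+/// no key an ancestor of another key.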
+pub fn is_linear_tree<C: CommunicatorCollectives>(arr: &[MortonKey], comm: &C) -> bool {
+    let mut is_linear = true;
+
+    for (&key1, &key2) in arr.iter().tuple_windows() {
+        if key1 >= key2 || key1.is_ancestor(key2) {
+            is_linear = false;
+            break;
+        }
+    }
+
+    if comm.size() == 1 {
+        return is_linear;
+    }
+
+    // Now check the interfaces.
+
+    if let Some(next_key) = communicate_back(arr, comm) {
+        let last = *arr.last().unwrap();
+        if last >= next_key || last.is_ancestor(next_key) {
+            is_linear = false;
+        }
+    }
+
+    let mut global_is_linear = false;
+
+    comm.all_reduce_into(
+        &is_linear,
+        &mut global_is_linear,
+        SystemOperation::logical_and(),
+    );
+
+    global_is_linear
+}
+
+/// Redistribute points with respect to a given coarse tree.
+pub fn redistribute_points_with_respect_to_coarse_tree<C: CommunicatorCollectives>(
+    points: &[Point],
+    morton_keys_for_points: &[MortonKey],
+    coarse_tree: &[MortonKey],
+    comm: &C,
+) -> (Vec<Point>, Vec<MortonKey>) {
+    if comm.size() == 1 {
+        return (points.to_vec(), morton_keys_for_points.to_vec());
+    }
+
+    pub fn argsort<T: Ord + Copy>(arr: &[T]) -> Vec<usize> {
+        let mut sort_indices = (0..arr.len()).collect_vec();
+        sort_indices.sort_unstable_by_key(|&index| arr[index]);
+        sort_indices
+    }
+
+    pub fn reorder<T: Copy>(arr: &[T], permutation: &[usize]) -> Vec<T> {
+        let mut reordered = Vec::<T>::with_capacity(arr.len());
+        for &index in permutation.iter() {
+            reordered.push(arr[index])
+        }
+        reordered
+    }
+
+    assert_eq!(points.len(), morton_keys_for_points.len());
+
+    let sort_indices = argsort(morton_keys_for_points);
+    let sorted_keys = reorder(morton_keys_for_points, &sort_indices);
+    let sorted_points = reorder(points, &sort_indices);
+
+    // Now get the bins.
+
+    let my_first = coarse_tree.first().unwrap();
+
+    let global_bins = gather_to_all(std::slice::from_ref(my_first), comm);
+
+    // We now sort the morton indices into the bins.
+
+    // This will store for each rank how many keys will be assigned to it.
+
+    let counts = sort_to_bins(&sorted_keys, &global_bins)
+        .iter()
+        .map(|&elem| elem as i32)
+        .collect_vec();
+
+    // We now redistribute the points and the corresponding keys.
+
+    let (distributed_points, distributed_keys) = (
+        redistribute(&sorted_points, &counts, comm),
+        redistribute(&sorted_keys, &counts, comm),
+    );
+
+    // Now sort the distributed points and keys internally again.
+
+    let sort_indices = argsort(&distributed_keys);
+    let sorted_keys = reorder(&distributed_keys, &sort_indices);
+    let sorted_points = reorder(&distributed_points, &sort_indices);
+
+    (sorted_points, sorted_keys)
+}
+
+/// Return true on all ranks if the distributed tree is complete. Otherwise, return false.
+pub fn is_complete_linear_tree<C: CommunicatorCollectives>(arr: &[MortonKey], comm: &C) -> bool {
+    // First check that the local tree on each node is complete.
+
+    let mut complete_linear = true;
+    for (key1, key2) in arr.iter().tuple_windows() {
+        // Make sure that the keys are sorted and not duplicated.
+        if key1 >= key2 {
+            complete_linear = false;
+            break;
+        }
+        // The next key should be an ancestor of the current key's next non-descendent key.
+        if let Some(expected_next) = key1.next_non_descendent_key() {
+            if !key2.is_ancestor(expected_next) {
+                complete_linear = false;
+                break;
+            }
+        } else {
+            // Only for the very last key there should not be a next non-descendent key.
+            complete_linear = false;
+        }
+    }
+
+    // We now check the interfaces.
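+    // `communicate_back` makes the first key of the next rank available here,
+    // so the boundary pair can be validated like the local pairs above.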
+
+    if let Some(next_first) = communicate_back(arr, comm) {
+        // We are on any but the last rank.
+        let last_key = arr.last().unwrap();
+
+        // Check that the keys are sorted and not duplicated.
+        if *last_key >= next_first {
+            complete_linear = false;
+        }
+
+        // Check that the first key of the next rank is an ancestor of the next non-descendent key.
+        if let Some(expected_next) = last_key.next_non_descendent_key() {
+            if !next_first.is_ancestor(expected_next) {
+                complete_linear = false;
+            }
+        } else {
+            complete_linear = false;
+        }
+    } else {
+        // We are on the last rank.
+        // Check that the last key is an ancestor of deepest last.
+        if !arr.last().unwrap().is_ancestor(MortonKey::deepest_last()) {
+            complete_linear = false;
+        }
+    }
+
+    // Now check that at the first rank we include the deepest first.
+
+    if comm.rank() == 0 && !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) {
+        complete_linear = false;
+    }
+
+    // Now communicate everything together.
+
+    let mut result = false;
+    comm.all_reduce_into(
+        &complete_linear,
+        &mut result,
+        SystemOperation::logical_and(),
+    );
+
+    result
+}
+
+/// Return the deepest level of a distributed list of Morton keys.
+pub fn deepest_level<C: CommunicatorCollectives>(keys: &[MortonKey], comm: &C) -> usize {
+    let local_deepest_level = keys.iter().map(|elem| elem.level()).max().unwrap();
+
+    if comm.size() == 1 {
+        return local_deepest_level;
+    }
+
+    let mut global_deepest_level: usize = 0;
+
+    comm.all_reduce_into(
+        &local_deepest_level,
+        &mut global_deepest_level,
+        SystemOperation::max(),
+    );
+
+    global_deepest_level
+}
+
+/// Check if the distributed tree is complete, linear and balanced.
+pub fn is_complete_linear_and_balanced<C: CommunicatorCollectives>(
+    arr: &[MortonKey],
+    comm: &C,
+) -> bool {
+    // Send the tree to the root node and check there that it is balanced.
+
+    let mut balanced = false;
+
+    if let Some(arr) = gather_to_root(arr, comm) {
+        balanced = MortonKey::is_complete_linear_and_balanced(&arr);
+    }
+
+    comm.process_at_rank(0).broadcast_into(&mut balanced);
+
+    balanced
+}
+
+/// For a complete linear tree get on each process the first key of all processes.
+///
+/// This information can be used to query on which process a key is living.
+pub fn get_tree_bins<C: CommunicatorCollectives>(
+    complete_linear_tree: &[MortonKey],
+    comm: &C,
+) -> Vec<MortonKey> {
+    gather_to_all(
+        std::slice::from_ref(complete_linear_tree.first().unwrap()),
+        comm,
+    )
+}
+
+/// For a sorted array return either the position of the key or the position directly before the search key.
+pub fn get_key_index(arr: &[MortonKey], key: MortonKey) -> usize {
+    // Performs a binary search for the key. If the key is found with Ok(..)
+    // the exact index of the found key is returned. If the key is not found
+    // the closest larger index is returned. So we subtract one to get the closest
+    // smaller index.
+
+    match arr.binary_search(&key) {
+        Ok(index) => index,
+        Err(index) => index - 1,
+    }
+}
+
+/// Check if a key is associated with the current rank.
+///
+/// Note that the key does not need to exist as a leaf. It just needs
+/// to be a descendent of a coarse key on the current rank.
+pub fn key_on_current_rank(
+    key: MortonKey,
+    coarse_tree_bounds: &[MortonKey],
+    rank: usize,
+    size: usize,
+) -> bool {
+    if rank == size - 1 {
+        key >= *coarse_tree_bounds.last().unwrap()
+    } else {
+        coarse_tree_bounds[rank] <= key && key < coarse_tree_bounds[rank + 1]
+    }
+}
+
+/// Generate all leaf and interior keys.
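+///
+/// The returned map labels each key as `LocalLeaf`, `LocalInterior`, `Global`
+/// (replicated on all ranks) or `Ghost(rank)` with its originating rank.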
+pub fn generate_all_keys<C: CommunicatorCollectives>(
+    leaf_tree: &[MortonKey],
+    coarse_tree: &[MortonKey],
+    coarse_tree_bounds: &[MortonKey],
+    comm: &C,
+) -> HashMap<MortonKey, KeyType> {
+    /// This struct combines rank and key information for sending ghosts to neighbors.
+    #[derive(Copy, Clone, Equivalence)]
+    struct KeyWithRank {
+        key: MortonKey,
+        rank: usize,
+    }
+
+    let rank = comm.rank() as usize;
+    let size = comm.size() as usize;
+
+    let mut all_keys = HashMap::<MortonKey, KeyType>::new();
+    let leaf_keys: HashSet<MortonKey> = HashSet::from_iter(leaf_tree.iter().copied());
+
+    // If size == 1 we simply create the keys locally, so we don't need to treat the global keys.
+
+    if size > 1 {
+        let mut global_keys = HashSet::<MortonKey>::new();
+
+        // First deal with the parents of the coarse tree. These are different
+        // as they may exist on multiple nodes, so receive a different label.
+
+        for &key in coarse_tree {
+            let mut parent = key.parent();
+            while parent.level() > 0 && !all_keys.contains_key(&parent) {
+                global_keys.insert(parent);
+                parent = parent.parent();
+            }
+        }
+
+        // We now send around the parents of the coarse tree to every node. These will
+        // be global keys.
+
+        let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), comm);
+
+        // We can now insert the global keys into `all_keys` with the `Global` label.
+
+        for &key in &global_keys {
+            all_keys.entry(key).or_insert(KeyType::Global);
+        }
+    }
+
+    // We now deal with the fine leafs and their ancestors.
+    // The leafs of the coarse tree will also be either part
+    // of the fine tree leafs or will be interior keys. In either
+    // case the following loop catches them.
+
+    for leaf in leaf_keys {
+        debug_assert!(!all_keys.contains_key(&leaf));
+        all_keys.insert(leaf, KeyType::LocalLeaf);
+        let mut parent = leaf.parent();
+        while parent.level() > 0 && !all_keys.contains_key(&parent) {
+            all_keys.insert(parent, KeyType::LocalInterior);
+            parent = parent.parent();
+        }
+    }
+
+    // Need to explicitly add the root at the end.
+    all_keys.entry(MortonKey::root()).or_insert(KeyType::Global);
+
+    // We only need to deal with ghosts if the size is larger than 1.
+
+    if size > 1 {
+        // This maps from rank to the keys that we want to send to the ranks.
+
+        let mut rank_send_ghost = HashMap::<usize, Vec<KeyWithRank>>::new();
+        for index in 0..size {
+            rank_send_ghost.insert(index, Vec::<KeyWithRank>::new());
+        }
+
+        let mut send_to_all = Vec::<KeyWithRank>::new();
+
+        for (&key, &status) in all_keys.iter() {
+            // We need not send around global keys to neighbors.
+            if status == KeyType::Global {
+                continue;
+            }
+            for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) {
+                // Check whether the neighbour is a global key.
+                if all_keys
+                    .get(&neighbor)
+                    .is_some_and(|&value| value == KeyType::Global)
+                {
+                    // Global keys exist on all nodes, so need to send their neighbors to all nodes.
+                    send_to_all.push(KeyWithRank { key, rank });
+                } else {
+                    // Get the rank of the neighbour.
+                    let neighbor_rank = get_key_index(coarse_tree_bounds, neighbor);
+                    rank_send_ghost
+                        .entry(neighbor_rank)
+                        .and_modify(|keys| keys.push(KeyWithRank { key, rank }));
+                }
+            }
+        }
+
+        let send_ghost_to_all = gather_to_all(&send_to_all, comm);
+        // We now know which key needs to be sent to which rank.
+        // Turn to array, get the counts and send around.
+
+        let (arr, counts) = {
+            let mut arr = Vec::<KeyWithRank>::new();
+            let mut counts = Vec::<i32>::new();
+            for index in 0..size {
+                let keys = rank_send_ghost.get(&index).unwrap();
+                arr.extend(keys.iter());
+                counts.push(keys.len() as i32);
+            }
+            (arr, counts)
+        };
+
+        // These are all the keys that are neighbors to our keys. We now go through
We now go through
+        // and store those that do not live on our tree into `all_keys` with a label
+        // of `Ghost`.
+        let mut ghost_keys = redistribute(&arr, &counts, comm);
+        // Add the neighbors of any global key.
+        ghost_keys.extend(send_ghost_to_all.iter());
+
+        for key in &ghost_keys {
+            if key.rank == rank {
+                // Don't need to add the keys that are already on the rank.
+                continue;
+            }
+            all_keys.insert(key.key, KeyType::Ghost(key.rank));
+        }
+    }
+
+    all_keys
+}
+
+#[cfg(test)]
+mod test {
+    use crate::{
+        octree::get_key_index,
+        tools::{generate_random_keys, seeded_rng},
+    };
+
+    #[test]
+    fn test_get_key_rank() {
+        let mut rng = seeded_rng(0);
+
+        let mut keys = generate_random_keys(50, &mut rng);
+
+        keys.sort_unstable();
+
+        let mid = keys[25];
+
+        assert_eq!(25, get_key_index(&keys, mid));
+
+        // Now remove the mid index and do the same again.
+
+        keys.remove(25);
+
+        // The result should be 24.
+
+        assert_eq!(24, get_key_index(&keys, mid));
+    }
+}
diff --git a/src/parsort.rs b/src/parsort.rs
index 16518cc..7191783 100644
--- a/src/parsort.rs
+++ b/src/parsort.rs
@@ -1,107 +1,28 @@
 //! Implementation of a parallel samplesort.
 
 use std::fmt::Display;
-use std::mem::offset_of;
 
 use itertools::Itertools;
-use mpi::datatype::{UncommittedDatatypeRef, UncommittedUserDatatype, UserDatatype};
-use mpi::traits::{Equivalence, Root};
-use mpi::{
-    datatype::{Partition, PartitionMut},
-    traits::CommunicatorCollectives,
-};
+use mpi::traits::CommunicatorCollectives;
+use mpi::traits::Equivalence;
 use rand::{seq::SliceRandom, Rng};
 
-const OVERSAMPLING: usize = 8;
+use crate::morton::MortonKey;
+use crate::tools::{gather_to_all, global_max, global_min, redistribute_by_bins};
 
-/// Sortable trait that each type fed into parsort needs to satisfy.
-pub trait ParallelSortable:
-    MinValue
-    + MaxValue
-    + Equivalence
-    + Copy
-    + Clone
-    + Default
-    + PartialEq
-    + Eq
-    + PartialOrd
-    + Ord
-    + Display
-    + Sized
-{
-}
-
-impl<
-        T: MinValue
-            + MaxValue
-            + Equivalence
-            + Copy
-            + Clone
-            + Default
-            + PartialEq
-            + Eq
-            + PartialOrd
-            + Ord
-            + Display
-            + Sized,
-    > ParallelSortable for T
-{
-}
+const OVERSAMPLING: usize = 8;
 
 /// An internal struct. We convert every array element
 /// into this struct. The idea is that this is guaranteed to be unique
 /// as it encodes not only the element but also its rank and index.
-#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
-struct UniqueItem {
-    pub value: T,
+#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Equivalence)]
+struct UniqueItem {
+    pub value: MortonKey,
     pub rank: usize,
     pub index: usize,
 }
 
-unsafe impl Equivalence for UniqueItem {
-    type Out = UserDatatype;
-
-    // Depending on the MPI implementation the below offset needs
-    // to be an i64 or isize. If it is an i64 Clippy warns about
-    // a useless conversion. But this warning is MPI implementation
-    // dependent. So switch off here.
-
-    #[allow(clippy::useless_conversion)]
-    fn equivalent_datatype() -> Self::Out {
-        UserDatatype::structured::(
-            &[1, 1, 1],
-            &[
-                (offset_of!(UniqueItem, value) as i64)
-                    .try_into()
-                    .unwrap(),
-                (offset_of!(UniqueItem, rank) as i64).try_into().unwrap(),
-                (offset_of!(UniqueItem, index) as i64)
-                    .try_into()
-                    .unwrap(),
-            ],
-            &[
-                UncommittedUserDatatype::contiguous(1, &::equivalent_datatype())
-                    .as_ref(),
-                usize::equivalent_datatype().into(),
-                usize::equivalent_datatype().into(),
-            ],
-        )
-    }
-}
-
-/// Return the minimum possible value of a type.
-pub trait MinValue {
-    /// Return the min value.
-    fn min_value() -> Self;
-}
-
-/// Return the maximum possible value of a type.
-pub trait MaxValue {
-    /// Return the max value.
-    fn max_value() -> Self;
-}
-
-impl Display for UniqueItem {
+impl Display for UniqueItem {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
@@ -111,34 +32,21 @@ impl Display for UniqueItem {
     }
 }
 
-impl MinValue for UniqueItem {
-    fn min_value() -> Self {
-        UniqueItem::new(::min_value(), 0, 0)
-    }
-}
-
-impl MaxValue for UniqueItem {
-    fn max_value() -> Self {
-        UniqueItem::new(::max_value(), 0, 0)
-    }
-}
-
-impl UniqueItem {
-    pub fn new(value: T, rank: usize, index: usize) -> Self {
+impl UniqueItem {
+    pub fn new(value: MortonKey, rank: usize, index: usize) -> Self {
         Self { value, rank, index }
     }
 }
 
-fn to_unique_item(arr: &[T], rank: usize) -> Vec> {
+fn to_unique_item(arr: &[MortonKey], rank: usize) -> Vec<UniqueItem> {
    arr.iter()
        .enumerate()
        .map(|(index, &item)| UniqueItem::new(item, rank, index))
        .collect()
 }
 
-fn get_buckets(arr: &[UniqueItem], comm: &C, rng: &mut R) -> Vec>
+fn get_buckets<C, R>(arr: &[UniqueItem], comm: &C, rng: &mut R) -> Vec<UniqueItem>
 where
-    T: ParallelSortable,
     C: CommunicatorCollectives,
     R: Rng + ?Sized,
 {
@@ -152,42 +60,22 @@ where
         OVERSAMPLING
     };
 
-    // We are choosing unique splitters that neither contain
-    // zero nor u64::max.
+    // We get the globally smallest and largest elements. We do not want those
+    // in the splitters, so filter out their occurrence.
+
+    let global_min_elem = global_min(arr, comm);
+    let global_max_elem = global_max(arr, comm);
+
+    // We do not want the global smallest element in the splitter.
     let splitters = arr
         .choose_multiple(rng, oversampling)
         .copied()
         .collect::<Vec<_>>();
 
-    // We use an all_gatherv so that each process receives all splitters.
-    // For that we first communicate how many splitters each process has
-    // and then we send the splitters themselves.
-
-    let nsplitters = splitters.len();
-    let mut splitters_per_rank = vec![0_usize; size];
-
-    comm.all_gather_into(&nsplitters, &mut splitters_per_rank);
-
-    // We now know how many splitters each process has. We now create space
-    // for the splitters and send them all around.
-
-    let n_all_splitters = splitters_per_rank.iter().sum();
+    // We gather the splitters into all ranks so that each rank has all splitters.
 
-    let mut all_splitters = vec![Default::default(); n_all_splitters];
-    let splitters_per_rank = splitters_per_rank.iter().map(|&x| x as i32).collect_vec();
-
-    let displs: Vec = splitters_per_rank
-        .iter()
-        .scan(0, |acc, &x| {
-            let tmp = *acc;
-            *acc += x;
-            Some(tmp)
-        })
-        .collect();
-
-    let mut partition = PartitionMut::new(&mut all_splitters[..], splitters_per_rank, &displs[..]);
-    comm.all_gather_varcount_into(&splitters, &mut partition);
+    let mut all_splitters = gather_to_all(&splitters, comm);
 
     // We now have all splitters available on each process.
     // We can now sort the splitters. Every process will then have the same list of sorted splitters.
@@ -197,97 +85,31 @@ where
 
     // We now insert the smallest and largest possible element if they are not already
     // in the splitter collection.
-    if *all_splitters.first().unwrap() != UniqueItem::min_value() {
-        all_splitters.insert(0, UniqueItem::min_value())
+    if *all_splitters.first().unwrap() != global_min_elem {
+        all_splitters.insert(0, global_min_elem)
     }
 
-    if *all_splitters.last().unwrap() != UniqueItem::max_value() {
-        all_splitters.push(UniqueItem::max_value());
+    if *all_splitters.last().unwrap() != global_max_elem {
+        all_splitters.push(global_max_elem);
     }
 
     // We now define p buckets (p is number of processors) and we return
-    // a p + 1 element array containing the first element of each bucket
-    // concluded with the largest possible element.
+    // a p element array containing the first element of each bucket.
 
     all_splitters = split(&all_splitters, size)
         .map(|slice| slice.first().unwrap())
         .copied()
         .collect::<Vec<_>>();
-    all_splitters.push(UniqueItem::max_value());
 
     all_splitters
 }
 
-fn get_counts(arr: &[UniqueItem], buckets: &[UniqueItem]) -> Vec {
-    // The following array will store the counts for each bucket.
-
-    let mut counts = vec![0_usize; buckets.len() - 1];
-
-    // We are iterating through the array. Whenever an element is larger or equal than
-    // the current splitter we store the current position in `bin_displs` and advance `splitter_iter`
-    // by 1.
-
-    // In the following iterator we skip the first bin displacement position as this must be the default
-    // zero (start of the bins).
-
-    // Note that bucket iterator has as many elements as counts as the tuple_windows has length
-    // 1 smaller than the original array length.
-    let mut bucket_iter = buckets.iter().tuple_windows::<(_, _)>();
-
-    // We skip the first element as this is always zero.
-    let mut count_iter = counts.iter_mut();
-
-    let mut count: usize = 0;
-    let mut current_count = count_iter.next().unwrap();
-
-    let (mut first, mut last) = bucket_iter.next().unwrap();
-
-    for elem in arr {
-        // The test after the or sorts out the case that our set includes the maximum possible
-        // item and we are in the last bucket. The biggest item should be counted as belonging
-        // to the bucket.
-        if (first <= elem && elem < last)
-            || (*last == UniqueItem::max_value() && *elem == UniqueItem::max_value())
-        {
-            // Element is in the right bucket.
-            count += 1;
-            continue;
-        } else {
-            // Element is not in the right bucket.
-            // Store counts and find the correct bucket.
-            *current_count = count;
-            loop {
-                (first, last) = bucket_iter.next().unwrap();
-                current_count = count_iter.next().unwrap();
-                if (first <= elem && elem < last)
-                    || (*last == UniqueItem::max_value() && *elem == UniqueItem::max_value())
-                {
-                    break;
-                }
-            }
-            // Now have the right bucket. Reset count and continue.
-            count = 1;
-        }
-    }
-
-    // Need to store the count for the last bucket in the iterator.
-    // This is always necessary as last iterator is half open interval.
-    // So we don't go into the else part of the for loop.
-
-    *current_count = count;
-
-    // We don't need to fill the remaining counts entries with zero
-    // since the array is already initialized with zero.
-
-    counts
-}
-
 /// Parallel sort
-pub fn parsort(
-    arr: &[T],
+pub fn parsort<C: CommunicatorCollectives, R: Rng + ?Sized>(
+    arr: &[MortonKey],
     comm: &C,
     rng: &mut R,
-) -> Vec {
+) -> Vec<MortonKey> {
     let size = comm.size() as usize;
     let rank = comm.rank() as usize;
 
     // If we only have a single rank simply sort the local array and return
@@ -313,50 +135,8 @@ pub fn parsort
     let buckets = get_buckets(&arr, comm, rng);
 
-    // We now compute how many elements of our array go into each bucket.
- - let counts = get_counts(&arr, &buckets) - .iter() - .map(|&elem| elem as i32) - .collect::>(); - - // We now do an all_to_allv to communicate the array elements to the right processors. - - // First we need to communicate how many elements everybody gets from each processor. - - let mut counts_from_processor = vec![0_i32; size]; - - comm.all_to_all_into(&counts, &mut counts_from_processor); - - // Each processor now knows how much he gets from all the others. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let send_partition = Partition::new(&arr, counts, &send_displs[..]); - - let mut recvbuffer = - vec![UniqueItem::default(); counts_from_processor.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_processor - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + // We now redistribute with respect to these buckets. + let mut recvbuffer = redistribute_by_bins(&arr, &buckets, comm); // We now have everything in the receive buffer. Now sort the local elements and return @@ -396,72 +176,3 @@ impl<'a, T> Iterator for Split<'a, T> { Some(chunk) } } - -/// Array to root -pub fn array_to_root( - arr: &[T], - comm: &C, -) -> Option> { - let n = arr.len() as i32; - let rank = comm.rank(); - let size = comm.size(); - let root_process = comm.process_at_rank(0); - - // We first communicate the length of the array to root. - - if rank == 0 { - // We are at root. - - let mut ranks = vec![0_i32; size as usize]; - root_process.gather_into_root(&n, &mut ranks); - - // We now have all ranks at root. Can now a varcount gather to get - // the array elements. - - let nelements = ranks.iter().sum::(); - - let mut new_arr = vec![::default(); nelements as usize]; - - let displs: Vec = ranks - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut partition = PartitionMut::new(&mut new_arr[..], ranks, &displs[..]); - - root_process.gather_varcount_into_root(arr, &mut partition); - Some(new_arr) - } else { - root_process.gather_into(&n); - root_process.gather_varcount_into(arr); - None - } -} - -macro_rules! impl_min_max_value { - ($type:ty) => { - impl MinValue for $type { - fn min_value() -> Self { - <$type>::MIN - } - } - - impl MaxValue for $type { - fn max_value() -> Self { - <$type>::MAX - } - } - }; -} - -impl_min_max_value!(usize); -impl_min_max_value!(i8); -impl_min_max_value!(i32); -impl_min_max_value!(i64); -impl_min_max_value!(u8); -impl_min_max_value!(u32); -impl_min_max_value!(u64); diff --git a/src/serial.rs b/src/serial.rs new file mode 100644 index 0000000..0249bb8 --- /dev/null +++ b/src/serial.rs @@ -0,0 +1,322 @@ +//! 
Definition of a linear octree
+
+use crate::{
+    constants::{DEEPEST_LEVEL, NLEVELS},
+    geometry::{PhysicalBox, Point},
+    morton::MortonKey,
+};
+use std::collections::HashMap;
+use vtkio;
+
+/// A neighbour
+pub struct Neighbour {
+    /// Direction
+    pub direction: [i64; 3],
+    /// Level
+    pub level: usize,
+    /// Morton key
+    pub key: MortonKey,
+}
+
+/// An octree
+pub struct Octree {
+    leaf_keys: Vec<MortonKey>,
+    points: Vec<Point>,
+    point_to_level_keys: [Vec<MortonKey>; NLEVELS],
+    bounding_box: PhysicalBox,
+    key_counts: HashMap<MortonKey, usize>,
+    max_leaf_level: usize,
+    max_points_in_leaf: usize,
+}
+
+impl Octree {
+    /// Create octree from points
+    pub fn from_points(points: &[Point], max_level: usize, max_points_per_box: usize) -> Self {
+        // Make sure that max level never exceeds DEEPEST_LEVEL.
+        let max_level = if max_level > DEEPEST_LEVEL as usize {
+            DEEPEST_LEVEL as usize
+        } else {
+            max_level
+        };
+
+        // Compute the physical bounding box.
+
+        let bounding_box = PhysicalBox::from_points(points);
+
+        let npoints = points.len();
+
+        // We create a vector of keys for each point on each level. We compute the
+        // keys on the deepest level and fill the other levels by going from
+        // parent to parent.
+
+        let mut point_to_level_keys: [Vec<MortonKey>; NLEVELS] = Default::default();
+        point_to_level_keys[DEEPEST_LEVEL as usize] = points
+            .iter()
+            .map(|&point| {
+                MortonKey::from_physical_point(point, &bounding_box, DEEPEST_LEVEL as usize)
+            })
+            .collect::<Vec<_>>();
+
+        for index in (1..=DEEPEST_LEVEL as usize).rev() {
+            let mut new_vec = Vec::<MortonKey>::with_capacity(npoints);
+            for &key in &point_to_level_keys[index] {
+                new_vec.push(key.parent());
+            }
+            point_to_level_keys[index - 1] = new_vec;
+        }
+
+        // We now have to create the leaf keys. We are starting at the root and recursing
+        // down until each box has fewer than max_points_per_box points.
+
+        // First we compute the counts of each key on each level. For that we count
+        // the keys from all levels in a single hashmap.
+
+        let mut key_counts: HashMap<MortonKey, usize> = Default::default();
+
+        for keys in &point_to_level_keys {
+            for key in keys {
+                *key_counts.entry(*key).or_default() += 1;
+            }
+        }
+
+        // We can now easily create an adaptive tree by subdividing. We do this
+        // with a recursive function.
+
+        let mut leaf_keys = Vec::<MortonKey>::new();
+
+        fn recurse_keys(
+            key: MortonKey,
+            key_counts: &HashMap<MortonKey, usize>,
+            leaf_keys: &mut Vec<MortonKey>,
+            max_points_per_box: usize,
+            max_level: usize,
+        ) {
+            let level = key.level();
+            // A key may not be associated with any points. This happens if one of the children on
+            // the previous level has no points in its physical box. However, we want to create a
+            // complete tree. So we still add this one empty child.
+            if let Some(&count) = key_counts.get(&key) {
+                if count > max_points_per_box && level < max_level {
+                    for child in key.children() {
+                        recurse_keys(child, key_counts, leaf_keys, max_points_per_box, max_level);
+                    }
+                } else {
+                    leaf_keys.push(key)
+                }
+            } else {
+                leaf_keys.push(key)
+            }
+        }
+
+        // Now execute the recursion starting from root.
+
+        recurse_keys(
+            MortonKey::root(),
+            &key_counts,
+            &mut leaf_keys,
+            max_points_per_box,
+            max_level,
+        );
+
+        // The leaf keys are now a complete linear tree. But they are not yet balanced.
+        // In the final step we balance the leaves.
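+        // Balancing subdivides leaves until neighbouring leaves differ by at
+        // most one level (the usual 2:1 condition for octrees).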
+
+        let leaf_keys = MortonKey::balance(&leaf_keys, MortonKey::root());
+
+        let mut max_leaf_level = 0;
+        let mut max_points_in_leaf = 0;
+
+        for key in &leaf_keys {
+            max_leaf_level = max_leaf_level.max(key.level());
+            max_points_in_leaf =
+                max_points_in_leaf.max(if let Some(&count) = key_counts.get(key) {
+                    count
+                } else {
+                    0
+                });
+        }
+
+        Self {
+            leaf_keys,
+            points: points.to_vec(),
+            point_to_level_keys,
+            bounding_box,
+            key_counts,
+            max_leaf_level,
+            max_points_in_leaf,
+        }
+    }
+
+    /// Leaf keys
+    pub fn leaf_keys(&self) -> &Vec<MortonKey> {
+        &self.leaf_keys
+    }
+
+    /// Points
+    pub fn points(&self) -> &Vec<Point> {
+        &self.points
+    }
+
+    /// Get level keys for each point
+    pub fn point_to_level_keys(&self) -> &[Vec<MortonKey>; NLEVELS] {
+        &self.point_to_level_keys
+    }
+
+    /// Bounding box
+    pub fn bounding_box(&self) -> &PhysicalBox {
+        &self.bounding_box
+    }
+
+    /// Maximum leaf level
+    pub fn maximum_leaf_level(&self) -> usize {
+        self.max_leaf_level
+    }
+
+    /// Maximum number of points in a leaf box
+    pub fn max_points_in_leaf_box(&self) -> usize {
+        self.max_points_in_leaf
+    }
+
+    /// Number of points in the box indexed by a key
+    pub fn number_of_points_in_key(&self, key: MortonKey) -> usize {
+        if let Some(&count) = self.key_counts.get(&key) {
+            count
+        } else {
+            0
+        }
+    }
+
+    /// Export the tree to vtk
+    pub fn export_to_vtk(&self, file_path: &str) {
+        use vtkio::model::{
+            Attributes, ByteOrder, CellType, Cells, DataSet, IOBuffer, UnstructuredGridPiece,
+            Version, VertexNumbers,
+        };
+
+        // Each box has 8 corners with 3 coordinates each, hence 24 floats per key.
+        let mut points = Vec::<f64>::new();
+        // 8 corner indices per box, hence 8 * nkeys values in connectivity.
+        let mut connectivity = Vec::<u64>::new();
+        // Store the vtk offset for each box.
+        let mut offsets = Vec::<u64>::new();
+
+        let bounding_box = self.bounding_box();
+
+        // Go through the keys and add coordinates and connectivity.
+        // Box coordinates are already in the right order, so connectivity
+        // just counts up. We don't mind that vertices shared by two boxes
+        // are counted twice.
+        let mut point_count = 0;
+        let mut key_count = 0;
+
+        for key in self.leaf_keys().iter() {
+            // We only want to export non-empty boxes.
+            if self.number_of_points_in_key(*key) == 0 {
+                continue;
+            }
+            let coords = key.physical_box(bounding_box).corners();
+
+            key_count += 1;
+            offsets.push(8 * key_count);
+
+            for coord in &coords {
+                points.push(coord[0]);
+                points.push(coord[1]);
+                points.push(coord[2]);
+
+                connectivity.push(point_count);
+                point_count += 1;
+            }
+        }
+
+        let vtk_file = vtkio::Vtk {
+            version: Version::new((1, 0)),
+            title: String::new(),
+            byte_order: ByteOrder::LittleEndian,
+            file_path: None,
+            data: DataSet::inline(UnstructuredGridPiece {
+                points: IOBuffer::F64(points),
+                cells: Cells {
+                    cell_verts: VertexNumbers::XML {
+                        connectivity,
+                        offsets,
+                    },
+                    types: vec![CellType::Hexahedron; key_count as usize],
+                },
+                data: Attributes {
+                    point: vec![],
+                    cell: vec![],
+                },
+            }),
+        };
+
+        vtk_file.export_ascii(file_path).unwrap();
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::geometry::Point;
+
+    use super::Octree;
+    use rand::prelude::*;
+
+    fn get_points_on_sphere(npoints: usize) -> Vec<Point> {
+        let mut rng = rand::rngs::StdRng::seed_from_u64(0);
+        let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
+
+        let mut points = Vec::<Point>::with_capacity(npoints);
+        for _ in 0..npoints {
+            let x: f64 = normal.sample(&mut rng);
+            let y: f64 = normal.sample(&mut rng);
+            let z: f64 = normal.sample(&mut rng);
+
+            let norm = (x * x + y * y + z * z).sqrt();
+
+            points.push(Point::new([x / norm, y / norm, z / norm], 0));
+        }
+
+        points
+    }
+
+    #[test]
+    fn test_octree() {
+        use std::time::Instant;
+
+        let npoints = 10000;
+        let points = get_points_on_sphere(npoints);
+        let max_level = 7;
+        let max_points_per_box = 100;
+
+        let start = Instant::now();
+        let octree = Octree::from_points(&points, max_level, max_points_per_box);
+        let duration = start.elapsed();
+
+        println!("Creation time: {}", duration.as_millis());
+        println!("Number of leaf keys: {}", octree.leaf_keys().len());
+        println!("Bounding box: {}", octree.bounding_box());
+    }
+
+    #[test]
+    fn test_export() {
+        let fname = "_test_sphere.vtk";
+        let npoints = 10000;
+        let points = get_points_on_sphere(npoints);
+        let max_level = 7;
+        let max_points_per_box = 100;
+
+        let octree = Octree::from_points(&points, max_level, max_points_per_box);
+
+        octree.export_to_vtk(fname);
+        println!("Maximum leaf level: {}", octree.maximum_leaf_level());
+        println!(
+            "Maximum number of points in leaf box: {}",
+            octree.max_points_in_leaf_box()
+        );
+    }
+}
diff --git a/src/tools.rs b/src/tools.rs
new file mode 100644
index 0000000..dbe8edd
--- /dev/null
+++ b/src/tools.rs
@@ -0,0 +1,438 @@
+//! Utility routines.
+
+use itertools::{izip, Itertools};
+use mpi::{
+    collective::{SystemOperation, UserOperation},
+    datatype::{Partition, PartitionMut},
+    point_to_point as p2p,
+    traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source},
+};
+use num::traits::Zero;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+
+use crate::{
+    constants::{DEEPEST_LEVEL, LEVEL_SIZE},
+    geometry::Point,
+    morton::MortonKey,
+};
+
+/// Gather array to all processes
+pub fn gather_to_all<T: Equivalence, C: CommunicatorCollectives>(arr: &[T], comm: &C) -> Vec<T> {
+    // First we need to broadcast the individual sizes on each process.
+
+    let size = comm.size();
+
+    let local_len = arr.len() as i32;
+
+    let mut sizes = vec![0; size as usize];
+
+    comm.all_gather_into(&local_len, &mut sizes);
+
+    let recv_len = sizes.iter().sum::<i32>() as usize;
+
+    // Now we have the size of each local contribution.
+    let mut recvbuffer = Vec::<T>::with_capacity(recv_len);
+    let buf: &mut [T] = unsafe { std::mem::transmute(recvbuffer.spare_capacity_mut()) };
+
+    let recv_displs: Vec<i32> = sizes
+        .iter()
+        .scan(0, |acc, &x| {
+            let tmp = *acc;
+            *acc += x;
+            Some(tmp)
+        })
+        .collect();
+
+    let mut receiv_partition = PartitionMut::new(buf, sizes, &recv_displs[..]);
+
+    comm.all_gather_varcount_into(arr, &mut receiv_partition);
+
+    unsafe { recvbuffer.set_len(recv_len) };
+
+    recvbuffer
+}
+
+/// Gather distributed array to the root rank.
+///
+/// The result is a `Vec<T>` on root and `None` on all other ranks.
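+///
+/// A usage sketch, with `local_arr` standing in for any local slice
+/// (cf. the `mpi_cumsum` example):
+///
+/// ```ignore
+/// if let Some(gathered) = gather_to_root(&local_arr, &comm) {
+///     // Only rank 0 receives `Some`; all other ranks get `None`.
+/// }
+/// ```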
+pub fn gather_to_root<T: Equivalence, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> Option<Vec<T>> {
+    let n = arr.len() as i32;
+    let rank = comm.rank();
+    let size = comm.size();
+    let root_process = comm.process_at_rank(0);
+
+    // We first communicate the length of the array to root.
+
+    if rank == 0 {
+        // We are at root.
+
+        let mut counts = vec![0_i32; size as usize];
+        root_process.gather_into_root(&n, &mut counts);
+
+        // We now have all counts at root. We can now do a varcount gather to get
+        // the array elements.
+
+        let nelements = counts.iter().sum::<i32>();
+        let mut new_arr = Vec::<T>::with_capacity(nelements as usize);
+        let new_arr_buf: &mut [T] = unsafe { std::mem::transmute(new_arr.spare_capacity_mut()) };
+
+        let displs = displacements(counts.as_slice());
+
+        let mut partition = PartitionMut::new(new_arr_buf, counts, &displs[..]);
+
+        root_process.gather_varcount_into_root(arr, &mut partition);
+
+        unsafe { new_arr.set_len(nelements as usize) };
+        Some(new_arr)
+    } else {
+        root_process.gather_into(&n);
+        root_process.gather_varcount_into(arr);
+        None
+    }
+}
+
+/// Get global size of a distributed array.
+///
+/// Computes the size and broadcasts it to all ranks.
+pub fn global_size<T, C: CommunicatorCollectives>(arr: &[T], comm: &C) -> usize {
+    let local_size = arr.len();
+    let mut global_size = 0;
+
+    comm.all_reduce_into(&local_size, &mut global_size, SystemOperation::sum());
+
+    global_size
+}
+
+/// Get the maximum value across all ranks
+pub fn global_max<T: Equivalence + Copy + Ord, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> T {
+    let local_max = arr.iter().max().unwrap();
+
+    // Just need to initialize global_max with something.
+    let mut global_max = *local_max;
+
+    comm.all_reduce_into(
+        local_max,
+        &mut global_max,
+        &UserOperation::commutative(|x, y| {
+            let x: &[T] = x.downcast().unwrap();
+            let y: &mut [T] = y.downcast().unwrap();
+            for (&x_i, y_i) in x.iter().zip(y) {
+                *y_i = x_i.max(*y_i);
+            }
+        }),
+    );
+
+    global_max
+}
+
+/// Get the minimum value across all ranks
+pub fn global_min<T: Equivalence + Copy + Ord, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> T {
+    let local_min = *arr.iter().min().unwrap();
+
+    // Just need to initialize global_min with something.
+    let mut global_min = local_min;
+
+    comm.all_reduce_into(
+        &local_min,
+        &mut global_min,
+        &UserOperation::commutative(|x, y| {
+            let x: &[T] = x.downcast().unwrap();
+            let y: &mut [T] = y.downcast().unwrap();
+            for (&x_i, y_i) in x.iter().zip(y) {
+                *y_i = x_i.min(*y_i);
+            }
+        }),
+    );
+
+    global_min
+}
+
+/// Communicate the first element of each local array back to the previous rank.
+pub fn communicate_back<T: Equivalence, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> Option<T> {
+    let rank = comm.rank();
+    let size = comm.size();
+
+    if size == 1 {
+        return None;
+    }
+
+    if rank == size - 1 {
+        comm.process_at_rank(rank - 1).send(arr.first().unwrap());
+        None
+    } else {
+        let (new_last, _status) = if rank > 0 {
+            p2p::send_receive(
+                arr.first().unwrap(),
+                &comm.process_at_rank(rank - 1),
+                &comm.process_at_rank(rank + 1),
+            )
+        } else {
+            comm.process_at_rank(1).receive::<T>()
+        };
+        Some(new_last)
+    }
+}
+
+/// Check if an array is sorted.
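+///
+/// The check is global: every rank checks its local slice, the boundary
+/// between consecutive ranks is checked via `communicate_back`, and the
+/// results are combined with a logical AND reduction. A sketch:
+///
+/// ```ignore
+/// let sorted_keys = parsort(&keys, &comm, &mut rng);
+/// assert!(is_sorted_array(&sorted_keys, &comm));
+/// ```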
+pub fn is_sorted_array<T: Equivalence + Ord, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> bool {
+    let mut sorted = true;
+    for (elem1, elem2) in arr.iter().tuple_windows() {
+        if elem1 > elem2 {
+            sorted = false;
+        }
+    }
+
+    if comm.size() == 1 {
+        return sorted;
+    }
+
+    if let Some(next_first) = communicate_back(arr, comm) {
+        // Combine the local result with the check across the rank boundary.
+        sorted = sorted && *arr.last().unwrap() <= next_first;
+    }
+
+    let mut global_sorted: bool = false;
+    comm.all_reduce_into(&sorted, &mut global_sorted, SystemOperation::logical_and());
+
+    global_sorted
+}
+
+/// Redistribute an array via an all_to_all_varcount operation.
+pub fn redistribute<T: Equivalence, C: CommunicatorCollectives>(
+    arr: &[T],
+    counts: &[i32],
+    comm: &C,
+) -> Vec<T> {
+    assert_eq!(counts.len(), comm.size() as usize);
+
+    // First send the counts around via an alltoall operation.
+
+    let mut recv_counts = vec![0; counts.len()];
+
+    comm.all_to_all_into(counts, &mut recv_counts);
+
+    // We have the recv_counts. Allocate space and set up the partitions.
+
+    let nelems = recv_counts.iter().sum::<i32>() as usize;
+
+    let mut output = Vec::<T>::with_capacity(nelems);
+    let out_buf: &mut [T] = unsafe { std::mem::transmute(output.spare_capacity_mut()) };
+
+    let send_partition = Partition::new(arr, counts, displacements(counts));
+    let mut recv_partition =
+        PartitionMut::new(out_buf, &recv_counts[..], displacements(&recv_counts));
+
+    comm.all_to_all_varcount_into(&send_partition, &mut recv_partition);
+
+    unsafe { output.set_len(nelems) };
+
+    output
+}
+
+/// Perform a global inclusive cumulative sum operation.
+///
+/// For the array `[1, 3, 5, 7]` the output will be `[1, 4, 9, 16]`.
+pub fn global_inclusive_cumsum<T: Equivalence + Zero + Copy, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> Vec<T> {
+    let mut scan: Vec<T> = arr
+        .iter()
+        .scan(<T as Zero>::zero(), |state, x| {
+            *state = *x + *state;
+            Some(*state)
+        })
+        .collect_vec();
+    let scan_last = *scan.last().unwrap();
+    let mut scan_result = T::zero();
+    comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum());
+    for elem in &mut scan {
+        *elem = *elem + scan_result;
+    }
+
+    scan
+}
+
+/// Distribute a sorted sequence into bins.
+///
+/// For an array with n elements to be distributed into p bins,
+/// the array `bins` has p elements. The bins are defined by half-open intervals
+/// of the form [b_j, b_{j+1}). The final bin is the half-open interval [b_{p-1}, \infty).
+/// It is assumed that the bins and the elements are both sorted sequences and that
+/// every element has an associated bin.
+/// The function returns a p element array with the counts of how many elements go to each bin.
+/// Since the sequence is sorted this fully defines which element goes into which bin.
+pub fn sort_to_bins<T: Ord>(sorted_keys: &[T], bins: &[T]) -> Vec<usize> {
+    let nbins = bins.len();
+
+    // Make sure that the smallest element of the sorted keys fits into the bins.
+    assert!(bins.first().unwrap() <= sorted_keys.first().unwrap());
+
+    // Deal with the special case that there is only one bin.
+    // This means that all elements are in the one bin.
+    if nbins == 1 {
+        return vec![sorted_keys.len(); 1];
+    }
+
+    let mut bin_counts = vec![0; nbins];
+
+    // This iterates over each possible bin and also returns the associated count.
+    // The last bin position is not iterated over since for an array with p elements
+    // there are p-1 tuple windows.
+    let mut bin_iter = izip!(
+        bin_counts.iter_mut(),
+        bins.iter().tuple_windows::<(&T, &T)>(),
+    );
+
+    // We take the first element of the bin iterator.
There will always be at least one since
+    // there are at least two bins (an actual one, and the last half-infinite one).
+    let mut r: &mut usize;
+    let mut bin_start: &T;
+    let mut bin_end: &T;
+    (r, (bin_start, bin_end)) = bin_iter.next().unwrap();
+
+    let mut count = 0;
+    'outer: for key in sorted_keys.iter() {
+        if bin_start <= key && key < bin_end {
+            *r += 1;
+            count += 1;
+        } else {
+            // Move the bin forward until it fits. There will always be a fitting bin.
+            loop {
+                if let Some((rn, (bsn, ben))) = bin_iter.next() {
+                    if bsn <= key && key < ben {
+                        // We have found the next fitting bin for our current element.
+                        // Can register it and go back to the outer for loop.
+                        *rn += 1;
+                        r = rn;
+                        bin_start = bsn;
+                        bin_end = ben;
+                        count += 1;
+                        break;
+                    }
+                } else {
+                    // We have no more fitting bin. So break the outer loop.
+                    break 'outer;
+                }
+            }
+        }
+    }
+
+    // We now have everything but the last bin. Just bunch the remaining elements into
+    // the last count.
+    *bin_counts.last_mut().unwrap() = sorted_keys.len() - count;
+
+    bin_counts
+}
+
+/// Redistribute locally sorted keys with respect to bins.
+///
+/// - The array `sorted_keys` is assumed to be sorted within each process. It need not be globally sorted.
+/// - If there are `r` ranks in the communicator, the size of `bins` must be `r`.
+/// - The bins are defined through half-open intervals `[bin[0], bin[1])`, .... This defines r-1 bins. The
+///   last bin is the half-open interval `[bin[r-1], \infty)`.
+/// - All array elements must be larger than or equal to `bin[0]`. This means that each element can be sorted into a bin.
+pub fn redistribute_by_bins<T: Equivalence + Ord, C: CommunicatorCollectives>(
+    sorted_keys: &[T],
+    bins: &[T],
+    comm: &C,
+) -> Vec<T> {
+    let counts = sort_to_bins(sorted_keys, bins);
+    let counts = counts.iter().map(|elem| *elem as i32).collect_vec();
+    redistribute(sorted_keys, &counts, comm)
+}
+
+/// Generate random keys for testing.
+pub fn generate_random_keys<R: Rng>(nkeys: usize, rng: &mut R) -> Vec<MortonKey> {
+    let mut result = Vec::<MortonKey>::with_capacity(nkeys);
+
+    let xindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys);
+    let yindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys);
+    let zindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys);
+
+    for (xval, yval, zval) in izip!(xindices.iter(), yindices.iter(), zindices.iter()) {
+        result.push(MortonKey::from_index_and_level(
+            [xval, yval, zval],
+            DEEPEST_LEVEL as usize,
+        ));
+    }
+
+    result
+}
+
+/// Generate random points for testing.
+pub fn generate_random_points<R: Rng, C: CommunicatorCollectives>(
+    npoints: usize,
+    rng: &mut R,
+    comm: &C,
+) -> Vec<Point> {
+    let mut points = Vec::<Point>::with_capacity(npoints);
+    let rank = comm.rank() as usize;
+
+    for index in 0..npoints {
+        points.push(Point::new(
+            [rng.gen(), rng.gen(), rng.gen()],
+            npoints * rank + index,
+        ));
+    }
+
+    points
+}
+
+/// Get a seeded rng
+pub fn seeded_rng(seed: usize) -> ChaCha8Rng {
+    ChaCha8Rng::seed_from_u64(seed as u64)
+}
+
+/// Compute displacements from a vector of counts.
+///
+/// This is useful for global MPI varcount operations. Let
+/// counts be `[3, 4, 5]`. Then the corresponding displacements are
+/// `[0, 3, 7]`. Note that the last element `5` is ignored.
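+///
+/// ```ignore
+/// assert_eq!(displacements(&[3, 4, 5]), vec![0, 3, 7]);
+/// ```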
+pub fn displacements(counts: &[i32]) -> Vec<i32> {
+    counts
+        .iter()
+        .scan(0, |acc, &x| {
+            let tmp = *acc;
+            *acc += x;
+            Some(tmp)
+        })
+        .collect()
+}
+
+#[cfg(test)]
+mod test {
+    use itertools::Itertools;
+
+    use super::sort_to_bins;
+
+    #[test]
+    fn test_sort_to_bins() {
+        let elems = (0..100).collect_vec();
+        let bins = [0, 17, 55];
+
+        let counts = sort_to_bins(&elems, &bins);
+
+        assert_eq!(counts[0], 17);
+        assert_eq!(counts[1], 38);
+        assert_eq!(counts[2], 45);
+    }
+}