From 8a13a8db309009dd9382c9c0017bcfa3a7c08729 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Wed, 11 Sep 2024 23:19:48 +0100 Subject: [PATCH 01/42] WIP: Partitioning --- examples/mpi.rs | 49 +++++ src/lib.rs | 1 + src/morton.rs | 16 +- src/parallel_octree.rs | 397 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 462 insertions(+), 1 deletion(-) create mode 100644 examples/mpi.rs create mode 100644 src/parallel_octree.rs diff --git a/examples/mpi.rs b/examples/mpi.rs new file mode 100644 index 0000000..05a88b9 --- /dev/null +++ b/examples/mpi.rs @@ -0,0 +1,49 @@ +//! Testing the hyksort component. +use bempp_octree::morton::MortonKey; +use bempp_octree::parallel_octree::partition; +use bempp_octree::parsort::{array_to_root, parsort}; +use itertools::Itertools; +use mpi::traits::Communicator; +use rand::prelude::*; + +pub fn main() { + let universe = mpi::initialize().unwrap(); + let world = universe.world(); + let rank = world.rank() as u64; + let n_per_rank = 10; + + let mut rng = rand::rngs::StdRng::seed_from_u64(0); + + let mut arr = Vec::::new(); + let mut weights = Vec::::new(); + + for index in 0..n_per_rank { + arr.push(MortonKey::from_index_and_level([0, 0, 0], 0)); + weights.push(1); + } + + // let t = n_per_rank * rank as usize; + // let mut index_sum = if rank == 0 { 0 } else { (t * (t - 1)) / 2 }; + // for index in n_per_rank * (rank as usize)..(n_per_rank * (1 + rank as usize)) { + // arr.push(MortonKey::from_index_and_level([0, 0, 0], 0)); + // weights.push(index_sum); + // index_sum += index; + // // weights.push(rng.gen_range(1..20)); + // } + + let partitioned = partition(&arr, &weights, &world); + + println!("Rank: {}, Len: {}", rank, partitioned.len()); + + let arr = array_to_root(&partitioned, &world); + + if rank == 0 { + let arr = arr.unwrap(); + + for (elem1, elem2) in arr.iter().tuple_windows() { + assert!(elem1 <= elem2); + } + println!("{} elements are sorted.", arr.len()); + println!("Finished."); + } +} diff --git a/src/lib.rs b/src/lib.rs index 3f5fbcc..abc07bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,5 +6,6 @@ pub mod constants; pub mod geometry; pub mod morton; pub mod octree; +pub mod parallel_octree; pub mod parsort; pub mod types; diff --git a/src/morton.rs b/src/morton.rs index 6701ead..0f93c66 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -6,15 +6,17 @@ use crate::constants::{ Y_LOOKUP_ENCODE, Z_LOOKUP_DECODE, Z_LOOKUP_ENCODE, }; use crate::geometry::PhysicalBox; +use crate::parsort::{MaxValue, MinValue}; use itertools::izip; use itertools::Itertools; +use mpi::traits::Equivalence; use std::collections::HashSet; /// A morton key /// /// This is a distinct type to distinguish from u64 /// numbers. -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Equivalence)] pub struct MortonKey { value: u64, } @@ -25,6 +27,18 @@ impl Default for MortonKey { } } +impl MinValue for MortonKey { + fn min_value() -> Self { + MortonKey::root() + } +} + +impl MaxValue for MortonKey { + fn max_value() -> Self { + MortonKey::deepest_last() + } +} + impl MortonKey { /// Create a new Morton key. Users should use `[MortonKey::from_index_and_level].` fn new(value: u64) -> Self { diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs new file mode 100644 index 0000000..1db18e0 --- /dev/null +++ b/src/parallel_octree.rs @@ -0,0 +1,397 @@ +//! 
Parallel Octree structure + +use std::{borrow::BorrowMut, collections::HashMap, fmt::Display}; + +use crate::{ + constants::{DEEPEST_LEVEL, NLEVELS}, + geometry::PhysicalBox, + morton::MortonKey, + parsort::{parsort, MaxValue, MinValue}, +}; + +use mpi::{ + datatype::{Partition, PartitionMut}, + point_to_point as p2p, + traits::{Root, Source}, +}; + +use itertools::{izip, Itertools}; +use mpi::{ + collective::SystemOperation, + datatype::UncommittedUserDatatype, + topology::Process, + traits::{CommunicatorCollectives, Destination, Equivalence}, +}; +use rand::Rng; + +// /// A weighted Mortonkey contains weights to enable load balancing. +// #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Equivalence)] +// pub struct WeightedMortonKey { +// /// The actual MortonKey. +// pub key: MortonKey, +// /// The weight of the key, typically the number of points in the corresponding octant. +// pub weight: usize, +// } + +// impl WeightedMortonKey { +// /// Get a new weighted Morton key +// pub fn new(key: MortonKey, weight: usize) -> Self { +// Self { key, weight } +// } +// } + +// impl MinValue for WeightedMortonKey { +// fn min_value() -> Self { +// WeightedMortonKey { +// key: MortonKey::from_index_and_level([0, 0, 0], 0), +// weight: 0, +// } +// } +// } + +// impl MaxValue for WeightedMortonKey { +// fn max_value() -> Self { +// WeightedMortonKey { +// key: MortonKey::deepest_last(), +// weight: usize::MAX, +// } +// } +// } + +// impl Default for WeightedMortonKey { +// fn default() -> Self { +// WeightedMortonKey::new(Default::default(), 0) +// } +// } + +// impl Display for WeightedMortonKey { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// write!(f, "(Key: {}, Weight: {}", self.key, self.weight) +// } +// } + +/// Compute the global bounding box across all points on all processes. +pub fn compute_global_bounding_box( + points: &[f64], + comm: &C, +) -> PhysicalBox { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + + // Now compute the minimum and maximum across each dimension. 
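Before the reduction below, the flat coordinate buffer has already been reinterpreted as point triples via `bytemuck::cast_slice` (the `bytemuck` crate is the one the patch itself uses). A minimal stand-alone illustration of that zero-copy cast, with made-up values, as a sketch rather than part of the patch:

    fn main() {
        // Six coordinates become two [x, y, z] points; the length must be a multiple of 3.
        let coords = vec![0.0_f64, 1.0, 2.0, 3.0, 4.0, 5.0];
        let points: &[[f64; 3]] = bytemuck::cast_slice(&coords);
        assert_eq!(points, &[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
    }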
+ + let mut xmin = f64::MAX; + let mut xmax = f64::MIN; + + let mut ymin = f64::MAX; + let mut ymax = f64::MIN; + + let mut zmin = f64::MAX; + let mut zmax = f64::MIN; + + for point in points { + let x = point[0]; + let y = point[1]; + let z = point[2]; + + xmin = f64::min(xmin, x); + xmax = f64::max(xmax, x); + + ymin = f64::min(ymin, y); + ymax = f64::max(ymax, y); + + zmin = f64::min(zmin, z); + zmax = f64::max(zmax, z); + } + + let mut global_xmin = 0.0; + let mut global_xmax = 0.0; + + let mut global_ymin = 0.0; + let mut global_ymax = 0.0; + + let mut global_zmin = 0.0; + let mut global_zmax = 0.0; + + comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); + comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); + + comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); + comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); + + comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); + comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); + + let xdiam = global_xmax - global_xmin; + let ydiam = global_ymax - global_ymin; + let zdiam = global_zmax - global_zmin; + + let xmean = global_xmin + 0.5 * xdiam; + let ymean = global_ymin + 0.5 * ydiam; + let zmean = global_zmin + 0.5 * zdiam; + + // We increase diameters by box size on deepest level + // and use the maximum diameter to compute a + // cubic bounding box. + + let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; + + let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); + + let max_diam = max_diam * (1.0 + deepest_box_diam); + + PhysicalBox::new([ + xmean - 0.5 * max_diam, + ymean - 0.5 * max_diam, + zmean - 0.5 * max_diam, + xmean + 0.5 * max_diam, + ymean + 0.5 * max_diam, + zmean + 0.5 * max_diam, + ]) +} + +/// Convert points to Morton keys on specified level. +pub fn points_to_morton( + points: &[f64], + max_level: usize, + comm: &C, +) -> (Vec, PhysicalBox) { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + + // Make sure that max level never exceeds DEEPEST_LEVEL + let max_level = if max_level > DEEPEST_LEVEL as usize { + DEEPEST_LEVEL as usize + } else { + max_level + }; + + // Compute the physical bounding box. + + let bounding_box = compute_global_bounding_box(points, comm); + + // Bunch the points in arrays of 3. + + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + + let keys = points + .iter() + .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) + .collect_vec(); + + // Now want to get weighted Morton keys. We use a HashMap. + + let mut value_counts = HashMap::::new(); + + for key in &keys { + *value_counts.entry(*key).or_insert(0) += 1; + } + + // let weighted_keys = value_counts + // .iter() + // .map(|(&key, &weight)| WeightedMortonKey::new(key, weight)) + // .collect_vec(); + + (keys, bounding_box) +} + +pub fn block_partition( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) { + // First we sort the array of weighted keys. + + let sorted_keys = parsort(&keys, comm, rng); + + let mut completed_region = + MortonKey::complete_region(&[*sorted_keys.first().unwrap(), *sorted_keys.last().unwrap()]); + + // Get the smallest level members of the completed region. + + let min_level = completed_region + .iter() + .map(|elem| elem.level()) + .min() + .unwrap(); + + let largest_boxes = completed_region + .iter() + .filter(|elem| elem.level() == min_level); +} + +/// Linearize a set of weighted Morton keys. 
+pub fn linearize( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + // We are first sorting the keys. Then in a linear process across all processors we + // go through the arrays and delete ancestors of nodes. + + let sorted_keys = parsort(&keys, comm, rng); + + let size = comm.size(); + let rank = comm.rank(); + + // Each process needs to send its first element to the previous process. Each process + // then goes through its own list and retains elements that are not ancestors of the + // next element. + + let mut result = Vec::::new(); + + if rank == size - 1 { + comm.process_at_rank(rank - 1) + .send(sorted_keys.first().unwrap()); + + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); + } + } + + result.push(*sorted_keys.last().unwrap()); + } else { + let (other, _status) = if rank > 0 { + p2p::send_receive( + sorted_keys.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + comm.any_process().receive::() + }; + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); + } + } + + let last = *sorted_keys.last().unwrap(); + + if !last.is_ancestor(other) { + result.push(last) + } + } + + result +} + +/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. +pub fn partition( + sorted_keys: &[MortonKey], + weights: &[usize], + comm: &C, +) -> Vec { + assert_eq!(sorted_keys.len(), weights.len()); + + let size = comm.size(); + let rank = comm.rank(); + + // First scan the weight. + + let mut scan: Vec = vec![0; sorted_keys.len()]; + comm.scan_into(weights, scan.as_mut_slice(), SystemOperation::sum()); + + let mut total_weight = if rank == size - 1 { + *scan.last().unwrap() + } else { + 0 + }; + + // Scan the weight (form cumulative sums) and broadcast the total weight (last entry on last process) + // to all other processes. + + comm.process_at_rank(size - 1) + .broadcast_into(&mut total_weight); + + let w = total_weight / (size as usize); + let k = total_weight % (size as usize); + + let mut hash_map = HashMap::>::new(); + + // Sort the elements into bins according to which process they should be sent. + + for p in 1..=size as usize { + let q = if p <= k as usize { + izip!(sorted_keys, &scan) + .filter_map(|(&key, &s)| { + if (p - 1) * (1 + w) <= s && s < p * (w + 1) { + Some(key) + } else { + None + } + }) + .collect_vec() + } else { + izip!(sorted_keys, &scan) + .filter_map(|(&key, &s)| { + if (p - 1) * w + k <= s && s < p * w + k { + Some(key) + } else { + None + } + }) + .collect_vec() + }; + hash_map.insert(p - 1, q); + } + + // Now distribute the data with an all to all v. + // We create a vector of how many elements to send to each process and + // then send the actual data. + + let mut counts = vec![0 as i32; size as usize]; + let mut counts_from_processor = vec![0 as i32; size as usize]; + let mut all_elements = Vec::::new(); + for (index, c) in counts.iter_mut().enumerate() { + let elements = hash_map.get(&index).unwrap(); + *c = elements.len() as i32; + all_elements.extend(elements.iter()) + } + + // Send around the number of elements for each process + + comm.all_to_all_into(&counts, &mut counts_from_processor); + + // We have the number of elements for each process now. Now send around + // the actual elements. 
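Besides the per-destination counts, the variable-count exchange that follows needs the offset of each destination's slice in the flat send buffer, and that offset is just the exclusive prefix sum of the counts. A small plain-Rust sketch of this computation, independent of MPI and with made-up counts:

    fn main() {
        // Number of elements destined for ranks 0..4.
        let counts: Vec<i32> = vec![3, 0, 2, 4];
        // Exclusive prefix sum: each rank's slice starts where the previous slices end.
        let displs: Vec<i32> = counts
            .iter()
            .scan(0, |acc, &x| {
                let tmp = *acc;
                *acc += x;
                Some(tmp)
            })
            .collect();
        assert_eq!(displs, vec![0, 3, 3, 5]);
    }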
+ + // We can now send around the actual elements with an alltoallv. + let send_displs: Vec = counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp as i32) + }) + .collect(); + + let send_partition = Partition::new(&all_elements, &counts[..], &send_displs[..]); + + let mut recvbuffer = + vec![MortonKey::default(); counts_from_processor.iter().sum::() as usize]; + + let recv_displs: Vec = counts_from_processor + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = + PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); + comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + + recvbuffer.sort_unstable(); + recvbuffer +} From b39e69e127128b445e9c5b7b07b78e72db785268 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 10:02:57 +0100 Subject: [PATCH 02/42] Partitioning is working. --- examples/mpi.rs | 11 ++++++++--- src/parallel_octree.rs | 28 +++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index 05a88b9..469d8d1 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -17,9 +17,14 @@ pub fn main() { let mut arr = Vec::::new(); let mut weights = Vec::::new(); - for index in 0..n_per_rank { - arr.push(MortonKey::from_index_and_level([0, 0, 0], 0)); - weights.push(1); + for index in n_per_rank * rank..n_per_rank * (rank + 1) { + arr.push(MortonKey::from_index_and_level([index as usize, 0, 0], 10)); + } + + let arr = parsort(&arr, &world, &mut rng); + + for index in 0..arr.len() { + weights.push((rank * n_per_rank) as usize + index); } // let t = n_per_rank * rank as usize; diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 1db18e0..d686394 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -296,9 +296,23 @@ pub fn partition( let rank = comm.rank(); // First scan the weight. + // We scan the local arrays, then use a global scan operation on the last element + // of each array to get the global sums and then we update the array of each rank + // with the sum from the previous ranks. 
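Simulated serially with plain numbers, the update described in the comment above looks as follows; three ranks are faked as a vector of weight vectors, and `offset` plays the role of the value each rank obtains from the exclusive scan over the local totals (a sketch only, not part of the patch):

    fn main() {
        // Three "ranks", each holding its local weights.
        let per_rank: Vec<Vec<usize>> = vec![vec![1, 2, 3], vec![4, 5], vec![6]];
        let mut offset = 0; // on rank r: the sum of all weights held by ranks 0..r
        let mut global_scans = Vec::new();
        for weights in &per_rank {
            let mut local = Vec::new();
            let mut acc = 0;
            for &w in weights {
                acc += w;
                local.push(acc + offset);
            }
            offset += acc;
            global_scans.push(local);
        }
        // Every entry is now the global cumulative weight up to and including that key.
        assert_eq!(global_scans, vec![vec![1, 3, 6], vec![10, 15], vec![21]]);
    }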
- let mut scan: Vec = vec![0; sorted_keys.len()]; - comm.scan_into(weights, scan.as_mut_slice(), SystemOperation::sum()); + let mut scan: Vec = weights + .iter() + .scan(0, |state, x| { + *state += *x; + Some(*state) + }) + .collect_vec(); + let scan_last = *scan.last().unwrap(); + let mut scan_result: usize = 0; + comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); + for elem in &mut scan { + *elem += scan_result; + } let mut total_weight = if rank == size - 1 { *scan.last().unwrap() @@ -323,7 +337,9 @@ pub fn partition( let q = if p <= k as usize { izip!(sorted_keys, &scan) .filter_map(|(&key, &s)| { - if (p - 1) * (1 + w) <= s && s < p * (w + 1) { + if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) + || (p == size as usize && (p - 1) * (1 + w) <= s) + { Some(key) } else { None @@ -333,7 +349,9 @@ pub fn partition( } else { izip!(sorted_keys, &scan) .filter_map(|(&key, &s)| { - if (p - 1) * w + k <= s && s < p * w + k { + if ((p - 1) * w + k <= s && s < p * w + k) + || (p == size as usize && (p - 1) * w + k <= s) + { Some(key) } else { None @@ -350,6 +368,7 @@ pub fn partition( let mut counts = vec![0 as i32; size as usize]; let mut counts_from_processor = vec![0 as i32; size as usize]; + let mut all_elements = Vec::::new(); for (index, c) in counts.iter_mut().enumerate() { let elements = hash_map.get(&index).unwrap(); @@ -358,7 +377,6 @@ pub fn partition( } // Send around the number of elements for each process - comm.all_to_all_into(&counts, &mut counts_from_processor); // We have the number of elements for each process now. Now send around From f9904241f4f80fdfda9b68ebb01049d3244923fd Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 11:49:57 +0100 Subject: [PATCH 03/42] Better testing of parallel code. --- examples/mpi.rs | 102 +++++++++++++++++++++++++++++------------ src/parallel_octree.rs | 18 ++++++-- 2 files changed, 88 insertions(+), 32 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index 469d8d1..d427fd5 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -1,54 +1,98 @@ //! Testing the hyksort component. use bempp_octree::morton::MortonKey; -use bempp_octree::parallel_octree::partition; +use bempp_octree::parallel_octree::{linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; use itertools::Itertools; -use mpi::traits::Communicator; +use mpi::traits::*; use rand::prelude::*; +pub fn assert_linearized(arr: &Vec, comm: &C) { + // Check that the keys are still linearized. + let arr = array_to_root(&arr, comm); + + if comm.rank() == 0 { + let arr = arr.unwrap(); + for (&elem1, &elem2) in arr.iter().tuple_windows() { + assert!(!elem1.is_ancestor(elem2)); + } + println!("{} keys are linearized.", &arr.len()); + } +} + pub fn main() { let universe = mpi::initialize().unwrap(); let world = universe.world(); let rank = world.rank() as u64; - let n_per_rank = 10; + let max_level = 6; + + // Each process gets its own rng + let mut rng = rand::rngs::StdRng::seed_from_u64(rank as u64); + + // We first create a non-uniform tree on rank 0. + + let mut keys = Vec::::new(); + + pub fn add_level( + keys: &mut Vec, + current: MortonKey, + rng: &mut R, + max_level: usize, + ) { + keys.push(current); + + if current.level() >= max_level { + return; + } - let mut rng = rand::rngs::StdRng::seed_from_u64(0); + let mut children = current.children(); - let mut arr = Vec::::new(); - let mut weights = Vec::::new(); + // This makes sure that the tree is not sorted. 
+ children.shuffle(rng); - for index in n_per_rank * rank..n_per_rank * (rank + 1) { - arr.push(MortonKey::from_index_and_level([index as usize, 0, 0], 10)); + for child in children { + if rng.gen_bool(0.9) { + add_level(keys, child, rng, max_level); + } + } } - let arr = parsort(&arr, &world, &mut rng); + add_level(&mut keys, MortonKey::root(), &mut rng, max_level); + + println!("Number of keys on rank {}: {}", rank, keys.len()); + + // We now linearize the keys. - for index in 0..arr.len() { - weights.push((rank * n_per_rank) as usize + index); + if rank == 0 { + println!("Linearizing keys."); } + let sorted_keys = linearize(&keys, &mut rng, &world); - // let t = n_per_rank * rank as usize; - // let mut index_sum = if rank == 0 { 0 } else { (t * (t - 1)) / 2 }; - // for index in n_per_rank * (rank as usize)..(n_per_rank * (1 + rank as usize)) { - // arr.push(MortonKey::from_index_and_level([0, 0, 0], 0)); - // weights.push(index_sum); - // index_sum += index; - // // weights.push(rng.gen_range(1..20)); - // } + println!( + "Number of linearized keys on rank {}: {}", + rank, + sorted_keys.len() + ); - let partitioned = partition(&arr, &weights, &world); + // Now check that the tree is properly linearized. - println!("Rank: {}, Len: {}", rank, partitioned.len()); + assert_linearized(&sorted_keys, &world); - let arr = array_to_root(&partitioned, &world); + // We now partition the keys equally across the processes. We give + // each leaf equal weights here. - if rank == 0 { - let arr = arr.unwrap(); + let weights = vec![1 as usize; sorted_keys.len()]; - for (elem1, elem2) in arr.iter().tuple_windows() { - assert!(elem1 <= elem2); - } - println!("{} elements are sorted.", arr.len()); - println!("Finished."); + if rank == 0 { + println!("Partitioning keys."); } + + let sorted_keys = partition(&sorted_keys, &weights, &world); + + println!( + "After partitioning have {} keys on rank {}", + sorted_keys.len(), + rank + ); + + assert_linearized(&sorted_keys, &world); } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index d686394..f6ab194 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -227,14 +227,20 @@ pub fn linearize( rng: &mut R, comm: &C, ) -> Vec { + let size = comm.size(); + let rank = comm.rank(); + + // If we only have one process we use the standard serial linearization. + + if size == 1 { + return MortonKey::linearize(keys); + } + // We are first sorting the keys. Then in a linear process across all processors we // go through the arrays and delete ancestors of nodes. let sorted_keys = parsort(&keys, comm, rng); - let size = comm.size(); - let rank = comm.rank(); - // Each process needs to send its first element to the previous process. Each process // then goes through its own list and retains elements that are not ancestors of the // next element. @@ -295,6 +301,12 @@ pub fn partition( let size = comm.size(); let rank = comm.rank(); + // If we only have one process we simply return. + + if size == 1 { + return sorted_keys.to_vec(); + } + // First scan the weight. 
// We scan the local arrays, then use a global scan operation on the last element // of each array to get the global sums and then we update the array of each rank From 902f08aea10a54d0a44ae8d08333d0a6358ad5f0 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 15:08:18 +0100 Subject: [PATCH 04/42] WIP: Parallel tree generation --- src/morton.rs | 15 ++------ src/octree.rs | 4 +- src/parallel_octree.rs | 86 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 86 insertions(+), 19 deletions(-) diff --git a/src/morton.rs b/src/morton.rs index 0f93c66..7babc72 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -491,6 +491,8 @@ impl MortonKey { } /// Complete a region ensuring that the given keys are part of the leafs. + /// + /// The given keys must not overlap. pub fn complete_region(keys: &[MortonKey]) -> Vec { // First make sure that the input sequence is sorted. let mut keys = keys.to_vec(); @@ -509,15 +511,8 @@ impl MortonKey { return result; } - let deepest_first = MortonKey::from_index_and_level([0, 0, 0], DEEPEST_LEVEL as usize); - let deepest_last = MortonKey::from_index_and_level( - [ - LEVEL_SIZE as usize - 1, - LEVEL_SIZE as usize - 1, - LEVEL_SIZE as usize - 1, - ], - DEEPEST_LEVEL as usize, - ); + let deepest_first = MortonKey::deepest_first(); + let deepest_last = MortonKey::deepest_last(); // If the first key is not an ancestor of the deepest possible first element in the // tree get the finest ancestor between the two and use the first child of that. @@ -975,8 +970,6 @@ mod test { let keys = children[1].fill_between_keys(children[2]); assert!(keys.is_empty()); - - // Correct result for two keys at deepest level } #[test] diff --git a/src/octree.rs b/src/octree.rs index e68297d..64cdb08 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -291,7 +291,7 @@ mod test { fn test_octree() { use std::time::Instant; - let npoints = 1000000; + let npoints = 10000; let points = get_points_on_sphere(npoints); let max_level = 7; let max_points_per_box = 100; @@ -308,7 +308,7 @@ mod test { #[test] fn test_export() { let fname = "_test_sphere.vtk"; - let npoints = 1000000; + let npoints = 10000; let points = get_points_on_sphere(npoints); let max_level = 7; let max_points_per_box = 100; diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index f6ab194..f6c29f0 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -1,12 +1,12 @@ //! Parallel Octree structure -use std::{borrow::BorrowMut, collections::HashMap, fmt::Display}; +use std::collections::HashMap; use crate::{ - constants::{DEEPEST_LEVEL, NLEVELS}, + constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, - parsort::{parsort, MaxValue, MinValue}, + parsort::parsort, }; use mpi::{ @@ -18,9 +18,7 @@ use mpi::{ use itertools::{izip, Itertools}; use mpi::{ collective::SystemOperation, - datatype::UncommittedUserDatatype, - topology::Process, - traits::{CommunicatorCollectives, Destination, Equivalence}, + traits::{CommunicatorCollectives, Destination}, }; use rand::Rng; @@ -425,3 +423,79 @@ pub fn partition( recvbuffer.sort_unstable(); recvbuffer } + +/// Given a distributed set of keys, generate a complete linear Octree. 
+pub fn complete_tree( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let mut linearized_keys = linearize(keys, rng, comm); + + let size = comm.size(); + let rank = comm.rank(); + + if size == 1 { + return MortonKey::complete_region(linearized_keys.as_slice()); + } + + // Now insert on the first and last process the first and last child of the + // finest ancestor of first/last box on deepest level + + // Send first element to previous rank and insert into local keys. + // On the first process we also need to insert the first child of the finest + // ancestor of the deepest first key and first element. Correspondingly on the last process + // we need to insert the last child of the finest ancester of the deepest last key and last element. + + if rank == size - 1 { + // On last process send first element to previous processes and insert last + // possible box from region into list. + comm.process_at_rank(rank - 1) + .send(linearized_keys.first().unwrap()); + let last_key = *linearized_keys.last().unwrap(); + let deepest_last = MortonKey::deepest_last(); + if !last_key.is_ancestor(deepest_last) { + let ancestor = deepest_last.finest_common_ancestor(last_key); + linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); + } + } else { + let (other, _status) = if rank > 0 { + // On intermediate process receive from the next process + // and send first element to previous process. + p2p::send_receive( + linearized_keys.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + // On first process insert at the beginning the first possible + // box in the region and receive the key from next process. + let first_key = *linearized_keys.first().unwrap(); + let deepest_first = MortonKey::deepest_first(); + if !first_key.is_ancestor(deepest_first) { + let ancestor = deepest_first.finest_common_ancestor(first_key); + linearized_keys.push(ancestor.children()[0]); + } + + comm.process_at_rank(1).receive::() + }; + // If we are not at the last process we need to introduce the received key + // into our list. + linearized_keys.push(other); + }; + + // Now complete the regions defined by the keys on each process. + + let mut result = Vec::::new(); + + for (&key1, &key2) in linearized_keys.iter().tuple_windows() { + result.push(key1); + result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); + } + + if rank == size - 1 { + result.push(*linearized_keys.last().unwrap()); + } + + result +} From 59e3f9a30b976f18930bb681e2958bcc78d871bf Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 16:37:03 +0100 Subject: [PATCH 05/42] WIP: Testing block partitioning --- examples/mpi.rs | 86 +++++++++++++++++++++++++----------------- src/morton.rs | 16 ++++---- src/parallel_octree.rs | 25 +++++++----- 3 files changed, 75 insertions(+), 52 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index d427fd5..d1c3c8e 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -1,6 +1,6 @@ //! Testing the hyksort component. 
use bempp_octree::morton::MortonKey; -use bempp_octree::parallel_octree::{linearize, partition}; +use bempp_octree::parallel_octree::{block_partition, linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; use itertools::Itertools; use mpi::traits::*; @@ -19,19 +19,7 @@ pub fn assert_linearized(arr: &Vec, comm: } } -pub fn main() { - let universe = mpi::initialize().unwrap(); - let world = universe.world(); - let rank = world.rank() as u64; - let max_level = 6; - - // Each process gets its own rng - let mut rng = rand::rngs::StdRng::seed_from_u64(rank as u64); - - // We first create a non-uniform tree on rank 0. - - let mut keys = Vec::::new(); - +pub fn generate_random_tree(max_level: usize, rng: &mut R) -> Vec { pub fn add_level( keys: &mut Vec, current: MortonKey, @@ -56,43 +44,71 @@ pub fn main() { } } - add_level(&mut keys, MortonKey::root(), &mut rng, max_level); + let mut keys = Vec::::new(); + add_level(&mut keys, MortonKey::root(), rng, max_level); - println!("Number of keys on rank {}: {}", rank, keys.len()); + keys +} + +pub fn test_linearize(rng: &mut R, comm: &C) { + let max_level = 6; + let keys = generate_random_tree(max_level, rng); + let rank = comm.rank(); // We now linearize the keys. if rank == 0 { println!("Linearizing keys."); } - let sorted_keys = linearize(&keys, &mut rng, &world); - - println!( - "Number of linearized keys on rank {}: {}", - rank, - sorted_keys.len() - ); + let sorted_keys = linearize(&keys, rng, comm); // Now check that the tree is properly linearized. - assert_linearized(&sorted_keys, &world); + assert_linearized(&sorted_keys, comm); + if rank == 0 { + println!("Linearization successful."); + } - // We now partition the keys equally across the processes. We give - // each leaf equal weights here. + // Now form the coarse tree +} - let weights = vec![1 as usize; sorted_keys.len()]; +pub fn test_coarse_partition(rng: &mut R, comm: &C) { + let max_level = 6; + let keys = generate_random_tree(max_level, rng); + let rank = comm.rank(); + + let arr = array_to_root(&keys, comm); if rank == 0 { - println!("Partitioning keys."); + let arr = arr.unwrap(); + println!("Fine tree has {} elements", arr.len()); } - let sorted_keys = partition(&sorted_keys, &weights, &world); + // We now linearize the keys. - println!( - "After partitioning have {} keys on rank {}", - sorted_keys.len(), - rank - ); + let keys = parsort(&keys, comm, rng); - assert_linearized(&sorted_keys, &world); + let coarse_tree = block_partition(&keys, rng, comm); + if rank == 1 { + println!("Length of coarse tree {}", coarse_tree.len()); + } + + let arr = array_to_root(&coarse_tree, comm); + + if rank == 0 { + let arr = arr.unwrap(); + assert!(MortonKey::is_complete_linear_octree(&arr)); + + println!("Coarse tree has {} keys", arr.len()); + } +} + +pub fn main() { + let universe = mpi::initialize().unwrap(); + let comm = universe.world(); + let rank = comm.rank() as u64; + // Each process gets its own rng + let mut rng = rand::rngs::StdRng::seed_from_u64(rank as u64); + test_linearize(&mut rng, &comm); + test_coarse_partition(&mut rng, &comm); } diff --git a/src/morton.rs b/src/morton.rs index 7babc72..1e1e633 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -490,10 +490,10 @@ impl MortonKey { result } - /// Complete a region ensuring that the given keys are part of the leafs. + /// Complete a tree ensuring that the given keys are part of the leafs. /// /// The given keys must not overlap. 
- pub fn complete_region(keys: &[MortonKey]) -> Vec { + pub fn complete_tree(keys: &[MortonKey]) -> Vec { // First make sure that the input sequence is sorted. let mut keys = keys.to_vec(); keys.sort_unstable(); @@ -506,9 +506,9 @@ impl MortonKey { return result; } - // If a single element is given then just return the result if it is the root of the tree. - if keys.len() == 1 && result[0] == MortonKey::from_index_and_level([0, 0, 0], 0) { - return result; + // If just the root is given return that. + if keys.len() == 1 && *keys.first().unwrap() == MortonKey::root() { + return keys.to_vec(); } let deepest_first = MortonKey::deepest_first(); @@ -1018,13 +1018,13 @@ mod test { let keys = [key1, key2, key3]; - let complete_region = MortonKey::complete_region(keys.as_slice()); + let complete_region = MortonKey::complete_tree(keys.as_slice()); sanity_checks(keys.as_slice(), complete_region.as_slice()); // For an empty slice the complete region method should just add the root of the tree. let keys = Vec::::new(); - let complete_region = MortonKey::complete_region(keys.as_slice()); + let complete_region = MortonKey::complete_tree(keys.as_slice()); assert_eq!(complete_region.len(), 1); sanity_checks(keys.as_slice(), complete_region.as_slice()); @@ -1033,7 +1033,7 @@ mod test { let keys = [MortonKey::deepest_first(), MortonKey::deepest_last()]; - let complete_region = MortonKey::complete_region(keys.as_slice()); + let complete_region = MortonKey::complete_tree(keys.as_slice()); sanity_checks(keys.as_slice(), complete_region.as_slice()); } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index f6c29f0..5a33f8c 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -194,17 +194,19 @@ pub fn points_to_morton( (keys, bounding_box) } +/// Block partition of tree pub fn block_partition( - keys: &[MortonKey], + sorted_keys: &[MortonKey], rng: &mut R, comm: &C, -) { - // First we sort the array of weighted keys. - - let sorted_keys = parsort(&keys, comm, rng); +) -> Vec { + let mut completed_region = sorted_keys + .first() + .unwrap() + .fill_between_keys(*sorted_keys.last().unwrap()); - let mut completed_region = - MortonKey::complete_region(&[*sorted_keys.first().unwrap(), *sorted_keys.last().unwrap()]); + completed_region.insert(0, *sorted_keys.first().unwrap()); + completed_region.push(*sorted_keys.last().unwrap()); // Get the smallest level members of the completed region. @@ -216,7 +218,12 @@ pub fn block_partition( let largest_boxes = completed_region .iter() - .filter(|elem| elem.level() == min_level); + .filter(|elem| elem.level() == min_level) + .copied() + .collect_vec(); + + let coarse_tree = complete_tree(&largest_boxes, rng, comm); + coarse_tree } /// Linearize a set of weighted Morton keys. @@ -436,7 +443,7 @@ pub fn complete_tree( let rank = comm.rank(); if size == 1 { - return MortonKey::complete_region(linearized_keys.as_slice()); + return MortonKey::complete_tree(linearized_keys.as_slice()); } // Now insert on the first and last process the first and last child of the From d86abf4804c40da9da0107f22db61fbb401c627d Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 23:39:37 +0100 Subject: [PATCH 06/42] WIP: Block coarsening --- examples/mpi.rs | 9 +++------ src/parallel_octree.rs | 20 ++++++++++++++++++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index d1c3c8e..e8e0490 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -1,6 +1,6 @@ //! Testing the hyksort component. 
use bempp_octree::morton::MortonKey; -use bempp_octree::parallel_octree::{block_partition, linearize, partition}; +use bempp_octree::parallel_octree::{block_partition, is_sorted_array, linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; use itertools::Itertools; use mpi::traits::*; @@ -89,17 +89,14 @@ pub fn test_coarse_partition(rng: &mut R, co let keys = parsort(&keys, comm, rng); let coarse_tree = block_partition(&keys, rng, comm); - if rank == 1 { - println!("Length of coarse tree {}", coarse_tree.len()); - } let arr = array_to_root(&coarse_tree, comm); if rank == 0 { let arr = arr.unwrap(); - assert!(MortonKey::is_complete_linear_octree(&arr)); - println!("Coarse tree has {} keys", arr.len()); + assert!(MortonKey::is_complete_linear_octree(&arr)); + println!("Coarse tree is sorted and complete."); } } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 5a33f8c..0fb778e 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -6,7 +6,7 @@ use crate::{ constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, - parsort::parsort, + parsort::{array_to_root, parsort}, }; use mpi::{ @@ -481,7 +481,7 @@ pub fn complete_tree( let deepest_first = MortonKey::deepest_first(); if !first_key.is_ancestor(deepest_first) { let ancestor = deepest_first.finest_common_ancestor(first_key); - linearized_keys.push(ancestor.children()[0]); + linearized_keys.insert(0, ancestor.children()[0]); } comm.process_at_rank(1).receive::() @@ -506,3 +506,19 @@ pub fn complete_tree( result } + +/// Check if an array is sorted. +pub fn is_sorted_array(arr: &[MortonKey], comm: &C) -> Option { + let arr = array_to_root(arr, comm); + if comm.rank() == 0 { + let arr = arr.unwrap(); + for (&elem1, &elem2) in arr.iter().tuple_windows() { + if elem1 > elem2 { + return Some(false); + } + } + Some(true) + } else { + None + } +} From 695239fbf8a27cb9b31bed877531bcaa63e3abd3 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 13 Sep 2024 18:35:52 +0100 Subject: [PATCH 07/42] Testing the coarse partitioning. --- src/parallel_octree.rs | 131 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 1 deletion(-) diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 0fb778e..388abef 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -12,7 +12,7 @@ use crate::{ use mpi::{ datatype::{Partition, PartitionMut}, point_to_point as p2p, - traits::{Root, Source}, + traits::{Equivalence, Root, Source}, }; use itertools::{izip, Itertools}; @@ -200,6 +200,8 @@ pub fn block_partition( rng: &mut R, comm: &C, ) -> Vec { + let rank = comm.rank(); + let mut completed_region = sorted_keys .first() .unwrap() @@ -216,6 +218,9 @@ pub fn block_partition( .min() .unwrap(); + // Each process selects its largest boxes. These are used to create + // a coarse tree. + let largest_boxes = completed_region .iter() .filter(|elem| elem.level() == min_level) @@ -223,6 +228,82 @@ pub fn block_partition( .collect_vec(); let coarse_tree = complete_tree(&largest_boxes, rng, comm); + + // We want to partition the coarse tree. But we need the correct weights. The idea + // is that we use the number of original leafs that intersect with the coarse tree + // as leafs. In order to compute this we send the coarse tree around to all processes + // so that each process computes for each coarse tree element how many of its keys + // intersect with each node of the coarse tree. 
We then sum up the local weight for each + // coarse tree node across all nodes to get the weight. + + let global_coarse_tree = gather_to_all(&coarse_tree, comm); + + // We also want to send around a corresponding array of ranks so that for each global coarse tree key + // we have the rank of where it originates from. + + let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); + + // We now compute the local weights. + let mut local_weights = vec![0 as usize; global_coarse_tree.len()]; + + // In the following loop we want to be a bit smart. We do not iterate through all the local elements. + // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region + // of our sorted keys that overlaps with the coarse tree region. + + // Let's find the start of our region. + + let first_key = *sorted_keys.first().unwrap(); + + let first_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) + .count(); + + // Now we need to find the end index of our region. + let last_key = *sorted_keys.last().unwrap(); + + let last_coarse_index = first_coarse_index + + global_coarse_tree + .iter() + .skip(first_coarse_index) + .take_while(|coarse_key| coarse_key.is_ancestor(last_key)) + .count(); + + // We now only need to iterate through between the first and last coarse index in the coarse tree. + + for (w, &global_coarse_key) in izip!( + local_weights[first_coarse_index..last_coarse_index].iter_mut(), + global_coarse_tree[first_coarse_index..last_coarse_index].iter() + ) { + *w += sorted_keys + .iter() + .filter(|&&key| global_coarse_key.is_ancestor(key)) + .count(); + } + + // We now need to sum up the weights across all processes. + + let mut weights = vec![0 as usize; global_coarse_tree.len()]; + + comm.all_reduce_into(&local_weights, &mut weights, SystemOperation::sum()); + + // Each process now has all weights. However, we only need the ones for the current process. + // So we just filter the rest out. + + let weights = izip!(coarse_tree_ranks, weights) + .filter_map(|(r, weight)| { + if r == rank as usize { + Some(weight) + } else { + None + } + }) + .collect_vec(); + + // We have now all the information we need to repartition the coarse tree (finally...). Let's just do it. + + let coarse_tree = partition(&coarse_tree, &weights, comm); + coarse_tree } @@ -522,3 +603,51 @@ pub fn is_sorted_array(arr: &[MortonKey], comm: &C) None } } + +/// Get global size of a distributed array. +pub fn global_size(arr: &[T], comm: &C) -> usize { + let local_size = arr.len(); + let mut global_size = 0; + + comm.all_reduce_into(&local_size, &mut global_size, SystemOperation::sum()); + + global_size +} + +/// Gather array to all processes +pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { + // First we need to broadcast the individual sizes on each process. + + let size = comm.size(); + + let local_len = arr.len(); + + let mut sizes = vec![0 as i32; size as usize]; + + comm.all_to_all_into(&local_len, &mut sizes); + + let recv_len = sizes.iter().sum::() as usize; + + // Now we have the size of each local contribution. 
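The receive buffer a few lines below is allocated without zero-initialisation: the `Vec`'s spare capacity is handed to MPI as the receive area, and the length is only published once every slot has been written. The same pattern in isolation, with `MaybeUninit::write` in a plain loop standing in for the MPI receive (a sketch, not part of the patch):

    fn main() {
        let n = 4;
        let mut buf: Vec<u64> = Vec::with_capacity(n);
        // `spare_capacity_mut` exposes the uninitialised tail as `&mut [MaybeUninit<u64>]`.
        for (slot, value) in buf.spare_capacity_mut().iter_mut().zip(0..n as u64) {
            slot.write(value); // here the MPI call would fill the buffer instead
        }
        // Only after every element has been written is it sound to set the length.
        unsafe { buf.set_len(n) };
        assert_eq!(buf, vec![0, 1, 2, 3]);
    }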
+ // let mut recvbuffer = + // vec![T: Default; counts_from_processor.iter().sum::() as usize]; + let mut recvbuffer = Vec::::with_capacity(recv_len); + let buf: &mut [T] = unsafe { std::mem::transmute(recvbuffer.spare_capacity_mut()) }; + + let recv_displs: Vec = sizes + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = PartitionMut::new(buf, sizes, &recv_displs[..]); + + comm.all_gather_varcount_into(arr, &mut receiv_partition); + + unsafe { recvbuffer.set_len(recv_len) }; + + recvbuffer +} From 0cb3c776f59f85d0decb9f63bd4cf6057ecfa2bd Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 13 Sep 2024 18:46:41 +0100 Subject: [PATCH 08/42] Fix in coarse partitioning --- src/parallel_octree.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 388abef..fd01bac 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -194,7 +194,10 @@ pub fn points_to_morton( (keys, bounding_box) } -/// Block partition of tree +/// Block partition of tree. +/// +/// A necessary condition for the block partitioning is that +// all sorted keys are on the same level. pub fn block_partition( sorted_keys: &[MortonKey], rng: &mut R, @@ -250,7 +253,8 @@ pub fn block_partition( // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region // of our sorted keys that overlaps with the coarse tree region. - // Let's find the start of our region. + // Let's find the start of our region. The start of our region is a coarse key that is an ancestor + // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. let first_key = *sorted_keys.first().unwrap(); @@ -259,21 +263,22 @@ pub fn block_partition( .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) .count(); - // Now we need to find the end index of our region. + // Now we need to find the end index of our region. For this again we find the index of our coarse tree that + // is an ancestor of our last key. let last_key = *sorted_keys.last().unwrap(); let last_coarse_index = first_coarse_index + global_coarse_tree .iter() - .skip(first_coarse_index) - .take_while(|coarse_key| coarse_key.is_ancestor(last_key)) + .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) .count(); // We now only need to iterate through between the first and last coarse index in the coarse tree. + // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). for (w, &global_coarse_key) in izip!( - local_weights[first_coarse_index..last_coarse_index].iter_mut(), - global_coarse_tree[first_coarse_index..last_coarse_index].iter() + local_weights[first_coarse_index..=last_coarse_index].iter_mut(), + global_coarse_tree[first_coarse_index..=last_coarse_index].iter() ) { *w += sorted_keys .iter() From 6532b499a75fd2744f026b6f139d9c2fc5f7961c Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 13 Sep 2024 19:12:35 +0100 Subject: [PATCH 09/42] Testing block partitioning --- examples/mpi.rs | 8 +++++++- src/parallel_octree.rs | 13 ++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index e8e0490..cdfc38b 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -86,10 +86,16 @@ pub fn test_coarse_partition(rng: &mut R, co // We now linearize the keys. 
- let keys = parsort(&keys, comm, rng); + let keys = linearize(&keys, rng, comm); let coarse_tree = block_partition(&keys, rng, comm); + println!( + "Coarse tree on rank {} has {} keys.", + rank, + coarse_tree.len() + ); + let arr = array_to_root(&coarse_tree, comm); if rank == 0 { diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index fd01bac..5aee2d9 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -267,11 +267,10 @@ pub fn block_partition( // is an ancestor of our last key. let last_key = *sorted_keys.last().unwrap(); - let last_coarse_index = first_coarse_index - + global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) - .count(); + let last_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) + .count(); // We now only need to iterate through between the first and last coarse index in the coarse tree. // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). @@ -625,11 +624,11 @@ pub fn gather_to_all(arr: &[T], comm let size = comm.size(); - let local_len = arr.len(); + let local_len = arr.len() as i32; let mut sizes = vec![0 as i32; size as usize]; - comm.all_to_all_into(&local_len, &mut sizes); + comm.all_gather_into(&local_len, &mut sizes); let recv_len = sizes.iter().sum::() as usize; From ce53f8c7d0e9b1304f6909f111b2b85b7e19f751 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 14 Sep 2024 17:24:24 +0100 Subject: [PATCH 10/42] WIP: Block partitioning --- Cargo.toml | 2 +- examples/mpi.rs | 33 +++++++++++------ src/morton.rs | 7 ++++ src/parallel_octree.rs | 82 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 110 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e089311..a44ac3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ crate-type = ["cdylib", "lib"] [dependencies] itertools = "0.13.*" -rand = "0.8.5" +rand = { version = "0.8.5", features = ["alloc"] } bytemuck = "1.*" vtkio = "0.6.*" mpi = {version = "0.8.*", features = ["derive", "user-operations"] } diff --git a/examples/mpi.rs b/examples/mpi.rs index cdfc38b..5b46df2 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -1,8 +1,9 @@ //! Testing the hyksort component. 
+use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; use bempp_octree::morton::MortonKey; use bempp_octree::parallel_octree::{block_partition, is_sorted_array, linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; -use itertools::Itertools; +use itertools::{izip, Itertools}; use mpi::traits::*; use rand::prelude::*; @@ -19,6 +20,23 @@ pub fn assert_linearized(arr: &Vec, comm: } } +pub fn generate_random_keys(nkeys: usize, rng: &mut R) -> Vec { + let mut result = Vec::::with_capacity(nkeys); + + let xindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + let yindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + let zindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + + for (xval, yval, zval) in izip!(xindices.iter(), yindices.iter(), zindices.iter()) { + result.push(MortonKey::from_index_and_level( + [xval, yval, zval], + DEEPEST_LEVEL as usize, + )); + } + + result +} + pub fn generate_random_tree(max_level: usize, rng: &mut R) -> Vec { pub fn add_level( keys: &mut Vec, @@ -73,20 +91,13 @@ pub fn test_linearize(rng: &mut R, comm: &C) } pub fn test_coarse_partition(rng: &mut R, comm: &C) { - let max_level = 6; - let keys = generate_random_tree(max_level, rng); + let keys = generate_random_keys(10000, rng); let rank = comm.rank(); - let arr = array_to_root(&keys, comm); - - if rank == 0 { - let arr = arr.unwrap(); - println!("Fine tree has {} elements", arr.len()); - } - // We now linearize the keys. let keys = linearize(&keys, rng, comm); + println!("There are {} keys on rank {}", keys.len(), rank); let coarse_tree = block_partition(&keys, rng, comm); @@ -102,7 +113,7 @@ pub fn test_coarse_partition(rng: &mut R, co let arr = arr.unwrap(); println!("Coarse tree has {} keys", arr.len()); assert!(MortonKey::is_complete_linear_octree(&arr)); - println!("Coarse tree is sorted and complete."); + println!("Coarse tree is sorted, linear and complete."); } } diff --git a/src/morton.rs b/src/morton.rs index 1e1e633..9601e4d 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -48,6 +48,13 @@ impl MortonKey { key } + /// A key that is not valid or well formed but guaranteed to be larger than any valid key. + /// + /// This is useful when a guaranteed upper bound is needed. + pub fn upper_bound() -> Self { + Self { value: u64::MAX } + } + /// Check if a key is invalid. pub fn invalid_key() -> Self { Self { value: 1 << 63 } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 5aee2d9..6095cd1 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -304,11 +304,89 @@ pub fn block_partition( }) .collect_vec(); - // We have now all the information we need to repartition the coarse tree (finally...). Let's just do it. - let coarse_tree = partition(&coarse_tree, &weights, comm); coarse_tree + + // We now need to redistribute the global tree according to the coarse tree. +} + +pub fn redistribute_with_respect_to_coarse_tree( + sorted_keys: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> Vec { + let rank = comm.rank(); + let size = comm.size(); + + if size == 1 { + return sorted_keys.to_vec(); + } + + // We want to globally redistribute keys so that the keys on each process are descendents + // of the local coarse tree keys. + + // We are using here the fact that the coarse tree is complete and sorted. + // We are sending around to each process the first local index. This + // defines bins in which we sort our keys. 
The keys are then sent around to the correct + // processes via an alltoallv operation. + + let my_first = *coarse_tree.first().unwrap(); + + let mut global_bins = Vec::::with_capacity(size as usize); + let global_bins_buff: &mut [MortonKey] = + unsafe { std::mem::transmute(global_bins.spare_capacity_mut()) }; + + comm.all_gather_into(&my_first, global_bins_buff); + + unsafe { global_bins.set_len(size as usize) }; + + // // We now have the first index from each process. We also want the last index from the last + // // process everywhere to make sorting into bins easier. + + // let mut last_coarse_key = MortonKey::default(); + + // if rank == size - 1 { + // last_coarse_key = *coarse_tree.last().unwrap(); + // } + + // comm.process_at_rank(size - 1) + // .broadcast_into(&mut last_coarse_key); + + global_bins.push(MortonKey::upper_bound()); + let mut ranks = vec![0 as usize; size as usize]; + + // We now have our bins. We go through our keys and assign to each key the + // rank it should be sent to. For this we are using the fact that both our + // keys and the coarse tree are sorted. + + // This iterates over each possible bin and returns also the associated rank. + let mut bin_iter = global_bins + .iter() + .tuple_windows::<(&MortonKey, &MortonKey)>() + .enumerate(); + + // We take the first element of the bin iterator. There will always be at least one. + let (mut rank, (mut bin_start, mut bin_end)) = bin_iter.next().unwrap(); + + for (&key, r) in izip!(sorted_keys.iter(), ranks.iter_mut()) { + if *bin_start <= key && key < *bin_end { + *r = rank + } else { + // Move the bin forward until it fits. There will always be a fitting bin. + while let Some((rn, (bsn, ben))) = bin_iter.next() { + if *bsn <= key && key < *ben { + *r = rn; + rank = rn; + bin_start = bsn; + bin_end = ben; + } else { + continue; + } + } + } + } + sorted_keys.to_vec() } /// Linearize a set of weighted Morton keys. From 6d4c14e975783f14711ba44e3dab12f88c391e79 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 14 Sep 2024 17:24:55 +0100 Subject: [PATCH 11/42] WIP: Block partitioning --- src/parallel_octree.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 6095cd1..8824489 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -386,6 +386,10 @@ pub fn redistribute_with_respect_to_coarse_tree( } } } + + // We have now the necessary rank for each key element. + // We do a stable sort of the sorted_keys to sort them by rank + sorted_keys.to_vec() } From 7b684ff71fe36ef2fc1192f5d85b9223e700bd6d Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 15 Sep 2024 18:53:35 +0100 Subject: [PATCH 12/42] Block partitioning works. --- examples/mpi.rs | 44 +++++++++++++++++------ src/parallel_octree.rs | 82 +++++++++++++++++++++++++++++++----------- 2 files changed, 96 insertions(+), 30 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index 5b46df2..5c2f779 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -91,29 +91,53 @@ pub fn test_linearize(rng: &mut R, comm: &C) } pub fn test_coarse_partition(rng: &mut R, comm: &C) { - let keys = generate_random_keys(10000, rng); let rank = comm.rank(); + let keys = if rank == 0 { + generate_random_keys(50, rng) + } else { + generate_random_keys(1000, rng) + }; // We now linearize the keys. 
- let keys = linearize(&keys, rng, comm); - println!("There are {} keys on rank {}", keys.len(), rank); + let mut keys = linearize(&keys, rng, comm); - let coarse_tree = block_partition(&keys, rng, comm); + // We move most keys over from rank 0 to rank 2 to check how the partitioning works. + + let nsend = 400; + // Send the last 200 keys from rank 0 to rank 1. + + if rank == 0 { + let send_keys = &keys[keys.len() - nsend..keys.len()]; + comm.process_at_rank(1).send(send_keys); + keys = keys[0..keys.len() - nsend].to_vec(); + } + + if rank == 1 { + let mut recv_keys = vec![MortonKey::default(); nsend]; + comm.process_at_rank(0).receive_into(&mut recv_keys); + recv_keys.extend(keys.iter()); + keys = recv_keys; + } + + println!("Rank {} has {} keys. ", rank, keys.len()); + + let partitioned_tree = block_partition(&keys, rng, comm); println!( - "Coarse tree on rank {} has {} keys.", + "Partitioned tree on rank {} has {} keys.", rank, - coarse_tree.len() + partitioned_tree.len() ); - let arr = array_to_root(&coarse_tree, comm); + let arr = array_to_root(&partitioned_tree, comm); if rank == 0 { let arr = arr.unwrap(); - println!("Coarse tree has {} keys", arr.len()); - assert!(MortonKey::is_complete_linear_octree(&arr)); - println!("Coarse tree is sorted, linear and complete."); + for (elem1, elem2) in arr.iter().tuple_windows() { + assert!(*elem1 <= *elem2); + } + println!("Keys are sorted."); } } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 8824489..cb72ea4 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -204,6 +204,10 @@ pub fn block_partition( comm: &C, ) -> Vec { let rank = comm.rank(); + if comm.size() == 1 { + // On a single node block partitioning should not do anything. + return sorted_keys.to_vec(); + } let mut completed_region = sorted_keys .first() @@ -306,18 +310,19 @@ pub fn block_partition( let coarse_tree = partition(&coarse_tree, &weights, comm); - coarse_tree + redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm) // We now need to redistribute the global tree according to the coarse tree. } +/// Redistribute sorted keys with respect to a linear coarse tree. pub fn redistribute_with_respect_to_coarse_tree( sorted_keys: &[MortonKey], coarse_tree: &[MortonKey], comm: &C, ) -> Vec { - let rank = comm.rank(); let size = comm.size(); + let rank = comm.rank(); if size == 1 { return sorted_keys.to_vec(); @@ -354,43 +359,80 @@ pub fn redistribute_with_respect_to_coarse_tree( // .broadcast_into(&mut last_coarse_key); global_bins.push(MortonKey::upper_bound()); - let mut ranks = vec![0 as usize; size as usize]; - // We now have our bins. We go through our keys and assign to each key the - // rank it should be sent to. For this we are using the fact that both our - // keys and the coarse tree are sorted. + // We now have our bins. We go through our keys and store how + // many keys are assigned to each rank. We are using here that + // our keys and the coarse tree are both sorted. + + // This will store for each rank how many keys will be assigned to it. + let mut rank_counts = vec![0 as i32; size as usize]; // This iterates over each possible bin and returns also the associated rank. - let mut bin_iter = global_bins - .iter() - .tuple_windows::<(&MortonKey, &MortonKey)>() - .enumerate(); + let mut bin_iter = izip!( + rank_counts.iter_mut(), + global_bins + .iter() + .tuple_windows::<(&MortonKey, &MortonKey)>(), + ); // We take the first element of the bin iterator. There will always be at least one. 
- let (mut rank, (mut bin_start, mut bin_end)) = bin_iter.next().unwrap(); + let mut r: &mut i32; + let mut bin_start: &MortonKey; + let mut bin_end: &MortonKey; + (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - for (&key, r) in izip!(sorted_keys.iter(), ranks.iter_mut()) { + for &key in sorted_keys.iter() { if *bin_start <= key && key < *bin_end { - *r = rank + *r += 1; } else { // Move the bin forward until it fits. There will always be a fitting bin. while let Some((rn, (bsn, ben))) = bin_iter.next() { if *bsn <= key && key < *ben { - *r = rn; - rank = rn; + *rn += 1; + r = rn; bin_start = bsn; bin_end = ben; - } else { - continue; + break; } } } } - // We have now the necessary rank for each key element. - // We do a stable sort of the sorted_keys to sort them by rank + // We now have the counts for each rank. Let's send it around via alltoallv. + + let mut counts_from_proc = vec![0 as i32; size as usize]; + + comm.all_to_all_into(&rank_counts, &mut counts_from_proc); + // Now compute the send and receive displacements. + + // We can now send around the actual elements with an alltoallv. + let send_displs: Vec = rank_counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp as i32) + }) + .collect(); + + let send_partition = Partition::new(&sorted_keys[..], &rank_counts[..], &send_displs[..]); + + let mut recvbuffer = vec![MortonKey::default(); counts_from_proc.iter().sum::() as usize]; + + let recv_displs: Vec = counts_from_proc + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = + PartitionMut::new(&mut recvbuffer[..], counts_from_proc, &recv_displs[..]); + comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); - sorted_keys.to_vec() + recvbuffer } /// Linearize a set of weighted Morton keys. From 2946e6b18b31119304209caeef730ccf1a56baa4 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Mon, 16 Sep 2024 15:29:13 +0100 Subject: [PATCH 13/42] Implementing finest out subblocks --- examples/mpi.rs | 4 +- src/morton.rs | 8 +-- src/parallel_octree.rs | 149 ++++++++++++++++++++++++++++------------- 3 files changed, 105 insertions(+), 56 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index 5c2f779..1837b8c 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -127,10 +127,10 @@ pub fn test_coarse_partition(rng: &mut R, co println!( "Partitioned tree on rank {} has {} keys.", rank, - partitioned_tree.len() + partitioned_tree.0.len() ); - let arr = array_to_root(&partitioned_tree, comm); + let arr = array_to_root(&partitioned_tree.0, comm); if rank == 0 { let arr = arr.unwrap(); diff --git a/src/morton.rs b/src/morton.rs index 9601e4d..fccf871 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -662,13 +662,7 @@ impl MortonKey { ); } } - new_work_list.extend_from_slice( - keys.iter() - .copied() - .filter(|&key| key.level() == level - 1) - .collect_vec() - .as_slice(), - ); + new_work_list.extend(keys.iter().copied().filter(|&key| key.level() == level - 1)); work_list = new_work_list; // Now extend the work list with the diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index cb72ea4..e77cdf1 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -196,17 +196,19 @@ pub fn points_to_morton( /// Block partition of tree. /// +/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned +/// keys and the associated coarse keys. /// A necessary condition for the block partitioning is that // all sorted keys are on the same level. 
pub fn block_partition( sorted_keys: &[MortonKey], rng: &mut R, comm: &C, -) -> Vec { +) -> (Vec, Vec) { let rank = comm.rank(); if comm.size() == 1 { // On a single node block partitioning should not do anything. - return sorted_keys.to_vec(); + return (sorted_keys.to_vec(), vec![MortonKey::root()]); } let mut completed_region = sorted_keys @@ -310,7 +312,10 @@ pub fn block_partition( let coarse_tree = partition(&coarse_tree, &weights, comm); - redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm) + ( + redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm), + coarse_tree, + ) // We now need to redistribute the global tree according to the coarse tree. } @@ -322,7 +327,6 @@ pub fn redistribute_with_respect_to_coarse_tree( comm: &C, ) -> Vec { let size = comm.size(); - let rank = comm.rank(); if size == 1 { return sorted_keys.to_vec(); @@ -346,18 +350,9 @@ pub fn redistribute_with_respect_to_coarse_tree( unsafe { global_bins.set_len(size as usize) }; - // // We now have the first index from each process. We also want the last index from the last - // // process everywhere to make sorting into bins easier. - - // let mut last_coarse_key = MortonKey::default(); - - // if rank == size - 1 { - // last_coarse_key = *coarse_tree.last().unwrap(); - // } - - // comm.process_at_rank(size - 1) - // .broadcast_into(&mut last_coarse_key); - + // We now have the first index from each process. We also want + // an upper bound for the last index of the tree to make the sorting into + // bins easier. global_bins.push(MortonKey::upper_bound()); // We now have our bins. We go through our keys and store how @@ -365,38 +360,11 @@ pub fn redistribute_with_respect_to_coarse_tree( // our keys and the coarse tree are both sorted. // This will store for each rank how many keys will be assigned to it. - let mut rank_counts = vec![0 as i32; size as usize]; - // This iterates over each possible bin and returns also the associated rank. - let mut bin_iter = izip!( - rank_counts.iter_mut(), - global_bins - .iter() - .tuple_windows::<(&MortonKey, &MortonKey)>(), - ); - - // We take the first element of the bin iterator. There will always be at least one. - let mut r: &mut i32; - let mut bin_start: &MortonKey; - let mut bin_end: &MortonKey; - (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - - for &key in sorted_keys.iter() { - if *bin_start <= key && key < *bin_end { - *r += 1; - } else { - // Move the bin forward until it fits. There will always be a fitting bin. - while let Some((rn, (bsn, ben))) = bin_iter.next() { - if *bsn <= key && key < *ben { - *rn += 1; - r = rn; - bin_start = bsn; - bin_end = ben; - break; - } - } - } - } + let rank_counts = sort_to_bins(sorted_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); // We now have the counts for each rank. Let's send it around via alltoallv. @@ -435,6 +403,93 @@ pub fn redistribute_with_respect_to_coarse_tree( recvbuffer } +/// Create bins from sorted keys. +pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { + let mut bin_counts = vec![0 as usize; bins.len() - 1]; + + // This iterates over each possible bin and returns also the associated rank. + let mut bin_iter = izip!( + bin_counts.iter_mut(), + bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), + ); + + // We take the first element of the bin iterator. There will always be at least one. 
+ let mut r: &mut usize; + let mut bin_start: &MortonKey; + let mut bin_end: &MortonKey; + (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); + + for &key in sorted_keys.iter() { + if *bin_start <= key && key < *bin_end { + *r += 1; + } else { + // Move the bin forward until it fits. There will always be a fitting bin. + while let Some((rn, (bsn, ben))) = bin_iter.next() { + if *bsn <= key && key < *ben { + *rn += 1; + r = rn; + bin_start = bsn; + bin_end = ben; + break; + } + } + } + } + + bin_counts +} + +/// Return a complete tree generated from local keys and associated coarse keys. +/// +/// The coarse keys are refined until the maximum level is reached or until each coarse key +/// is the ancestor of at most `max_keys` fine keys. +pub fn create_local_tree( + sorted_fine_keys: &[MortonKey], + coarse_keys: &[MortonKey], + mut max_level: usize, + max_keys: usize, +) -> Vec { + if max_level > DEEPEST_LEVEL as usize { + max_level = DEEPEST_LEVEL as usize; + } + + // We split the sorted fine keys into subslices so that each subslice + // is associated with a coarse slice. For this we need to add an upper bound + // coarse keys to ensure that we have suitable bins. + + let mut bins = coarse_keys.to_vec(); + bins.push(MortonKey::upper_bound()); + + let counts = sort_to_bins(&sorted_fine_keys, &bins); + + // We now know how many fine keys are associated with each coarse block. We iterate + // through and locally refine for each block that requires it. + + let mut remainder = sorted_fine_keys; + let mut new_coarse_keys = Vec::::new(); + + for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { + let current; + (current, remainder) = remainder.split_at(count); + if coarse_key.level() < max_level && current.len() > max_keys { + // We need to refine the current split. + new_coarse_keys.extend_from_slice( + create_local_tree( + current, + coarse_key.children().as_slice(), + max_level, + max_keys, + ) + .as_slice(), + ); + } else { + new_coarse_keys.push(coarse_key) + } + } + + coarse_keys.to_vec() +} + /// Linearize a set of weighted Morton keys. pub fn linearize( keys: &[MortonKey], From 805666be26b70f0c3d2e5d20a35bc12230c5765c Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Mon, 16 Sep 2024 22:50:07 +0100 Subject: [PATCH 14/42] Added outer key generation --- src/morton.rs | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/src/morton.rs b/src/morton.rs index fccf871..8e66bfb 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -430,6 +430,42 @@ impl MortonKey { result } + /// Return the index of the key as a child of the parent, i.e. 0, 1, ..., 7. + #[inline(always)] + pub fn child_index(&self) -> usize { + if *self == MortonKey::root() { + return 0; + } + let level = self.level() as u64; + + let shift = LEVEL_DISPLACEMENT + 3 * (DEEPEST_LEVEL - level); + + ((self.value >> shift) % 8) as usize + } + + /// Return the finest descendent that is opposite to the joint corner with the siblings. + pub fn finest_outer_descendent(&self) -> MortonKey { + // First find out which child the current key is. 
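As a quick usage sketch of the `child_index` method added above (the concrete key is arbitrary; this mirrors the unit test added further down):

    use bempp_octree::morton::MortonKey;

    fn main() {
        // The i-th child of any key reports i as its child index.
        let parent = MortonKey::from_index_and_level([2, 3, 5], 7);
        for (i, child) in parent.children().iter().enumerate() {
            assert_eq!(child.child_index(), i);
        }
    }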
+ + let level = self.level() as u64; + + if level == DEEPEST_LEVEL { + return *self; + } + + let mut child_level = 1 + level; + let mut key = *self; + let outer_index = self.child_index() as u64; + + while child_level <= DEEPEST_LEVEL { + let shift = LEVEL_DISPLACEMENT + 3 * (DEEPEST_LEVEL - child_level); + key = MortonKey::new(1 + (key.value | outer_index << shift)); + child_level += 1; + } + + key + } + /// Linearize by sorting and removing overlaps. pub fn linearize(keys: &[MortonKey]) -> Vec { let mut new_keys = Vec::::new(); @@ -1245,4 +1281,38 @@ mod test { // Now compute the box. } + + #[test] + pub fn test_child_index() { + let key = MortonKey::from_index_and_level([1, 501, 718], 10); + + let children = key.children(); + + for (index, child) in children.iter().enumerate() { + assert_eq!(index, child.child_index()); + } + } + + #[test] + pub fn test_finest_outer_descendent() { + let key = MortonKey::from_index_and_level([0, 0, 0], 1); + + let finest_outer_descendent = key.finest_outer_descendent(); + + assert_eq!( + finest_outer_descendent, + MortonKey::from_index_and_level([0, 0, 0], DEEPEST_LEVEL as usize) + ); + + let key = MortonKey::from_index_and_level([1, 1, 0], 1); + let finest_outer_descendent = key.finest_outer_descendent(); + + assert_eq!( + finest_outer_descendent, + MortonKey::from_index_and_level( + [LEVEL_SIZE as usize - 1, LEVEL_SIZE as usize - 1, 0], + DEEPEST_LEVEL as usize + ) + ); + } } From be5d08329772b2a8cefce5a543f9d0535cb4ff5a Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Tue, 17 Sep 2024 09:41:24 +0100 Subject: [PATCH 15/42] WIP: Refactor and cleanup --- Cargo.toml | 1 + examples/battleship.rs | 2 +- examples/mpi_global_bounding_box.rs | 36 + examples/{mpi.rs => parallel_tests.rs} | 2 +- src/lib.rs | 3 +- src/octree.rs | 904 ++++++++++++++++++------- src/parallel_octree.rs | 836 ----------------------- src/parsort.rs | 45 -- src/serial.rs | 325 +++++++++ src/tools.rs | 135 ++++ 10 files changed, 1154 insertions(+), 1135 deletions(-) create mode 100644 examples/mpi_global_bounding_box.rs rename examples/{mpi.rs => parallel_tests.rs} (97%) delete mode 100644 src/parallel_octree.rs create mode 100644 src/serial.rs create mode 100644 src/tools.rs diff --git a/Cargo.toml b/Cargo.toml index a44ac3c..3c3fc1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ crate-type = ["cdylib", "lib"] [dependencies] itertools = "0.13.*" rand = { version = "0.8.5", features = ["alloc"] } +rand_chacha = "0.3.*" bytemuck = "1.*" vtkio = "0.6.*" mpi = {version = "0.8.*", features = ["derive", "user-operations"] } diff --git a/examples/battleship.rs b/examples/battleship.rs index 97f7c81..50587d9 100644 --- a/examples/battleship.rs +++ b/examples/battleship.rs @@ -4,7 +4,7 @@ use std::time::Instant; #[cfg(feature = "battleship")] -use bempp_octree::octree::Octree; +use bempp_octree::serial::Octree; #[cfg(feature = "battleship")] use vtkio::model::*; diff --git a/examples/mpi_global_bounding_box.rs b/examples/mpi_global_bounding_box.rs new file mode 100644 index 0000000..28748b2 --- /dev/null +++ b/examples/mpi_global_bounding_box.rs @@ -0,0 +1,36 @@ +//! Test the computation of a global bounding box across MPI ranks. + +use bempp_octree::octree::compute_global_bounding_box; +use mpi::traits::*; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. 
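Seeding the generator keeps runs reproducible; ranks that use the same seed draw the same sequence. A minimal sketch of that property (assuming only the `rand` and `rand_chacha` dependencies added above):

    use rand::prelude::*;
    use rand_chacha::ChaCha8Rng;

    fn main() {
        // Two generators with the same seed produce identical sequences.
        let mut a = ChaCha8Rng::seed_from_u64(2);
        let mut b = ChaCha8Rng::seed_from_u64(2);
        let xs: Vec<f64> = (0..5).map(|_| a.gen()).collect();
        let ys: Vec<f64> = (0..5).map(|_| b.gen()).collect();
        assert_eq!(xs, ys);
    }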
+ let mut rng = ChaCha8Rng::seed_from_u64(2); + + // Get the rank and size + let rank = comm.rank(); + let size = comm.size(); + + // Create `npoints` per rank. + let npoints = 10; + + // Generate random points. + + let mut points = Vec::::with_capacity(3 * npoints); + + for _ in 0..3 * npoints { + points.push(rng.gen()); + } + + // Compute the distributed bounding box. + + let bounding_box = compute_global_bounding_box(&points, &comm); +} diff --git a/examples/mpi.rs b/examples/parallel_tests.rs similarity index 97% rename from examples/mpi.rs rename to examples/parallel_tests.rs index 1837b8c..e3555e9 100644 --- a/examples/mpi.rs +++ b/examples/parallel_tests.rs @@ -1,7 +1,7 @@ //! Testing the hyksort component. use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; use bempp_octree::morton::MortonKey; -use bempp_octree::parallel_octree::{block_partition, is_sorted_array, linearize, partition}; +use bempp_octree::octree::{block_partition, is_sorted_array, linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; use itertools::{izip, Itertools}; use mpi::traits::*; diff --git a/src/lib.rs b/src/lib.rs index abc07bd..c94ad07 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,7 @@ pub mod constants; pub mod geometry; pub mod morton; pub mod octree; -pub mod parallel_octree; pub mod parsort; +pub mod serial; +pub mod tools; pub mod types; diff --git a/src/octree.rs b/src/octree.rs index 64cdb08..a3caa64 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,325 +1,727 @@ -//! Definition of a linear octree +//! Parallel Octree structure + +use std::collections::HashMap; use crate::{ - constants::{DEEPEST_LEVEL, NLEVELS}, + constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, + parsort::parsort, + tools::gather_to_all, }; -use bytemuck; -use std::collections::HashMap; -use vtkio; - -/// A neighbour -pub struct Neighbour { - /// Direction - pub direction: [i64; 3], - /// Level - pub level: usize, - /// Morton key - pub key: MortonKey, + +use mpi::{ + datatype::{Partition, PartitionMut}, + point_to_point as p2p, + traits::{Root, Source}, +}; + +use itertools::{izip, Itertools}; +use mpi::{ + collective::SystemOperation, + traits::{CommunicatorCollectives, Destination}, +}; +use rand::Rng; + +/// Compute the global bounding box across all points on all processes. +pub fn compute_global_bounding_box( + points: &[f64], + comm: &C, +) -> PhysicalBox { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + + // Now compute the minimum and maximum across each dimension. 
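The `bytemuck::cast_slice` call above reinterprets the flat coordinate buffer as packed 3-vectors without copying; it panics unless the length is a multiple of 3, which the preceding assert guards. A standalone sketch (illustration only):

    fn main() {
        // Flat x y z x y z ... buffer holding two points.
        let flat: Vec<f64> = vec![0.0, 0.5, 1.0, 2.0, 2.5, 3.0];
        assert_eq!(flat.len() % 3, 0);

        // Zero-copy reinterpretation as a slice of [f64; 3].
        let points: &[[f64; 3]] = bytemuck::cast_slice(&flat);
        assert_eq!(points.len(), 2);
        assert_eq!(points[1], [2.0, 2.5, 3.0]);
    }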
+ + let mut xmin = f64::MAX; + let mut xmax = f64::MIN; + + let mut ymin = f64::MAX; + let mut ymax = f64::MIN; + + let mut zmin = f64::MAX; + let mut zmax = f64::MIN; + + for point in points { + let x = point[0]; + let y = point[1]; + let z = point[2]; + + xmin = f64::min(xmin, x); + xmax = f64::max(xmax, x); + + ymin = f64::min(ymin, y); + ymax = f64::max(ymax, y); + + zmin = f64::min(zmin, z); + zmax = f64::max(zmax, z); + } + + let mut global_xmin = 0.0; + let mut global_xmax = 0.0; + + let mut global_ymin = 0.0; + let mut global_ymax = 0.0; + + let mut global_zmin = 0.0; + let mut global_zmax = 0.0; + + comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); + comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); + + comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); + comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); + + comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); + comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); + + let xdiam = global_xmax - global_xmin; + let ydiam = global_ymax - global_ymin; + let zdiam = global_zmax - global_zmin; + + let xmean = global_xmin + 0.5 * xdiam; + let ymean = global_ymin + 0.5 * ydiam; + let zmean = global_zmin + 0.5 * zdiam; + + // We increase diameters by box size on deepest level + // and use the maximum diameter to compute a + // cubic bounding box. + + let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; + + let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); + + let max_diam = max_diam * (1.0 + deepest_box_diam); + + PhysicalBox::new([ + xmean - 0.5 * max_diam, + ymean - 0.5 * max_diam, + zmean - 0.5 * max_diam, + xmean + 0.5 * max_diam, + ymean + 0.5 * max_diam, + zmean + 0.5 * max_diam, + ]) } -/// An octree -pub struct Octree { - leaf_keys: Vec, - points: Vec<[f64; 3]>, - point_to_level_keys: [Vec; NLEVELS], - bounding_box: PhysicalBox, - key_counts: HashMap, - max_leaf_level: usize, - max_points_in_leaf: usize, +/// Convert points to Morton keys on specified level. +pub fn points_to_morton( + points: &[f64], + max_level: usize, + comm: &C, +) -> (Vec, PhysicalBox) { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + + // Make sure that max level never exceeds DEEPEST_LEVEL + let max_level = if max_level > DEEPEST_LEVEL as usize { + DEEPEST_LEVEL as usize + } else { + max_level + }; + + // Compute the physical bounding box. + + let bounding_box = compute_global_bounding_box(points, comm); + + // Bunch the points in arrays of 3. + + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + + let keys = points + .iter() + .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) + .collect_vec(); + + // Now want to get weighted Morton keys. We use a HashMap. + + let mut value_counts = HashMap::::new(); + + for key in &keys { + *value_counts.entry(*key).or_insert(0) += 1; + } + + // let weighted_keys = value_counts + // .iter() + // .map(|(&key, &weight)| WeightedMortonKey::new(key, weight)) + // .collect_vec(); + + (keys, bounding_box) } -impl Octree { - /// Create octress from points - pub fn from_points(points: &[f64], max_level: usize, max_points_per_box: usize) -> Self { - // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); +/// Block partition of tree. +/// +/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned +/// keys and the associated coarse keys. 
+/// A necessary condition for the block partitioning is that +// all sorted keys are on the same level. +pub fn block_partition( + sorted_keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> (Vec, Vec) { + let rank = comm.rank(); + if comm.size() == 1 { + // On a single node block partitioning should not do anything. + return (sorted_keys.to_vec(), vec![MortonKey::root()]); + } - // Make sure that max level never exceeds DEEPEST_LEVEL - let max_level = if max_level > DEEPEST_LEVEL as usize { - DEEPEST_LEVEL as usize - } else { - max_level - }; + let mut completed_region = sorted_keys + .first() + .unwrap() + .fill_between_keys(*sorted_keys.last().unwrap()); + + completed_region.insert(0, *sorted_keys.first().unwrap()); + completed_region.push(*sorted_keys.last().unwrap()); + + // Get the smallest level members of the completed region. + + let min_level = completed_region + .iter() + .map(|elem| elem.level()) + .min() + .unwrap(); + + // Each process selects its largest boxes. These are used to create + // a coarse tree. + + let largest_boxes = completed_region + .iter() + .filter(|elem| elem.level() == min_level) + .copied() + .collect_vec(); + + let coarse_tree = complete_tree(&largest_boxes, rng, comm); + + // We want to partition the coarse tree. But we need the correct weights. The idea + // is that we use the number of original leafs that intersect with the coarse tree + // as leafs. In order to compute this we send the coarse tree around to all processes + // so that each process computes for each coarse tree element how many of its keys + // intersect with each node of the coarse tree. We then sum up the local weight for each + // coarse tree node across all nodes to get the weight. + + let global_coarse_tree = gather_to_all(&coarse_tree, comm); - // Compute the physical bounding box. + // We also want to send around a corresponding array of ranks so that for each global coarse tree key + // we have the rank of where it originates from. - let bounding_box = PhysicalBox::from_points(points); + let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); - // Bunch the points in arrays of 3. + // We now compute the local weights. + let mut local_weights = vec![0 as usize; global_coarse_tree.len()]; - let points: &[[f64; 3]] = bytemuck::cast_slice(points); - let npoints = points.len(); + // In the following loop we want to be a bit smart. We do not iterate through all the local elements. + // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region + // of our sorted keys that overlaps with the coarse tree region. - // We create a vector of keys for each point on each level. We compute the - // keys on the deepest level and fill the other levels by going from - // parent to parent. + // Let's find the start of our region. The start of our region is a coarse key that is an ancestor + // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. - let mut point_to_level_keys: [Vec; NLEVELS] = Default::default(); - point_to_level_keys[DEEPEST_LEVEL as usize] = points + let first_key = *sorted_keys.first().unwrap(); + + let first_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) + .count(); + + // Now we need to find the end index of our region. For this again we find the index of our coarse tree that + // is an ancestor of our last key. 
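The `take_while(..).count()` idiom used here yields the position of the first element that fails the predicate, i.e. the index of the first coarse key that is an ancestor. A toy illustration on integers (hypothetical values):

    fn main() {
        let sorted = [2u64, 4, 6, 9, 11];
        // Index of the first element that is >= 7: count the elements before it.
        let idx = sorted.iter().take_while(|&&x| x < 7).count();
        assert_eq!(idx, 3);
        assert_eq!(sorted[idx], 9);
        // If every element passed the predicate, `idx` would equal `sorted.len()`.
    }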
+ let last_key = *sorted_keys.last().unwrap(); + + let last_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) + .count(); + + // We now only need to iterate through between the first and last coarse index in the coarse tree. + // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). + + for (w, &global_coarse_key) in izip!( + local_weights[first_coarse_index..=last_coarse_index].iter_mut(), + global_coarse_tree[first_coarse_index..=last_coarse_index].iter() + ) { + *w += sorted_keys .iter() - .map(|&point| { - MortonKey::from_physical_point(point, &bounding_box, DEEPEST_LEVEL as usize) - }) - .collect::>(); - - for index in (1..=DEEPEST_LEVEL as usize).rev() { - let mut new_vec = Vec::::with_capacity(npoints); - for &key in &point_to_level_keys[index] { - new_vec.push(key.parent()); - } - point_to_level_keys[index - 1] = new_vec; - } + .filter(|&&key| global_coarse_key.is_ancestor(key)) + .count(); + } - // We now have to create level keys. We are starting at the root and recursing - // down until each box has fewer than max_points_per_box keys. + // We now need to sum up the weights across all processes. - // First we compute the counts of each key on each level. For that we create - // for each level a Hashmap for the keys and then add up. + let mut weights = vec![0 as usize; global_coarse_tree.len()]; - let mut key_counts: HashMap = Default::default(); + comm.all_reduce_into(&local_weights, &mut weights, SystemOperation::sum()); - for keys in &point_to_level_keys { - for key in keys { - *key_counts.entry(*key).or_default() += 1; - } - } + // Each process now has all weights. However, we only need the ones for the current process. + // So we just filter the rest out. - // We can now easily create an adaptive tree by subdividing. We do this by - // a recursive function. - - let mut leaf_keys = Vec::::new(); - - fn recurse_keys( - key: MortonKey, - key_counts: &HashMap, - leaf_keys: &mut Vec, - max_points_per_box: usize, - max_level: usize, - ) { - let level = key.level(); - // A key may have not be associated with points. This happens if one of the children on - // the previous level has no points in its physical box. However, we want to create a - // complete tree. So we still add this one empty child. - if let Some(&count) = key_counts.get(&key) { - if count > max_points_per_box && level < max_level { - for child in key.children() { - recurse_keys(child, key_counts, leaf_keys, max_points_per_box, max_level); - } - } else { - leaf_keys.push(key) - } + let weights = izip!(coarse_tree_ranks, weights) + .filter_map(|(r, weight)| { + if r == rank as usize { + Some(weight) } else { - leaf_keys.push(key) + None } - } + }) + .collect_vec(); - // Now execute the recursion starting from root + let coarse_tree = partition(&coarse_tree, &weights, comm); - recurse_keys( - MortonKey::root(), - &key_counts, - &mut leaf_keys, - max_points_per_box, - max_level, - ); + ( + redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm), + coarse_tree, + ) - // The leaf keys are now a complete linear tree. But they are not yet balanced. - // In the final step we balance the leafs. + // We now need to redistribute the global tree according to the coarse tree. +} - let leaf_keys = MortonKey::balance(&leaf_keys, MortonKey::root()); +/// Redistribute sorted keys with respect to a linear coarse tree. 
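Before the redistribution helper below, here is a hedged end-to-end sketch of how the pieces introduced so far are meant to compose. It assumes the `bempp_octree::octree` signatures defined in this patch and an initialised MPI world, and mirrors the flow of the parallel test example; level 6 is an arbitrary choice that keeps all keys on one level, as `block_partition` requires:

    use bempp_octree::octree::{block_partition, linearize, points_to_morton};
    use mpi::traits::*;
    use rand::prelude::*;
    use rand_chacha::ChaCha8Rng;

    fn main() {
        let universe = mpi::initialize().unwrap();
        let world = universe.world();
        let mut rng = ChaCha8Rng::seed_from_u64(0);

        // Hypothetical input: 100 local points in [0, 1)^3, flattened as x y z x y z ...
        let points: Vec<f64> = (0..300).map(|_| rng.gen::<f64>()).collect();

        // Points -> Morton keys on level 6.
        let (keys, _bounding_box) = points_to_morton(&points, 6, &world);

        // Globally sort and remove duplicates/ancestors, then block-partition.
        let sorted_keys = linearize(&keys, &mut rng, &world);
        let (partitioned_keys, coarse_keys) = block_partition(&sorted_keys, &mut rng, &world);

        println!(
            "rank {}: {} fine keys, {} coarse keys",
            world.rank(),
            partitioned_keys.len(),
            coarse_keys.len()
        );
    }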
+pub fn redistribute_with_respect_to_coarse_tree( + sorted_keys: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> Vec { + let size = comm.size(); - let mut max_leaf_level = 0; - let mut max_points_in_leaf = 0; + if size == 1 { + return sorted_keys.to_vec(); + } - for key in &leaf_keys { - max_leaf_level = max_leaf_level.max(key.level()); - max_points_in_leaf = - max_points_in_leaf.max(if let Some(&count) = key_counts.get(key) { - count - } else { - 0 - }); - } + // We want to globally redistribute keys so that the keys on each process are descendents + // of the local coarse tree keys. - Self { - leaf_keys, - points: points.to_vec(), - point_to_level_keys, - bounding_box, - key_counts, - max_leaf_level, - max_points_in_leaf, - } - } + // We are using here the fact that the coarse tree is complete and sorted. + // We are sending around to each process the first local index. This + // defines bins in which we sort our keys. The keys are then sent around to the correct + // processes via an alltoallv operation. - /// Leaf keys - pub fn leaf_keys(&self) -> &Vec { - &self.leaf_keys - } + let my_first = *coarse_tree.first().unwrap(); - /// Points - pub fn points(&self) -> &Vec<[f64; 3]> { - &self.points - } + let mut global_bins = Vec::::with_capacity(size as usize); + let global_bins_buff: &mut [MortonKey] = + unsafe { std::mem::transmute(global_bins.spare_capacity_mut()) }; - /// Get level keys for each point - pub fn point_to_level_keys(&self) -> &[Vec; NLEVELS] { - &self.point_to_level_keys - } + comm.all_gather_into(&my_first, global_bins_buff); - /// Bounding box - pub fn bounding_box(&self) -> &PhysicalBox { - &self.bounding_box - } + unsafe { global_bins.set_len(size as usize) }; + + // We now have the first index from each process. We also want + // an upper bound for the last index of the tree to make the sorting into + // bins easier. + global_bins.push(MortonKey::upper_bound()); + + // We now have our bins. We go through our keys and store how + // many keys are assigned to each rank. We are using here that + // our keys and the coarse tree are both sorted. + + // This will store for each rank how many keys will be assigned to it. + + let rank_counts = sort_to_bins(sorted_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); - /// Maximum leaf level - pub fn maximum_leaf_level(&self) -> usize { - self.max_leaf_level + // We now have the counts for each rank. Let's send it around via alltoallv. + + let mut counts_from_proc = vec![0 as i32; size as usize]; + + comm.all_to_all_into(&rank_counts, &mut counts_from_proc); + // Now compute the send and receive displacements. + + // We can now send around the actual elements with an alltoallv. + let send_displs: Vec = rank_counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp as i32) + }) + .collect(); + + let send_partition = Partition::new(&sorted_keys[..], &rank_counts[..], &send_displs[..]); + + let mut recvbuffer = vec![MortonKey::default(); counts_from_proc.iter().sum::() as usize]; + + let recv_displs: Vec = counts_from_proc + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = + PartitionMut::new(&mut recvbuffer[..], counts_from_proc, &recv_displs[..]); + comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + + recvbuffer +} + +/// Create bins from sorted keys. 
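The redistribution above leans on MPI's variable-count all-to-all. A self-contained sketch of that exchange pattern with integer payloads, using the same rsmpi `Partition`/`PartitionMut` types (toy counts, illustration only):

    use mpi::datatype::{Partition, PartitionMut};
    use mpi::traits::*;

    fn main() {
        let universe = mpi::initialize().unwrap();
        let world = universe.world();
        let size = world.size() as usize;
        let rank = world.rank();

        // Toy payload: every rank sends `r + 1` copies of its own rank id to rank r.
        let send_counts: Vec<i32> = (0..size).map(|r| r as i32 + 1).collect();
        let send_buffer: Vec<i32> = (0..size)
            .flat_map(|r| std::iter::repeat(rank).take(r + 1))
            .collect();

        // Exchange the counts so every rank knows how much it will receive.
        let mut recv_counts = vec![0i32; size];
        world.all_to_all_into(&send_counts, &mut recv_counts);

        // Displacements are exclusive prefix sums of the counts.
        let send_displs: Vec<i32> = send_counts
            .iter()
            .scan(0, |acc, &c| {
                let d = *acc;
                *acc += c;
                Some(d)
            })
            .collect();
        let recv_displs: Vec<i32> = recv_counts
            .iter()
            .scan(0, |acc, &c| {
                let d = *acc;
                *acc += c;
                Some(d)
            })
            .collect();

        let mut recv_buffer = vec![0i32; recv_counts.iter().sum::<i32>() as usize];

        let send_partition = Partition::new(&send_buffer[..], &send_counts[..], &send_displs[..]);
        let mut recv_partition =
            PartitionMut::new(&mut recv_buffer[..], &recv_counts[..], &recv_displs[..]);
        world.all_to_all_varcount_into(&send_partition, &mut recv_partition);

        // Rank r now holds `r + 1` entries from every rank, including itself.
        assert_eq!(recv_buffer.len(), size * (rank as usize + 1));
    }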
+pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { + let mut bin_counts = vec![0 as usize; bins.len() - 1]; + + // This iterates over each possible bin and returns also the associated rank. + let mut bin_iter = izip!( + bin_counts.iter_mut(), + bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), + ); + + // We take the first element of the bin iterator. There will always be at least one. + let mut r: &mut usize; + let mut bin_start: &MortonKey; + let mut bin_end: &MortonKey; + (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); + + for &key in sorted_keys.iter() { + if *bin_start <= key && key < *bin_end { + *r += 1; + } else { + // Move the bin forward until it fits. There will always be a fitting bin. + while let Some((rn, (bsn, ben))) = bin_iter.next() { + if *bsn <= key && key < *ben { + *rn += 1; + r = rn; + bin_start = bsn; + bin_end = ben; + break; + } + } + } } - /// Maximum number of points in a leaf box - pub fn max_points_in_leaf_box(&self) -> usize { - self.max_points_in_leaf + bin_counts +} + +/// Return a complete tree generated from local keys and associated coarse keys. +/// +/// The coarse keys are refined until the maximum level is reached or until each coarse key +/// is the ancestor of at most `max_keys` fine keys. +pub fn create_local_tree( + sorted_fine_keys: &[MortonKey], + coarse_keys: &[MortonKey], + mut max_level: usize, + max_keys: usize, +) -> Vec { + if max_level > DEEPEST_LEVEL as usize { + max_level = DEEPEST_LEVEL as usize; } - /// Number of points in the box indexed by a key - pub fn number_of_points_in_key(&self, key: MortonKey) -> usize { - if let Some(&count) = self.key_counts.get(&key) { - count + // We split the sorted fine keys into subslices so that each subslice + // is associated with a coarse slice. For this we need to add an upper bound + // coarse keys to ensure that we have suitable bins. + + let mut bins = coarse_keys.to_vec(); + bins.push(MortonKey::upper_bound()); + + let counts = sort_to_bins(&sorted_fine_keys, &bins); + + // We now know how many fine keys are associated with each coarse block. We iterate + // through and locally refine for each block that requires it. + + let mut remainder = sorted_fine_keys; + let mut new_coarse_keys = Vec::::new(); + + for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { + let current; + (current, remainder) = remainder.split_at(count); + if coarse_key.level() < max_level && current.len() > max_keys { + // We need to refine the current split. + new_coarse_keys.extend_from_slice( + create_local_tree( + current, + coarse_key.children().as_slice(), + max_level, + max_keys, + ) + .as_slice(), + ); } else { - 0 + new_coarse_keys.push(coarse_key) } } - /// Export the tree to vtk - pub fn export_to_vtk(&self, file_path: &str) { - use vtkio::model::{ - Attributes, ByteOrder, CellType, Cells, DataSet, IOBuffer, UnstructuredGridPiece, - Version, VertexNumbers, - }; + coarse_keys.to_vec() +} - // Each box has 8 corners with 3 coordinates each, hence 24 floats per key. - let mut points = Vec::::new(); - // 8 coords per box, hence 8 * nkeys values in connectivity. - let mut connectivity = Vec::::new(); - // Store the vtk offset for each box. - let mut offsets = Vec::::new(); +/// Linearize a set of weighted Morton keys. +pub fn linearize( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let size = comm.size(); + let rank = comm.rank(); - let bounding_box = self.bounding_box(); + // If we only have one process we use the standard serial linearization. 
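`sort_to_bins` above is the same helper `create_local_tree` uses to split sorted fine keys among their coarse ancestors. A usage sketch (it assumes, as `create_local_tree` does, that descendants of a key sort at or after that key and before its next sibling):

    use bempp_octree::morton::MortonKey;
    use bempp_octree::octree::sort_to_bins;

    fn main() {
        // Bins: the eight level-1 children of the root, closed off by an upper bound.
        let mut bins = MortonKey::root().children().to_vec();
        bins.push(MortonKey::upper_bound());

        // Sorted fine keys: all 64 level-2 descendants of the root.
        let mut fine_keys: Vec<MortonKey> = MortonKey::root()
            .children()
            .iter()
            .flat_map(|child| child.children())
            .collect();
        fine_keys.sort_unstable();

        // Eight level-2 keys fall under each level-1 bin.
        let counts = sort_to_bins(&fine_keys, &bins);
        assert_eq!(counts, vec![8usize; 8]);
        assert_eq!(counts.iter().sum::<usize>(), fine_keys.len());
    }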
- // Go through the keys and add coordinates and connectivity. - // Box coordinates are already in the right order, so connectivity - // just counts up. We don't mind doubly counted vertices from two boxes. - let mut point_count = 0; - let mut key_count = 0; + if size == 1 { + return MortonKey::linearize(keys); + } - for key in self.leaf_keys().iter() { - // We only want to export non-empty boxes. - if self.number_of_points_in_key(*key) == 0 { - continue; - } - let coords = key.physical_box(bounding_box).corners(); + // We are first sorting the keys. Then in a linear process across all processors we + // go through the arrays and delete ancestors of nodes. + + let sorted_keys = parsort(&keys, comm, rng); - key_count += 1; - offsets.push(8 * key_count); + // Each process needs to send its first element to the previous process. Each process + // then goes through its own list and retains elements that are not ancestors of the + // next element. - for coord in &coords { - points.push(coord[0]); - points.push(coord[1]); - points.push(coord[2]); + let mut result = Vec::::new(); - connectivity.push(point_count); - point_count += 1; + if rank == size - 1 { + comm.process_at_rank(rank - 1) + .send(sorted_keys.first().unwrap()); + + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); } } - let vtk_file = vtkio::Vtk { - version: Version::new((1, 0)), - title: String::new(), - byte_order: ByteOrder::LittleEndian, - file_path: None, - data: DataSet::inline(UnstructuredGridPiece { - points: IOBuffer::F64(points), - cells: Cells { - cell_verts: VertexNumbers::XML { - connectivity, - offsets, - }, - types: vec![CellType::Hexahedron; key_count as usize], - }, - data: Attributes { - point: vec![], - cell: vec![], - }, - }), + result.push(*sorted_keys.last().unwrap()); + } else { + let (other, _status) = if rank > 0 { + p2p::send_receive( + sorted_keys.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + comm.any_process().receive::() }; + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); + } + } + + let last = *sorted_keys.last().unwrap(); - vtk_file.export_ascii(file_path).unwrap(); + if !last.is_ancestor(other) { + result.push(last) + } } - // We can now create the vtk object. + result } -#[cfg(test)] -mod test { - use super::Octree; - use rand::prelude::*; +/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. +pub fn partition( + sorted_keys: &[MortonKey], + weights: &[usize], + comm: &C, +) -> Vec { + assert_eq!(sorted_keys.len(), weights.len()); - fn get_points_on_sphere(npoints: usize) -> Vec { - let mut rng = rand::rngs::StdRng::seed_from_u64(0); - let normal = rand_distr::Normal::new(0.0, 1.0).unwrap(); + let size = comm.size(); + let rank = comm.rank(); - let mut points = Vec::::with_capacity(3 * npoints); - for _ in 0..(npoints) { - let x: f64 = normal.sample(&mut rng); - let y: f64 = normal.sample(&mut rng); - let z: f64 = normal.sample(&mut rng); + // If we only have one process we simply return. - let norm = (x * x + y * y + z * z).sqrt(); + if size == 1 { + return sorted_keys.to_vec(); + } - points.push(x / norm); - points.push(y / norm); - points.push(z / norm); - } + // First scan the weight. 
+ // We scan the local arrays, then use a global scan operation on the last element + // of each array to get the global sums and then we update the array of each rank + // with the sum from the previous ranks. + + let mut scan: Vec = weights + .iter() + .scan(0, |state, x| { + *state += *x; + Some(*state) + }) + .collect_vec(); + let scan_last = *scan.last().unwrap(); + let mut scan_result: usize = 0; + comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); + for elem in &mut scan { + *elem += scan_result; + } - points + let mut total_weight = if rank == size - 1 { + *scan.last().unwrap() + } else { + 0 + }; + + // Scan the weight (form cumulative sums) and broadcast the total weight (last entry on last process) + // to all other processes. + + comm.process_at_rank(size - 1) + .broadcast_into(&mut total_weight); + + let w = total_weight / (size as usize); + let k = total_weight % (size as usize); + + let mut hash_map = HashMap::>::new(); + + // Sort the elements into bins according to which process they should be sent. + + for p in 1..=size as usize { + let q = if p <= k as usize { + izip!(sorted_keys, &scan) + .filter_map(|(&key, &s)| { + if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) + || (p == size as usize && (p - 1) * (1 + w) <= s) + { + Some(key) + } else { + None + } + }) + .collect_vec() + } else { + izip!(sorted_keys, &scan) + .filter_map(|(&key, &s)| { + if ((p - 1) * w + k <= s && s < p * w + k) + || (p == size as usize && (p - 1) * w + k <= s) + { + Some(key) + } else { + None + } + }) + .collect_vec() + }; + hash_map.insert(p - 1, q); } - #[test] - fn test_octree() { - use std::time::Instant; + // Now distribute the data with an all to all v. + // We create a vector of how many elements to send to each process and + // then send the actual data. + + let mut counts = vec![0 as i32; size as usize]; + let mut counts_from_processor = vec![0 as i32; size as usize]; - let npoints = 10000; - let points = get_points_on_sphere(npoints); - let max_level = 7; - let max_points_per_box = 100; + let mut all_elements = Vec::::new(); + for (index, c) in counts.iter_mut().enumerate() { + let elements = hash_map.get(&index).unwrap(); + *c = elements.len() as i32; + all_elements.extend(elements.iter()) + } + + // Send around the number of elements for each process + comm.all_to_all_into(&counts, &mut counts_from_processor); + + // We have the number of elements for each process now. Now send around + // the actual elements. + + // We can now send around the actual elements with an alltoallv. + let send_displs: Vec = counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp as i32) + }) + .collect(); + + let send_partition = Partition::new(&all_elements, &counts[..], &send_displs[..]); + + let mut recvbuffer = + vec![MortonKey::default(); counts_from_processor.iter().sum::() as usize]; + + let recv_displs: Vec = counts_from_processor + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = + PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); + comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + + recvbuffer.sort_unstable(); + recvbuffer +} - let start = Instant::now(); - let octree = Octree::from_points(&points, max_level, max_points_per_box); - let duration = start.elapsed(); +/// Given a distributed set of keys, generate a complete linear Octree. 
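A quick numeric illustration of the split rule in `partition` above: with total weight `W` over `size` ranks, `w = W / size` and `k = W % size`, so the first `k` ranks target `w + 1` units of weight and the rest target `w` (toy numbers, illustration only):

    fn main() {
        // Toy numbers: total weight 10 spread over 3 ranks.
        let (total_weight, size) = (10usize, 3usize);
        let (w, k) = (total_weight / size, total_weight % size);

        // Target weight per rank: the first k ranks absorb the remainder.
        let targets: Vec<usize> = (0..size).map(|r| if r < k { w + 1 } else { w }).collect();

        assert_eq!(targets, vec![4usize, 3, 3]);
        assert_eq!(targets.iter().sum::<usize>(), total_weight);
    }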
+pub fn complete_tree( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let mut linearized_keys = linearize(keys, rng, comm); - println!("Creation time: {}", duration.as_millis()); - println!("Number of leaf keys: {}", octree.leaf_keys().len()); - println!("Bounding box: {}", octree.bounding_box()); + let size = comm.size(); + let rank = comm.rank(); + + if size == 1 { + return MortonKey::complete_tree(linearized_keys.as_slice()); } - #[test] - fn test_export() { - let fname = "_test_sphere.vtk"; - let npoints = 10000; - let points = get_points_on_sphere(npoints); - let max_level = 7; - let max_points_per_box = 100; - - let octree = Octree::from_points(&points, max_level, max_points_per_box); - - octree.export_to_vtk(fname); - println!("Maximum leaf level: {}", octree.maximum_leaf_level()); - println!( - "Maximum number of points in leaf box: {}", - octree.max_points_in_leaf_box() - ); + // Now insert on the first and last process the first and last child of the + // finest ancestor of first/last box on deepest level + + // Send first element to previous rank and insert into local keys. + // On the first process we also need to insert the first child of the finest + // ancestor of the deepest first key and first element. Correspondingly on the last process + // we need to insert the last child of the finest ancester of the deepest last key and last element. + + if rank == size - 1 { + // On last process send first element to previous processes and insert last + // possible box from region into list. + comm.process_at_rank(rank - 1) + .send(linearized_keys.first().unwrap()); + let last_key = *linearized_keys.last().unwrap(); + let deepest_last = MortonKey::deepest_last(); + if !last_key.is_ancestor(deepest_last) { + let ancestor = deepest_last.finest_common_ancestor(last_key); + linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); + } + } else { + let (other, _status) = if rank > 0 { + // On intermediate process receive from the next process + // and send first element to previous process. + p2p::send_receive( + linearized_keys.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + // On first process insert at the beginning the first possible + // box in the region and receive the key from next process. + let first_key = *linearized_keys.first().unwrap(); + let deepest_first = MortonKey::deepest_first(); + if !first_key.is_ancestor(deepest_first) { + let ancestor = deepest_first.finest_common_ancestor(first_key); + linearized_keys.insert(0, ancestor.children()[0]); + } + + comm.process_at_rank(1).receive::() + }; + // If we are not at the last process we need to introduce the received key + // into our list. + linearized_keys.push(other); + }; + + // Now complete the regions defined by the keys on each process. + + let mut result = Vec::::new(); + + for (&key1, &key2) in linearized_keys.iter().tuple_windows() { + result.push(key1); + result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); + } + + if rank == size - 1 { + result.push(*linearized_keys.last().unwrap()); } + + result } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs deleted file mode 100644 index e77cdf1..0000000 --- a/src/parallel_octree.rs +++ /dev/null @@ -1,836 +0,0 @@ -//! 
Parallel Octree structure - -use std::collections::HashMap; - -use crate::{ - constants::{DEEPEST_LEVEL, NSIBLINGS}, - geometry::PhysicalBox, - morton::MortonKey, - parsort::{array_to_root, parsort}, -}; - -use mpi::{ - datatype::{Partition, PartitionMut}, - point_to_point as p2p, - traits::{Equivalence, Root, Source}, -}; - -use itertools::{izip, Itertools}; -use mpi::{ - collective::SystemOperation, - traits::{CommunicatorCollectives, Destination}, -}; -use rand::Rng; - -// /// A weighted Mortonkey contains weights to enable load balancing. -// #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Equivalence)] -// pub struct WeightedMortonKey { -// /// The actual MortonKey. -// pub key: MortonKey, -// /// The weight of the key, typically the number of points in the corresponding octant. -// pub weight: usize, -// } - -// impl WeightedMortonKey { -// /// Get a new weighted Morton key -// pub fn new(key: MortonKey, weight: usize) -> Self { -// Self { key, weight } -// } -// } - -// impl MinValue for WeightedMortonKey { -// fn min_value() -> Self { -// WeightedMortonKey { -// key: MortonKey::from_index_and_level([0, 0, 0], 0), -// weight: 0, -// } -// } -// } - -// impl MaxValue for WeightedMortonKey { -// fn max_value() -> Self { -// WeightedMortonKey { -// key: MortonKey::deepest_last(), -// weight: usize::MAX, -// } -// } -// } - -// impl Default for WeightedMortonKey { -// fn default() -> Self { -// WeightedMortonKey::new(Default::default(), 0) -// } -// } - -// impl Display for WeightedMortonKey { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// write!(f, "(Key: {}, Weight: {}", self.key, self.weight) -// } -// } - -/// Compute the global bounding box across all points on all processes. -pub fn compute_global_bounding_box( - points: &[f64], - comm: &C, -) -> PhysicalBox { - // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); - let points: &[[f64; 3]] = bytemuck::cast_slice(points); - - // Now compute the minimum and maximum across each dimension. - - let mut xmin = f64::MAX; - let mut xmax = f64::MIN; - - let mut ymin = f64::MAX; - let mut ymax = f64::MIN; - - let mut zmin = f64::MAX; - let mut zmax = f64::MIN; - - for point in points { - let x = point[0]; - let y = point[1]; - let z = point[2]; - - xmin = f64::min(xmin, x); - xmax = f64::max(xmax, x); - - ymin = f64::min(ymin, y); - ymax = f64::max(ymax, y); - - zmin = f64::min(zmin, z); - zmax = f64::max(zmax, z); - } - - let mut global_xmin = 0.0; - let mut global_xmax = 0.0; - - let mut global_ymin = 0.0; - let mut global_ymax = 0.0; - - let mut global_zmin = 0.0; - let mut global_zmax = 0.0; - - comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); - comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); - - comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); - comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); - - comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); - comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); - - let xdiam = global_xmax - global_xmin; - let ydiam = global_ymax - global_ymin; - let zdiam = global_zmax - global_zmin; - - let xmean = global_xmin + 0.5 * xdiam; - let ymean = global_ymin + 0.5 * ydiam; - let zmean = global_zmin + 0.5 * zdiam; - - // We increase diameters by box size on deepest level - // and use the maximum diameter to compute a - // cubic bounding box. 
- - let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; - - let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); - - let max_diam = max_diam * (1.0 + deepest_box_diam); - - PhysicalBox::new([ - xmean - 0.5 * max_diam, - ymean - 0.5 * max_diam, - zmean - 0.5 * max_diam, - xmean + 0.5 * max_diam, - ymean + 0.5 * max_diam, - zmean + 0.5 * max_diam, - ]) -} - -/// Convert points to Morton keys on specified level. -pub fn points_to_morton( - points: &[f64], - max_level: usize, - comm: &C, -) -> (Vec, PhysicalBox) { - // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); - - // Make sure that max level never exceeds DEEPEST_LEVEL - let max_level = if max_level > DEEPEST_LEVEL as usize { - DEEPEST_LEVEL as usize - } else { - max_level - }; - - // Compute the physical bounding box. - - let bounding_box = compute_global_bounding_box(points, comm); - - // Bunch the points in arrays of 3. - - let points: &[[f64; 3]] = bytemuck::cast_slice(points); - - let keys = points - .iter() - .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) - .collect_vec(); - - // Now want to get weighted Morton keys. We use a HashMap. - - let mut value_counts = HashMap::::new(); - - for key in &keys { - *value_counts.entry(*key).or_insert(0) += 1; - } - - // let weighted_keys = value_counts - // .iter() - // .map(|(&key, &weight)| WeightedMortonKey::new(key, weight)) - // .collect_vec(); - - (keys, bounding_box) -} - -/// Block partition of tree. -/// -/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned -/// keys and the associated coarse keys. -/// A necessary condition for the block partitioning is that -// all sorted keys are on the same level. -pub fn block_partition( - sorted_keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> (Vec, Vec) { - let rank = comm.rank(); - if comm.size() == 1 { - // On a single node block partitioning should not do anything. - return (sorted_keys.to_vec(), vec![MortonKey::root()]); - } - - let mut completed_region = sorted_keys - .first() - .unwrap() - .fill_between_keys(*sorted_keys.last().unwrap()); - - completed_region.insert(0, *sorted_keys.first().unwrap()); - completed_region.push(*sorted_keys.last().unwrap()); - - // Get the smallest level members of the completed region. - - let min_level = completed_region - .iter() - .map(|elem| elem.level()) - .min() - .unwrap(); - - // Each process selects its largest boxes. These are used to create - // a coarse tree. - - let largest_boxes = completed_region - .iter() - .filter(|elem| elem.level() == min_level) - .copied() - .collect_vec(); - - let coarse_tree = complete_tree(&largest_boxes, rng, comm); - - // We want to partition the coarse tree. But we need the correct weights. The idea - // is that we use the number of original leafs that intersect with the coarse tree - // as leafs. In order to compute this we send the coarse tree around to all processes - // so that each process computes for each coarse tree element how many of its keys - // intersect with each node of the coarse tree. We then sum up the local weight for each - // coarse tree node across all nodes to get the weight. - - let global_coarse_tree = gather_to_all(&coarse_tree, comm); - - // We also want to send around a corresponding array of ranks so that for each global coarse tree key - // we have the rank of where it originates from. - - let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); - - // We now compute the local weights. 
- let mut local_weights = vec![0 as usize; global_coarse_tree.len()]; - - // In the following loop we want to be a bit smart. We do not iterate through all the local elements. - // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region - // of our sorted keys that overlaps with the coarse tree region. - - // Let's find the start of our region. The start of our region is a coarse key that is an ancestor - // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. - - let first_key = *sorted_keys.first().unwrap(); - - let first_coarse_index = global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) - .count(); - - // Now we need to find the end index of our region. For this again we find the index of our coarse tree that - // is an ancestor of our last key. - let last_key = *sorted_keys.last().unwrap(); - - let last_coarse_index = global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) - .count(); - - // We now only need to iterate through between the first and last coarse index in the coarse tree. - // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). - - for (w, &global_coarse_key) in izip!( - local_weights[first_coarse_index..=last_coarse_index].iter_mut(), - global_coarse_tree[first_coarse_index..=last_coarse_index].iter() - ) { - *w += sorted_keys - .iter() - .filter(|&&key| global_coarse_key.is_ancestor(key)) - .count(); - } - - // We now need to sum up the weights across all processes. - - let mut weights = vec![0 as usize; global_coarse_tree.len()]; - - comm.all_reduce_into(&local_weights, &mut weights, SystemOperation::sum()); - - // Each process now has all weights. However, we only need the ones for the current process. - // So we just filter the rest out. - - let weights = izip!(coarse_tree_ranks, weights) - .filter_map(|(r, weight)| { - if r == rank as usize { - Some(weight) - } else { - None - } - }) - .collect_vec(); - - let coarse_tree = partition(&coarse_tree, &weights, comm); - - ( - redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm), - coarse_tree, - ) - - // We now need to redistribute the global tree according to the coarse tree. -} - -/// Redistribute sorted keys with respect to a linear coarse tree. -pub fn redistribute_with_respect_to_coarse_tree( - sorted_keys: &[MortonKey], - coarse_tree: &[MortonKey], - comm: &C, -) -> Vec { - let size = comm.size(); - - if size == 1 { - return sorted_keys.to_vec(); - } - - // We want to globally redistribute keys so that the keys on each process are descendents - // of the local coarse tree keys. - - // We are using here the fact that the coarse tree is complete and sorted. - // We are sending around to each process the first local index. This - // defines bins in which we sort our keys. The keys are then sent around to the correct - // processes via an alltoallv operation. - - let my_first = *coarse_tree.first().unwrap(); - - let mut global_bins = Vec::::with_capacity(size as usize); - let global_bins_buff: &mut [MortonKey] = - unsafe { std::mem::transmute(global_bins.spare_capacity_mut()) }; - - comm.all_gather_into(&my_first, global_bins_buff); - - unsafe { global_bins.set_len(size as usize) }; - - // We now have the first index from each process. We also want - // an upper bound for the last index of the tree to make the sorting into - // bins easier. 
- global_bins.push(MortonKey::upper_bound()); - - // We now have our bins. We go through our keys and store how - // many keys are assigned to each rank. We are using here that - // our keys and the coarse tree are both sorted. - - // This will store for each rank how many keys will be assigned to it. - - let rank_counts = sort_to_bins(sorted_keys, &global_bins) - .iter() - .map(|&elem| elem as i32) - .collect_vec(); - - // We now have the counts for each rank. Let's send it around via alltoallv. - - let mut counts_from_proc = vec![0 as i32; size as usize]; - - comm.all_to_all_into(&rank_counts, &mut counts_from_proc); - // Now compute the send and receive displacements. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = rank_counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp as i32) - }) - .collect(); - - let send_partition = Partition::new(&sorted_keys[..], &rank_counts[..], &send_displs[..]); - - let mut recvbuffer = vec![MortonKey::default(); counts_from_proc.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_proc - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_proc, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); - - recvbuffer -} - -/// Create bins from sorted keys. -pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { - let mut bin_counts = vec![0 as usize; bins.len() - 1]; - - // This iterates over each possible bin and returns also the associated rank. - let mut bin_iter = izip!( - bin_counts.iter_mut(), - bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), - ); - - // We take the first element of the bin iterator. There will always be at least one. - let mut r: &mut usize; - let mut bin_start: &MortonKey; - let mut bin_end: &MortonKey; - (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - - for &key in sorted_keys.iter() { - if *bin_start <= key && key < *bin_end { - *r += 1; - } else { - // Move the bin forward until it fits. There will always be a fitting bin. - while let Some((rn, (bsn, ben))) = bin_iter.next() { - if *bsn <= key && key < *ben { - *rn += 1; - r = rn; - bin_start = bsn; - bin_end = ben; - break; - } - } - } - } - - bin_counts -} - -/// Return a complete tree generated from local keys and associated coarse keys. -/// -/// The coarse keys are refined until the maximum level is reached or until each coarse key -/// is the ancestor of at most `max_keys` fine keys. -pub fn create_local_tree( - sorted_fine_keys: &[MortonKey], - coarse_keys: &[MortonKey], - mut max_level: usize, - max_keys: usize, -) -> Vec { - if max_level > DEEPEST_LEVEL as usize { - max_level = DEEPEST_LEVEL as usize; - } - - // We split the sorted fine keys into subslices so that each subslice - // is associated with a coarse slice. For this we need to add an upper bound - // coarse keys to ensure that we have suitable bins. - - let mut bins = coarse_keys.to_vec(); - bins.push(MortonKey::upper_bound()); - - let counts = sort_to_bins(&sorted_fine_keys, &bins); - - // We now know how many fine keys are associated with each coarse block. We iterate - // through and locally refine for each block that requires it. 
- - let mut remainder = sorted_fine_keys; - let mut new_coarse_keys = Vec::::new(); - - for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { - let current; - (current, remainder) = remainder.split_at(count); - if coarse_key.level() < max_level && current.len() > max_keys { - // We need to refine the current split. - new_coarse_keys.extend_from_slice( - create_local_tree( - current, - coarse_key.children().as_slice(), - max_level, - max_keys, - ) - .as_slice(), - ); - } else { - new_coarse_keys.push(coarse_key) - } - } - - coarse_keys.to_vec() -} - -/// Linearize a set of weighted Morton keys. -pub fn linearize( - keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> Vec { - let size = comm.size(); - let rank = comm.rank(); - - // If we only have one process we use the standard serial linearization. - - if size == 1 { - return MortonKey::linearize(keys); - } - - // We are first sorting the keys. Then in a linear process across all processors we - // go through the arrays and delete ancestors of nodes. - - let sorted_keys = parsort(&keys, comm, rng); - - // Each process needs to send its first element to the previous process. Each process - // then goes through its own list and retains elements that are not ancestors of the - // next element. - - let mut result = Vec::::new(); - - if rank == size - 1 { - comm.process_at_rank(rank - 1) - .send(sorted_keys.first().unwrap()); - - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } - } - - result.push(*sorted_keys.last().unwrap()); - } else { - let (other, _status) = if rank > 0 { - p2p::send_receive( - sorted_keys.first().unwrap(), - &comm.process_at_rank(rank - 1), - &comm.process_at_rank(rank + 1), - ) - } else { - comm.any_process().receive::() - }; - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } - } - - let last = *sorted_keys.last().unwrap(); - - if !last.is_ancestor(other) { - result.push(last) - } - } - - result -} - -/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. -pub fn partition( - sorted_keys: &[MortonKey], - weights: &[usize], - comm: &C, -) -> Vec { - assert_eq!(sorted_keys.len(), weights.len()); - - let size = comm.size(); - let rank = comm.rank(); - - // If we only have one process we simply return. - - if size == 1 { - return sorted_keys.to_vec(); - } - - // First scan the weight. - // We scan the local arrays, then use a global scan operation on the last element - // of each array to get the global sums and then we update the array of each rank - // with the sum from the previous ranks. - - let mut scan: Vec = weights - .iter() - .scan(0, |state, x| { - *state += *x; - Some(*state) - }) - .collect_vec(); - let scan_last = *scan.last().unwrap(); - let mut scan_result: usize = 0; - comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); - for elem in &mut scan { - *elem += scan_result; - } - - let mut total_weight = if rank == size - 1 { - *scan.last().unwrap() - } else { - 0 - }; - - // Scan the weight (form cumulative sums) and broadcast the total weight (last entry on last process) - // to all other processes. 
- - comm.process_at_rank(size - 1) - .broadcast_into(&mut total_weight); - - let w = total_weight / (size as usize); - let k = total_weight % (size as usize); - - let mut hash_map = HashMap::>::new(); - - // Sort the elements into bins according to which process they should be sent. - - for p in 1..=size as usize { - let q = if p <= k as usize { - izip!(sorted_keys, &scan) - .filter_map(|(&key, &s)| { - if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) - || (p == size as usize && (p - 1) * (1 + w) <= s) - { - Some(key) - } else { - None - } - }) - .collect_vec() - } else { - izip!(sorted_keys, &scan) - .filter_map(|(&key, &s)| { - if ((p - 1) * w + k <= s && s < p * w + k) - || (p == size as usize && (p - 1) * w + k <= s) - { - Some(key) - } else { - None - } - }) - .collect_vec() - }; - hash_map.insert(p - 1, q); - } - - // Now distribute the data with an all to all v. - // We create a vector of how many elements to send to each process and - // then send the actual data. - - let mut counts = vec![0 as i32; size as usize]; - let mut counts_from_processor = vec![0 as i32; size as usize]; - - let mut all_elements = Vec::::new(); - for (index, c) in counts.iter_mut().enumerate() { - let elements = hash_map.get(&index).unwrap(); - *c = elements.len() as i32; - all_elements.extend(elements.iter()) - } - - // Send around the number of elements for each process - comm.all_to_all_into(&counts, &mut counts_from_processor); - - // We have the number of elements for each process now. Now send around - // the actual elements. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp as i32) - }) - .collect(); - - let send_partition = Partition::new(&all_elements, &counts[..], &send_displs[..]); - - let mut recvbuffer = - vec![MortonKey::default(); counts_from_processor.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_processor - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); - - recvbuffer.sort_unstable(); - recvbuffer -} - -/// Given a distributed set of keys, generate a complete linear Octree. -pub fn complete_tree( - keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> Vec { - let mut linearized_keys = linearize(keys, rng, comm); - - let size = comm.size(); - let rank = comm.rank(); - - if size == 1 { - return MortonKey::complete_tree(linearized_keys.as_slice()); - } - - // Now insert on the first and last process the first and last child of the - // finest ancestor of first/last box on deepest level - - // Send first element to previous rank and insert into local keys. - // On the first process we also need to insert the first child of the finest - // ancestor of the deepest first key and first element. Correspondingly on the last process - // we need to insert the last child of the finest ancester of the deepest last key and last element. - - if rank == size - 1 { - // On last process send first element to previous processes and insert last - // possible box from region into list. 
- comm.process_at_rank(rank - 1) - .send(linearized_keys.first().unwrap()); - let last_key = *linearized_keys.last().unwrap(); - let deepest_last = MortonKey::deepest_last(); - if !last_key.is_ancestor(deepest_last) { - let ancestor = deepest_last.finest_common_ancestor(last_key); - linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); - } - } else { - let (other, _status) = if rank > 0 { - // On intermediate process receive from the next process - // and send first element to previous process. - p2p::send_receive( - linearized_keys.first().unwrap(), - &comm.process_at_rank(rank - 1), - &comm.process_at_rank(rank + 1), - ) - } else { - // On first process insert at the beginning the first possible - // box in the region and receive the key from next process. - let first_key = *linearized_keys.first().unwrap(); - let deepest_first = MortonKey::deepest_first(); - if !first_key.is_ancestor(deepest_first) { - let ancestor = deepest_first.finest_common_ancestor(first_key); - linearized_keys.insert(0, ancestor.children()[0]); - } - - comm.process_at_rank(1).receive::() - }; - // If we are not at the last process we need to introduce the received key - // into our list. - linearized_keys.push(other); - }; - - // Now complete the regions defined by the keys on each process. - - let mut result = Vec::::new(); - - for (&key1, &key2) in linearized_keys.iter().tuple_windows() { - result.push(key1); - result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); - } - - if rank == size - 1 { - result.push(*linearized_keys.last().unwrap()); - } - - result -} - -/// Check if an array is sorted. -pub fn is_sorted_array(arr: &[MortonKey], comm: &C) -> Option { - let arr = array_to_root(arr, comm); - if comm.rank() == 0 { - let arr = arr.unwrap(); - for (&elem1, &elem2) in arr.iter().tuple_windows() { - if elem1 > elem2 { - return Some(false); - } - } - Some(true) - } else { - None - } -} - -/// Get global size of a distributed array. -pub fn global_size(arr: &[T], comm: &C) -> usize { - let local_size = arr.len(); - let mut global_size = 0; - - comm.all_reduce_into(&local_size, &mut global_size, SystemOperation::sum()); - - global_size -} - -/// Gather array to all processes -pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { - // First we need to broadcast the individual sizes on each process. - - let size = comm.size(); - - let local_len = arr.len() as i32; - - let mut sizes = vec![0 as i32; size as usize]; - - comm.all_gather_into(&local_len, &mut sizes); - - let recv_len = sizes.iter().sum::() as usize; - - // Now we have the size of each local contribution. 
- // let mut recvbuffer = - // vec![T: Default; counts_from_processor.iter().sum::() as usize]; - let mut recvbuffer = Vec::::with_capacity(recv_len); - let buf: &mut [T] = unsafe { std::mem::transmute(recvbuffer.spare_capacity_mut()) }; - - let recv_displs: Vec = sizes - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = PartitionMut::new(buf, sizes, &recv_displs[..]); - - comm.all_gather_varcount_into(arr, &mut receiv_partition); - - unsafe { recvbuffer.set_len(recv_len) }; - - recvbuffer -} diff --git a/src/parsort.rs b/src/parsort.rs index 16518cc..0ffe189 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -397,51 +397,6 @@ impl<'a, T> Iterator for Split<'a, T> { } } -/// Array to root -pub fn array_to_root( - arr: &[T], - comm: &C, -) -> Option> { - let n = arr.len() as i32; - let rank = comm.rank(); - let size = comm.size(); - let root_process = comm.process_at_rank(0); - - // We first communicate the length of the array to root. - - if rank == 0 { - // We are at root. - - let mut ranks = vec![0_i32; size as usize]; - root_process.gather_into_root(&n, &mut ranks); - - // We now have all ranks at root. Can now a varcount gather to get - // the array elements. - - let nelements = ranks.iter().sum::(); - - let mut new_arr = vec![::default(); nelements as usize]; - - let displs: Vec = ranks - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut partition = PartitionMut::new(&mut new_arr[..], ranks, &displs[..]); - - root_process.gather_varcount_into_root(arr, &mut partition); - Some(new_arr) - } else { - root_process.gather_into(&n); - root_process.gather_varcount_into(arr); - None - } -} - macro_rules! impl_min_max_value { ($type:ty) => { impl MinValue for $type { diff --git a/src/serial.rs b/src/serial.rs new file mode 100644 index 0000000..64cdb08 --- /dev/null +++ b/src/serial.rs @@ -0,0 +1,325 @@ +//! Definition of a linear octree + +use crate::{ + constants::{DEEPEST_LEVEL, NLEVELS}, + geometry::PhysicalBox, + morton::MortonKey, +}; +use bytemuck; +use std::collections::HashMap; +use vtkio; + +/// A neighbour +pub struct Neighbour { + /// Direction + pub direction: [i64; 3], + /// Level + pub level: usize, + /// Morton key + pub key: MortonKey, +} + +/// An octree +pub struct Octree { + leaf_keys: Vec, + points: Vec<[f64; 3]>, + point_to_level_keys: [Vec; NLEVELS], + bounding_box: PhysicalBox, + key_counts: HashMap, + max_leaf_level: usize, + max_points_in_leaf: usize, +} + +impl Octree { + /// Create octress from points + pub fn from_points(points: &[f64], max_level: usize, max_points_per_box: usize) -> Self { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + + // Make sure that max level never exceeds DEEPEST_LEVEL + let max_level = if max_level > DEEPEST_LEVEL as usize { + DEEPEST_LEVEL as usize + } else { + max_level + }; + + // Compute the physical bounding box. + + let bounding_box = PhysicalBox::from_points(points); + + // Bunch the points in arrays of 3. + + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + let npoints = points.len(); + + // We create a vector of keys for each point on each level. We compute the + // keys on the deepest level and fill the other levels by going from + // parent to parent. 
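+        // Sketch of the layout produced below: after this step `point_to_level_keys[l][i]`
+        // holds the level-`l` key of the box containing point `i`. Only the deepest level
+        // is computed directly from the coordinates; every coarser level is obtained by
+        // repeatedly taking `parent()`.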
+ + let mut point_to_level_keys: [Vec; NLEVELS] = Default::default(); + point_to_level_keys[DEEPEST_LEVEL as usize] = points + .iter() + .map(|&point| { + MortonKey::from_physical_point(point, &bounding_box, DEEPEST_LEVEL as usize) + }) + .collect::>(); + + for index in (1..=DEEPEST_LEVEL as usize).rev() { + let mut new_vec = Vec::::with_capacity(npoints); + for &key in &point_to_level_keys[index] { + new_vec.push(key.parent()); + } + point_to_level_keys[index - 1] = new_vec; + } + + // We now have to create level keys. We are starting at the root and recursing + // down until each box has fewer than max_points_per_box keys. + + // First we compute the counts of each key on each level. For that we create + // for each level a Hashmap for the keys and then add up. + + let mut key_counts: HashMap = Default::default(); + + for keys in &point_to_level_keys { + for key in keys { + *key_counts.entry(*key).or_default() += 1; + } + } + + // We can now easily create an adaptive tree by subdividing. We do this by + // a recursive function. + + let mut leaf_keys = Vec::::new(); + + fn recurse_keys( + key: MortonKey, + key_counts: &HashMap, + leaf_keys: &mut Vec, + max_points_per_box: usize, + max_level: usize, + ) { + let level = key.level(); + // A key may have not be associated with points. This happens if one of the children on + // the previous level has no points in its physical box. However, we want to create a + // complete tree. So we still add this one empty child. + if let Some(&count) = key_counts.get(&key) { + if count > max_points_per_box && level < max_level { + for child in key.children() { + recurse_keys(child, key_counts, leaf_keys, max_points_per_box, max_level); + } + } else { + leaf_keys.push(key) + } + } else { + leaf_keys.push(key) + } + } + + // Now execute the recursion starting from root + + recurse_keys( + MortonKey::root(), + &key_counts, + &mut leaf_keys, + max_points_per_box, + max_level, + ); + + // The leaf keys are now a complete linear tree. But they are not yet balanced. + // In the final step we balance the leafs. 
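+        // Balancing is assumed here to enforce the usual 2:1 condition, i.e. that
+        // neighbouring leaf boxes differ by at most one level; the exact guarantee is
+        // whatever `MortonKey::balance` provides.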
+ + let leaf_keys = MortonKey::balance(&leaf_keys, MortonKey::root()); + + let mut max_leaf_level = 0; + let mut max_points_in_leaf = 0; + + for key in &leaf_keys { + max_leaf_level = max_leaf_level.max(key.level()); + max_points_in_leaf = + max_points_in_leaf.max(if let Some(&count) = key_counts.get(key) { + count + } else { + 0 + }); + } + + Self { + leaf_keys, + points: points.to_vec(), + point_to_level_keys, + bounding_box, + key_counts, + max_leaf_level, + max_points_in_leaf, + } + } + + /// Leaf keys + pub fn leaf_keys(&self) -> &Vec { + &self.leaf_keys + } + + /// Points + pub fn points(&self) -> &Vec<[f64; 3]> { + &self.points + } + + /// Get level keys for each point + pub fn point_to_level_keys(&self) -> &[Vec; NLEVELS] { + &self.point_to_level_keys + } + + /// Bounding box + pub fn bounding_box(&self) -> &PhysicalBox { + &self.bounding_box + } + + /// Maximum leaf level + pub fn maximum_leaf_level(&self) -> usize { + self.max_leaf_level + } + + /// Maximum number of points in a leaf box + pub fn max_points_in_leaf_box(&self) -> usize { + self.max_points_in_leaf + } + + /// Number of points in the box indexed by a key + pub fn number_of_points_in_key(&self, key: MortonKey) -> usize { + if let Some(&count) = self.key_counts.get(&key) { + count + } else { + 0 + } + } + + /// Export the tree to vtk + pub fn export_to_vtk(&self, file_path: &str) { + use vtkio::model::{ + Attributes, ByteOrder, CellType, Cells, DataSet, IOBuffer, UnstructuredGridPiece, + Version, VertexNumbers, + }; + + // Each box has 8 corners with 3 coordinates each, hence 24 floats per key. + let mut points = Vec::::new(); + // 8 coords per box, hence 8 * nkeys values in connectivity. + let mut connectivity = Vec::::new(); + // Store the vtk offset for each box. + let mut offsets = Vec::::new(); + + let bounding_box = self.bounding_box(); + + // Go through the keys and add coordinates and connectivity. + // Box coordinates are already in the right order, so connectivity + // just counts up. We don't mind doubly counted vertices from two boxes. + let mut point_count = 0; + let mut key_count = 0; + + for key in self.leaf_keys().iter() { + // We only want to export non-empty boxes. + if self.number_of_points_in_key(*key) == 0 { + continue; + } + let coords = key.physical_box(bounding_box).corners(); + + key_count += 1; + offsets.push(8 * key_count); + + for coord in &coords { + points.push(coord[0]); + points.push(coord[1]); + points.push(coord[2]); + + connectivity.push(point_count); + point_count += 1; + } + } + + let vtk_file = vtkio::Vtk { + version: Version::new((1, 0)), + title: String::new(), + byte_order: ByteOrder::LittleEndian, + file_path: None, + data: DataSet::inline(UnstructuredGridPiece { + points: IOBuffer::F64(points), + cells: Cells { + cell_verts: VertexNumbers::XML { + connectivity, + offsets, + }, + types: vec![CellType::Hexahedron; key_count as usize], + }, + data: Attributes { + point: vec![], + cell: vec![], + }, + }), + }; + + vtk_file.export_ascii(file_path).unwrap(); + } + + // We can now create the vtk object. 
+} + +#[cfg(test)] +mod test { + use super::Octree; + use rand::prelude::*; + + fn get_points_on_sphere(npoints: usize) -> Vec { + let mut rng = rand::rngs::StdRng::seed_from_u64(0); + let normal = rand_distr::Normal::new(0.0, 1.0).unwrap(); + + let mut points = Vec::::with_capacity(3 * npoints); + for _ in 0..(npoints) { + let x: f64 = normal.sample(&mut rng); + let y: f64 = normal.sample(&mut rng); + let z: f64 = normal.sample(&mut rng); + + let norm = (x * x + y * y + z * z).sqrt(); + + points.push(x / norm); + points.push(y / norm); + points.push(z / norm); + } + + points + } + + #[test] + fn test_octree() { + use std::time::Instant; + + let npoints = 10000; + let points = get_points_on_sphere(npoints); + let max_level = 7; + let max_points_per_box = 100; + + let start = Instant::now(); + let octree = Octree::from_points(&points, max_level, max_points_per_box); + let duration = start.elapsed(); + + println!("Creation time: {}", duration.as_millis()); + println!("Number of leaf keys: {}", octree.leaf_keys().len()); + println!("Bounding box: {}", octree.bounding_box()); + } + + #[test] + fn test_export() { + let fname = "_test_sphere.vtk"; + let npoints = 10000; + let points = get_points_on_sphere(npoints); + let max_level = 7; + let max_points_per_box = 100; + + let octree = Octree::from_points(&points, max_level, max_points_per_box); + + octree.export_to_vtk(fname); + println!("Maximum leaf level: {}", octree.maximum_leaf_level()); + println!( + "Maximum number of points in leaf box: {}", + octree.max_points_in_leaf_box() + ); + } +} diff --git a/src/tools.rs b/src/tools.rs new file mode 100644 index 0000000..394273e --- /dev/null +++ b/src/tools.rs @@ -0,0 +1,135 @@ +//! Utility routines. + +use mpi::{ + collective::SystemOperation, + datatype::PartitionMut, + traits::{CommunicatorCollectives, Equivalence, Root}, +}; + +/// Gather array to all processes +pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { + // First we need to broadcast the individual sizes on each process. + + let size = comm.size(); + + let local_len = arr.len() as i32; + + let mut sizes = vec![0 as i32; size as usize]; + + comm.all_gather_into(&local_len, &mut sizes); + + let recv_len = sizes.iter().sum::() as usize; + + // Now we have the size of each local contribution. + // let mut recvbuffer = + // vec![T: Default; counts_from_processor.iter().sum::() as usize]; + let mut recvbuffer = Vec::::with_capacity(recv_len); + let buf: &mut [T] = unsafe { std::mem::transmute(recvbuffer.spare_capacity_mut()) }; + + let recv_displs: Vec = sizes + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = PartitionMut::new(buf, sizes, &recv_displs[..]); + + comm.all_gather_varcount_into(arr, &mut receiv_partition); + + unsafe { recvbuffer.set_len(recv_len) }; + + recvbuffer +} +/// Array to root + +/// Gather distributed array to the root rank. +/// +/// The result is a `Vec` on root and `None` on all other ranks. +pub fn gather_to_root( + arr: &[T], + comm: &C, +) -> Option> { + let n = arr.len() as i32; + let rank = comm.rank(); + let size = comm.size(); + let root_process = comm.process_at_rank(0); + + // We first communicate the length of the array to root. + + if rank == 0 { + // We are at root. + + let mut counts = vec![0_i32; size as usize]; + root_process.gather_into_root(&n, &mut counts); + + // We now have all ranks at root. Can now a varcount gather to get + // the array elements. 
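+        // Note on the pattern below: the receive buffer is allocated uninitialised via
+        // `spare_capacity_mut` and its length is only set after the varcount gather has
+        // filled every slot (the received counts sum to `nelements`), so no element is
+        // read before it has been written.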
+ + let nelements = counts.iter().sum::(); + let mut new_arr = Vec::::with_capacity(nelements as usize); + let new_arr_buf: &mut [T] = unsafe { std::mem::transmute(new_arr.spare_capacity_mut()) }; + + let displs = displacements(counts.as_slice()); + + let mut partition = PartitionMut::new(new_arr_buf, counts, &displs[..]); + + root_process.gather_varcount_into_root(arr, &mut partition); + + unsafe { new_arr.set_len(nelements as usize) }; + Some(new_arr) + } else { + root_process.gather_into(&n); + root_process.gather_varcount_into(arr); + None + } +} + +/// Get global size of a distributed array. +/// +/// Computes the size and broadcoasts it to all ranks. +pub fn global_size(arr: &[T], comm: &C) -> usize { + let local_size = arr.len(); + let mut global_size = 0; + + comm.all_reduce_into(&local_size, &mut global_size, SystemOperation::sum()); + + global_size +} + +/// Check if an array is sorted. +pub fn is_sorted_array( + arr: &[MortonKey], + comm: &C, +) -> Option { + let arr = gather_to_root(arr, comm); + if comm.rank() == 0 { + let arr = arr.unwrap(); + for (&elem1, &elem2) in arr.iter().tuple_windows() { + if elem1 > elem2 { + return Some(false); + } + } + Some(true) + } else { + None + } +} + +/// Compute displacements from a vector of counts. +/// +/// This is useful for global MPI varcount operations. Let +/// count [ 3, 4, 5]. Then the corresponding displacements are +// [0, 3, 7]. Note that the last element `5` is ignored. +pub fn displacements(counts: &[i32]) -> Vec { + counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect() +} From 301156c7481cb8256848782d95299e7c73ae39df Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Tue, 17 Sep 2024 23:19:53 +0100 Subject: [PATCH 16/42] WIP: Refactor --- src/parsort.rs | 32 ++++++------------------- src/tools.rs | 65 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 39 deletions(-) diff --git a/src/parsort.rs b/src/parsort.rs index 0ffe189..93c9868 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -5,13 +5,15 @@ use std::mem::offset_of; use itertools::Itertools; use mpi::datatype::{UncommittedDatatypeRef, UncommittedUserDatatype, UserDatatype}; -use mpi::traits::{Equivalence, Root}; +use mpi::traits::Equivalence; use mpi::{ datatype::{Partition, PartitionMut}, traits::CommunicatorCollectives, }; use rand::{seq::SliceRandom, Rng}; +use crate::tools::displacements; + const OVERSAMPLING: usize = 8; /// Sortable trait that each type fed into parsort needs to satisfy. @@ -177,14 +179,7 @@ where let mut all_splitters = vec![Default::default(); n_all_splitters]; let splitters_per_rank = splitters_per_rank.iter().map(|&x| x as i32).collect_vec(); - let displs: Vec = splitters_per_rank - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); + let displs = displacements(&splitters_per_rank); let mut partition = PartitionMut::new(&mut all_splitters[..], splitters_per_rank, &displs[..]); comm.all_gather_varcount_into(&splitters, &mut partition); @@ -331,28 +326,15 @@ pub fn parsort // Each processor now knows how much he gets from all the others. // We can now send around the actual elements with an alltoallv. 
- let send_displs: Vec = counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); + + let send_displs = displacements(&counts); let send_partition = Partition::new(&arr, counts, &send_displs[..]); let mut recvbuffer = vec![UniqueItem::default(); counts_from_processor.iter().sum::() as usize]; - let recv_displs: Vec = counts_from_processor - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); + let recv_displs = displacements(&counts_from_processor); let mut receiv_partition = PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); diff --git a/src/tools.rs b/src/tools.rs index 394273e..0811629 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -1,9 +1,13 @@ //! Utility routines. +use std::mem::MaybeUninit; + +use itertools::Itertools; use mpi::{ collective::SystemOperation, datatype::PartitionMut, - traits::{CommunicatorCollectives, Equivalence, Root}, + point_to_point as p2p, + traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source}, }; /// Gather array to all processes @@ -99,23 +103,56 @@ pub fn global_size(arr: &[T], comm: &C) -> usize global_size } +/// Communicate the first element of each local array back to the previous rank. +pub fn communicate_back( + arr: &[T], + comm: &C, +) -> Option { + let rank = comm.rank(); + let size = comm.size(); + + if rank == size - 1 { + comm.process_at_rank(rank - 1).send(arr.first().unwrap()); + return None; + } else { + let (new_last, _status) = if rank > 0 { + p2p::send_receive( + arr.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + comm.process_at_rank(1).receive::() + }; + Some(new_last) + } +} + /// Check if an array is sorted. -pub fn is_sorted_array( - arr: &[MortonKey], +pub fn is_sorted_array( + arr: &[T], comm: &C, -) -> Option { - let arr = gather_to_root(arr, comm); - if comm.rank() == 0 { - let arr = arr.unwrap(); - for (&elem1, &elem2) in arr.iter().tuple_windows() { - if elem1 > elem2 { - return Some(false); - } +) -> bool { + let mut sorted = true; + for (elem1, elem2) in arr.iter().tuple_windows() { + if elem1 > elem2 { + sorted = false; } - Some(true) - } else { - None } + + if let Some(next_first) = communicate_back(arr, comm) { + sorted = *arr.last().unwrap() <= next_first; + } + + let mut global_sorted: bool = false; + comm.all_reduce_into(&sorted, &mut global_sorted, SystemOperation::logical_and()); + + global_sorted +} + +/// Redistribute an array via an all_to_all_varcount operation. +pub fn redistribute(arr: &[T], counts: &[i32]) { + todo!(); } /// Compute displacements from a vector of counts. From 8f0a5e88d9212af1afcea0dda91d8f14f6e13ea9 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 20 Sep 2024 18:39:05 +0100 Subject: [PATCH 17/42] WIP: Refactor --- examples/parsort.rs | 19 ++++--------------- src/parsort.rs | 24 +++--------------------- src/tools.rs | 44 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 45 insertions(+), 42 deletions(-) diff --git a/examples/parsort.rs b/examples/parsort.rs index 6361f6c..8a20706 100644 --- a/examples/parsort.rs +++ b/examples/parsort.rs @@ -1,13 +1,11 @@ //! Testing the hyksort component. 
-use bempp_octree::parsort::{array_to_root, parsort}; -use itertools::Itertools; +use bempp_octree::{parsort::parsort, tools::is_sorted_array}; use mpi::traits::Communicator; use rand::prelude::*; pub fn main() { let universe = mpi::initialize().unwrap(); let world = universe.world(); - let rank = world.rank() as u64; let n_per_rank = 1000; let mut rng = rand::rngs::StdRng::seed_from_u64(0); @@ -18,20 +16,11 @@ pub fn main() { arr.push(rng.gen()); } - // let splitters = get_splitters(&arr, &world, &mut rng); - - // let bin_displs = get_bin_displacements(&arr, &splitters); - let arr = parsort(&arr, &world, &mut rng); - let arr = array_to_root(&arr, &world); - if rank == 0 { - let arr = arr.unwrap(); + assert!(is_sorted_array(&arr, &world)); - for (elem1, elem2) in arr.iter().tuple_windows() { - assert!(elem1 <= elem2); - } - println!("Sorted {} elements.", arr.len()); - println!("Finished."); + if world.rank() == 0 { + println!("Array is sorted."); } } diff --git a/src/parsort.rs b/src/parsort.rs index 93c9868..073d592 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -12,7 +12,7 @@ use mpi::{ }; use rand::{seq::SliceRandom, Rng}; -use crate::tools::displacements; +use crate::tools::{displacements, gather_to_all}; const OVERSAMPLING: usize = 8; @@ -162,27 +162,9 @@ where .copied() .collect::>(); - // We use an all_gatherv so that each process receives all splitters. - // For that we first communicate how many splitters each process has - // and then we send the splitters themselves. + // We gather the splitters into all ranks so that each rank has all splitters. - let nsplitters = splitters.len(); - let mut splitters_per_rank = vec![0_usize; size]; - - comm.all_gather_into(&nsplitters, &mut splitters_per_rank); - - // We now know how many splitters each process has. We now create space - // for the splitters and send them all around. - - let n_all_splitters = splitters_per_rank.iter().sum(); - - let mut all_splitters = vec![Default::default(); n_all_splitters]; - let splitters_per_rank = splitters_per_rank.iter().map(|&x| x as i32).collect_vec(); - - let displs = displacements(&splitters_per_rank); - - let mut partition = PartitionMut::new(&mut all_splitters[..], splitters_per_rank, &displs[..]); - comm.all_gather_varcount_into(&splitters, &mut partition); + let mut all_splitters = gather_to_all(&splitters, comm); // We now have all splitters available on each process. // We can now sort the splitters. Every process will then have the same list of sorted splitters. diff --git a/src/tools.rs b/src/tools.rs index 0811629..baa41a3 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -1,13 +1,14 @@ //! Utility routines. -use std::mem::MaybeUninit; - use itertools::Itertools; use mpi::{ collective::SystemOperation, - datatype::PartitionMut, + datatype::{Partition, PartitionMut}, point_to_point as p2p, - traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source}, + traits::{ + CommunicatorCollectives, Destination, Equivalence, PartitionedBuffer, PartitionedBufferMut, + Root, Source, + }, }; /// Gather array to all processes @@ -140,6 +141,10 @@ pub fn is_sorted_array( } } + if comm.size() == 1 { + return sorted; + } + if let Some(next_first) = communicate_back(arr, comm) { sorted = *arr.last().unwrap() <= next_first; } @@ -151,8 +156,35 @@ pub fn is_sorted_array( } /// Redistribute an array via an all_to_all_varcount operation. 
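+/// A sketch of the intended semantics, based on the implementation that follows:
+/// `counts` has one entry per rank, `counts[i]` is the number of local elements to be
+/// sent to rank `i` (so the counts are expected to sum to `arr.len()`), and the
+/// returned vector holds the elements received from all ranks, ordered by sending rank.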
-pub fn redistribute(arr: &[T], counts: &[i32]) { - todo!(); +pub fn redistribute( + arr: &[T], + counts: &[i32], + comm: &C, +) -> Vec { + assert_eq!(counts.len(), comm.size() as usize); + + // First send the counts around via an alltoall operation. + + let mut recv_counts = vec![0 as i32; counts.len()]; + + comm.all_to_all_into(&counts[..], &mut recv_counts); + + // We have the recv_counts. Allocate space and setup the partitions. + + let nelems = recv_counts.iter().sum::() as usize; + + let mut output = Vec::::with_capacity(nelems); + let out_buf: &mut [T] = unsafe { std::mem::transmute(output.spare_capacity_mut()) }; + + let send_partition = Partition::new(arr, counts, displacements(counts)); + let mut recv_partition = + PartitionMut::new(out_buf, &recv_counts[..], displacements(&recv_counts)); + + comm.all_to_all_varcount_into(&send_partition, &mut recv_partition); + + unsafe { output.set_len(nelems) }; + + output } /// Compute displacements from a vector of counts. From 44372e3ebdc23a215125daea90ed77b02507b9dc Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 21 Sep 2024 17:10:25 +0100 Subject: [PATCH 18/42] WIP: Parallel tests --- examples/mpi_global_bounding_box.rs | 18 ++-- src/octree.rs | 144 +++++----------------------- src/parsort.rs | 32 +------ 3 files changed, 38 insertions(+), 156 deletions(-) diff --git a/examples/mpi_global_bounding_box.rs b/examples/mpi_global_bounding_box.rs index 28748b2..330a168 100644 --- a/examples/mpi_global_bounding_box.rs +++ b/examples/mpi_global_bounding_box.rs @@ -1,7 +1,8 @@ //! Test the computation of a global bounding box across MPI ranks. -use bempp_octree::octree::compute_global_bounding_box; -use mpi::traits::*; +use bempp_octree::{ + geometry::PhysicalBox, octree::compute_global_bounding_box, tools::gather_to_root, +}; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -15,10 +16,6 @@ pub fn main() { // Initialise a seeded Rng. let mut rng = ChaCha8Rng::seed_from_u64(2); - // Get the rank and size - let rank = comm.rank(); - let size = comm.size(); - // Create `npoints` per rank. let npoints = 10; @@ -33,4 +30,13 @@ pub fn main() { // Compute the distributed bounding box. let bounding_box = compute_global_bounding_box(&points, &comm); + + // Copy all points to root and compare local bounding box there. + + if let Some(points_root) = gather_to_root(&points, &comm) { + // Compute the bounding box on root. + + let expected = PhysicalBox::from_points(&points_root); + assert_eq!(expected.coordinates(), bounding_box.coordinates()); + } } diff --git a/src/octree.rs b/src/octree.rs index a3caa64..fe21431 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -7,11 +7,10 @@ use crate::{ geometry::PhysicalBox, morton::MortonKey, parsort::parsort, - tools::gather_to_all, + tools::{communicate_back, gather_to_all, redistribute}, }; use mpi::{ - datatype::{Partition, PartitionMut}, point_to_point as p2p, traits::{Root, Source}, }; @@ -133,19 +132,6 @@ pub fn points_to_morton( .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) .collect_vec(); - // Now want to get weighted Morton keys. We use a HashMap. - - let mut value_counts = HashMap::::new(); - - for key in &keys { - *value_counts.entry(*key).or_insert(0) += 1; - } - - // let weighted_keys = value_counts - // .iter() - // .map(|(&key, &weight)| WeightedMortonKey::new(key, weight)) - // .collect_vec(); - (keys, bounding_box) } @@ -295,15 +281,9 @@ pub fn redistribute_with_respect_to_coarse_tree( // defines bins in which we sort our keys. 
The keys are then sent around to the correct // processes via an alltoallv operation. - let my_first = *coarse_tree.first().unwrap(); - - let mut global_bins = Vec::::with_capacity(size as usize); - let global_bins_buff: &mut [MortonKey] = - unsafe { std::mem::transmute(global_bins.spare_capacity_mut()) }; - - comm.all_gather_into(&my_first, global_bins_buff); + let my_first = coarse_tree.first().unwrap(); - unsafe { global_bins.set_len(size as usize) }; + let mut global_bins = gather_to_all(std::slice::from_ref(my_first), comm); // We now have the first index from each process. We also want // an upper bound for the last index of the tree to make the sorting into @@ -321,41 +301,9 @@ pub fn redistribute_with_respect_to_coarse_tree( .map(|&elem| elem as i32) .collect_vec(); - // We now have the counts for each rank. Let's send it around via alltoallv. - - let mut counts_from_proc = vec![0 as i32; size as usize]; - - comm.all_to_all_into(&rank_counts, &mut counts_from_proc); - // Now compute the send and receive displacements. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = rank_counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp as i32) - }) - .collect(); - - let send_partition = Partition::new(&sorted_keys[..], &rank_counts[..], &send_displs[..]); + // We now have the counts for each rank. Let's redistribute accordingly and return. - let mut recvbuffer = vec![MortonKey::default(); counts_from_proc.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_proc - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_proc, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); - - recvbuffer + redistribute(&sorted_keys, &rank_counts, comm) } /// Create bins from sorted keys. @@ -471,43 +419,28 @@ pub fn linearize( let mut result = Vec::::new(); - if rank == size - 1 { - comm.process_at_rank(rank - 1) - .send(sorted_keys.first().unwrap()); + let next_key = communicate_back(&sorted_keys, comm); - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } + // Treat the local keys + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); } + } + // If we are at the last process simply push the last key. + // Otherwise check whether it might be the ancestor of `next_key`, + // the first key on the next process. If yes, don't push it. Otherwise do. + + if rank == size - 1 { result.push(*sorted_keys.last().unwrap()); } else { - let (other, _status) = if rank > 0 { - p2p::send_receive( - sorted_keys.first().unwrap(), - &comm.process_at_rank(rank - 1), - &comm.process_at_rank(rank + 1), - ) - } else { - comm.any_process().receive::() - }; - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } - } - let last = *sorted_keys.last().unwrap(); - - if !last.is_ancestor(other) { - result.push(last) + if !last.is_ancestor(next_key.unwrap()) { + result.push(last); } } @@ -603,7 +536,6 @@ pub fn partition( // then send the actual data. 
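+    // Illustration of the binning computed above (hypothetical numbers): with
+    // total_weight = 10 on 3 ranks we get w = 3 and k = 1, so the scanned weights are
+    // assigned as [0, 4) to rank 0, [4, 7) to rank 1 and [7, 10] to rank 2; the hash
+    // map stores, for each destination rank, the keys whose scanned weight falls into
+    // that rank's range.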
let mut counts = vec![0 as i32; size as usize]; - let mut counts_from_processor = vec![0 as i32; size as usize]; let mut all_elements = Vec::::new(); for (index, c) in counts.iter_mut().enumerate() { @@ -612,39 +544,7 @@ pub fn partition( all_elements.extend(elements.iter()) } - // Send around the number of elements for each process - comm.all_to_all_into(&counts, &mut counts_from_processor); - - // We have the number of elements for each process now. Now send around - // the actual elements. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp as i32) - }) - .collect(); - - let send_partition = Partition::new(&all_elements, &counts[..], &send_displs[..]); - - let mut recvbuffer = - vec![MortonKey::default(); counts_from_processor.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_processor - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + let mut recvbuffer = redistribute(&all_elements, &counts, comm); recvbuffer.sort_unstable(); recvbuffer diff --git a/src/parsort.rs b/src/parsort.rs index 073d592..49d48ca 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -5,14 +5,11 @@ use std::mem::offset_of; use itertools::Itertools; use mpi::datatype::{UncommittedDatatypeRef, UncommittedUserDatatype, UserDatatype}; +use mpi::traits::CommunicatorCollectives; use mpi::traits::Equivalence; -use mpi::{ - datatype::{Partition, PartitionMut}, - traits::CommunicatorCollectives, -}; use rand::{seq::SliceRandom, Rng}; -use crate::tools::{displacements, gather_to_all}; +use crate::tools::{gather_to_all, redistribute}; const OVERSAMPLING: usize = 8; @@ -297,30 +294,9 @@ pub fn parsort .map(|&elem| elem as i32) .collect::>(); - // We now do an all_to_allv to communicate the array elements to the right processors. + // We can now redistribute the array across the processors. - // First we need to communicate how many elements everybody gets from each processor. - - let mut counts_from_processor = vec![0_i32; size]; - - comm.all_to_all_into(&counts, &mut counts_from_processor); - - // Each processor now knows how much he gets from all the others. - - // We can now send around the actual elements with an alltoallv. - - let send_displs = displacements(&counts); - - let send_partition = Partition::new(&arr, counts, &send_displs[..]); - - let mut recvbuffer = - vec![UniqueItem::default(); counts_from_processor.iter().sum::() as usize]; - - let recv_displs = displacements(&counts_from_processor); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + let mut recvbuffer = redistribute(&arr, &counts, comm); // We now have everything in the receive buffer. 
Now sort the local elements and return From fc8e63d343995ee856d966bd44a9167e7a218031 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 21 Sep 2024 19:23:30 +0100 Subject: [PATCH 19/42] WIP: distribute_complete_tree_test --- examples/mpi_complete_tree.rs | 38 +++++++++++++++++++++ examples/parallel_tests.rs | 9 ++--- src/morton.rs | 62 +++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 examples/mpi_complete_tree.rs diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs new file mode 100644 index 0000000..e3fea61 --- /dev/null +++ b/examples/mpi_complete_tree.rs @@ -0,0 +1,38 @@ +//! Test the computation of a global bounding box across MPI ranks. + +use bempp_octree::{ + constants::DEEPEST_LEVEL, + geometry::PhysicalBox, + octree::{complete_tree, compute_global_bounding_box, points_to_morton}, + tools::gather_to_root, +}; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. + let mut rng = ChaCha8Rng::seed_from_u64(2); + + // Create `npoints` per rank. + let npoints = 3; + + // Generate random points. + + let mut points = Vec::::with_capacity(3 * npoints); + + for _ in 0..3 * npoints { + points.push(rng.gen()); + } + + // Compute the Morton keys on the deepest level + let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); + + // Generate a complete tree + let distributed_complete_tree = complete_tree(&keys, &mut rng, &comm); +} diff --git a/examples/parallel_tests.rs b/examples/parallel_tests.rs index e3555e9..9d841ea 100644 --- a/examples/parallel_tests.rs +++ b/examples/parallel_tests.rs @@ -1,15 +1,16 @@ //! Testing the hyksort component. use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; use bempp_octree::morton::MortonKey; -use bempp_octree::octree::{block_partition, is_sorted_array, linearize, partition}; -use bempp_octree::parsort::{array_to_root, parsort}; +use bempp_octree::octree::{block_partition, linearize, partition}; +use bempp_octree::parsort::parsort; +use bempp_octree::tools::gather_to_root; use itertools::{izip, Itertools}; use mpi::traits::*; use rand::prelude::*; pub fn assert_linearized(arr: &Vec, comm: &C) { // Check that the keys are still linearized. - let arr = array_to_root(&arr, comm); + let arr = gather_to_root(&arr, comm); if comm.rank() == 0 { let arr = arr.unwrap(); @@ -130,7 +131,7 @@ pub fn test_coarse_partition(rng: &mut R, co partitioned_tree.0.len() ); - let arr = array_to_root(&partitioned_tree.0, comm); + let arr = gather_to_root(&partitioned_tree.0, comm); if rank == 0 { let arr = arr.unwrap(); diff --git a/src/morton.rs b/src/morton.rs index 8e66bfb..eb9d49e 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -466,6 +466,33 @@ impl MortonKey { key } + /// Return the next possible Morton key on the deepest level that is not a descendent of the current key. + /// + /// If the key is already the last possible key then return None. + pub fn next_non_descendent_key(&self) -> Option { + // If we are an ancestor of deepest_last we return None as then there + // is next key. + + if self.is_ancestor(MortonKey::deepest_last()) { + return None; + } + + let level = self.level() as u64; + + let level_diff = DEEPEST_LEVEL - level; + let shift = LEVEL_DISPLACEMENT + 3 * level_diff; + + // Need to know which sibling we are. 
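+        // Each level of a key occupies three bits above LEVEL_DISPLACEMENT, so shifting
+        // by `shift` and reducing modulo 8 extracts the octant index of this key within
+        // its parent.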
+ let child_index = ((self.value >> shift) % 8) as usize; + // If we are between 0 and 6 take the next sibling and go to deepest level. + if child_index < 7 { + Some(MortonKey::new(self.value + (1 << shift) + level_diff)) + } else { + // If we are the last child go to the parent and take next key from there. + self.parent().next_non_descendent_key() + } + } + /// Linearize by sorting and removing overlaps. pub fn linearize(keys: &[MortonKey]) -> Vec { let mut new_keys = Vec::::new(); @@ -1315,4 +1342,39 @@ mod test { ) ); } + + #[test] + pub fn test_next_nondescendent_key() { + let key = MortonKey::from_index_and_level([25, 17, 6], 5); + + let children = key.children(); + + // Check the next nondescendent key for the first six children + + for (child, next_child) in children.iter().tuple_windows() { + let next_key = child.next_non_descendent_key().unwrap(); + assert_eq!(next_key.level(), DEEPEST_LEVEL as usize); + assert!(!child.is_ancestor(next_key)); + assert!(next_child.is_ancestor(next_key)); + } + + // Now check the next nondescendent key from the last child. + + let next_child = children.last().unwrap().next_non_descendent_key(); + + // Check that the next nondescendent key from the parent is the same as that of the last child. + + assert_eq!(key.next_non_descendent_key(), next_child); + + // Check that it is not a descendent of the parent and that its level is correct. + + assert_eq!(next_child.unwrap().level(), DEEPEST_LEVEL as usize); + assert!(!key.is_ancestor(next_child.unwrap())); + + // Finally make sure that an ancestor of deepest last returns None. + + assert!(MortonKey::deepest_last() + .next_non_descendent_key() + .is_none()); + } } From 25c0e84dc6ef1742b6b03ff0146344e0f56aacbe Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 21 Sep 2024 20:49:45 +0100 Subject: [PATCH 20/42] WIP: Test complete tree --- examples/mpi_complete_tree.rs | 19 ++++++++-- src/octree.rs | 70 +++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index e3fea61..d3bd214 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -3,9 +3,12 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, geometry::PhysicalBox, - octree::{complete_tree, compute_global_bounding_box, points_to_morton}, + octree::{ + complete_tree, compute_global_bounding_box, is_complete_linear_tree, points_to_morton, + }, tools::gather_to_root, }; +use mpi::traits::*; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -17,10 +20,10 @@ pub fn main() { let comm = universe.world(); // Initialise a seeded Rng. - let mut rng = ChaCha8Rng::seed_from_u64(2); + let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); // Create `npoints` per rank. - let npoints = 3; + let npoints = 10; // Generate random points. 
@@ -33,6 +36,16 @@ pub fn main() { // Compute the Morton keys on the deepest level let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); + assert!(!is_complete_linear_tree(&keys, &comm)); + // Generate a complete tree let distributed_complete_tree = complete_tree(&keys, &mut rng, &comm); + + let is_complete_linear = is_complete_linear_tree(&distributed_complete_tree, &comm); + + assert!(is_complete_linear); + + if comm.rank() == 0 { + println!("Distributed tree is complete and linear."); + } } diff --git a/src/octree.rs b/src/octree.rs index fe21431..aaa01a0 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -625,3 +625,73 @@ pub fn complete_tree( result } + +/// Return true on all ranks if distributed tree is complete. Otherwise, return false. +pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { + // First check that the local tree on each node is complete. + + let mut complete_linear = true; + for (key1, key2) in arr.iter().tuple_windows() { + // Make sure that the keys are sorted and not duplicated. + if key1 >= key2 { + complete_linear = false; + break; + } + // The next key should be an ancestor of the next non-descendent key. + if let Some(expected_next) = key1.next_non_descendent_key() { + if !key2.is_ancestor(expected_next) { + complete_linear = false; + break; + } + } else { + // Only for the very last key there should not be a next non-descendent key. + complete_linear = false; + } + } + + // We now check the interfaces. + + if let Some(next_first) = communicate_back(arr, comm) { + // We are on any but the last rank + let last_key = arr.last().unwrap(); + + // Check that the keys are sorted and not duplicated. + if *last_key >= next_first { + complete_linear = false; + } + + // Check that the next key is an encestor of the next non-descendent. + if let Some(expected_next) = last_key.next_non_descendent_key() { + if !next_first.is_ancestor(expected_next) { + complete_linear = false; + } + } else { + complete_linear = false; + } + } else { + // We are on the last rank + // Check that the last key is ancestor of deepest last. + if !arr.last().unwrap().is_ancestor(MortonKey::deepest_last()) { + complete_linear = false; + } + } + + // Now check that at the first rank we include the deepest first. + + if comm.rank() == 0 { + if !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { + complete_linear = false; + } + } + + // Now communicate everything together. + + let mut result = false; + comm.all_reduce_into( + &complete_linear, + &mut result, + SystemOperation::logical_and(), + ); + + result +} From ccb8be5b73f3103938ea7e7a5ecf244cb6399a8b Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 11:30:22 +0100 Subject: [PATCH 21/42] WIP: Better binning --- Cargo.toml | 1 + examples/mpi_cumsum.rs | 66 ++++++++++++++++++++ src/octree.rs | 21 ++----- src/tools.rs | 135 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 205 insertions(+), 18 deletions(-) create mode 100644 examples/mpi_cumsum.rs diff --git a/Cargo.toml b/Cargo.toml index 3c3fc1a..bff73c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ crate-type = ["cdylib", "lib"] itertools = "0.13.*" rand = { version = "0.8.5", features = ["alloc"] } rand_chacha = "0.3.*" +num = "0.4.*" bytemuck = "1.*" vtkio = "0.6.*" mpi = {version = "0.8.*", features = ["derive", "user-operations"] } diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs new file mode 100644 index 0000000..0c59245 --- /dev/null +++ b/examples/mpi_cumsum.rs @@ -0,0 +1,66 @@ +//! 
Test the computation of a global bounding box across MPI ranks. + +use bempp_octree::{ + geometry::PhysicalBox, + octree::compute_global_bounding_box, + tools::{gather_to_root, global_inclusive_cumsum}, +}; +use itertools::{izip, Itertools}; +use mpi::traits::*; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. + let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); + + // Create `npoints` per rank. + let nelems = 10; + + // Generate random numbers + + let mut elems = Vec::::with_capacity(3 * nelems); + + for _ in 0..nelems { + elems.push(rng.gen_range(0..100)); + } + + // Compute the cumulative sum. + + let global_cum_sum = global_inclusive_cumsum(&elems, &comm); + + // Copy array to root and compare with inclusive scan there. + + if let (Some(cum_sum_root), Some(original_array)) = ( + gather_to_root(&global_cum_sum, &comm), + gather_to_root(&elems, &comm), + ) { + // Scan on root + + let expected_cum_sum = original_array + .iter() + .scan(0, |state, x| { + *state = *x + *state; + Some(*state) + }) + .collect_vec(); + + // Check that the first element is not modified (inclusive cumsum) + assert_eq!( + original_array.first().unwrap(), + cum_sum_root.first().unwrap() + ); + + for (actual, expected) in izip!(cum_sum_root.iter(), expected_cum_sum.iter()) { + assert_eq!(*actual, *expected); + } + + println!("Cumulative sum computed."); + } +} diff --git a/src/octree.rs b/src/octree.rs index aaa01a0..0371f3f 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -7,7 +7,7 @@ use crate::{ geometry::PhysicalBox, morton::MortonKey, parsort::parsort, - tools::{communicate_back, gather_to_all, redistribute}, + tools::{communicate_back, gather_to_all, global_inclusive_cumsum, redistribute}, }; use mpi::{ @@ -469,19 +469,9 @@ pub fn partition( // of each array to get the global sums and then we update the array of each rank // with the sum from the previous ranks. - let mut scan: Vec = weights - .iter() - .scan(0, |state, x| { - *state += *x; - Some(*state) - }) - .collect_vec(); - let scan_last = *scan.last().unwrap(); - let mut scan_result: usize = 0; - comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); - for elem in &mut scan { - *elem += scan_result; - } + let scan = global_inclusive_cumsum(&weights, comm); + + // Now broadcast the total weight to all processes. let mut total_weight = if rank == size - 1 { *scan.last().unwrap() @@ -489,9 +479,6 @@ pub fn partition( 0 }; - // Scan the weight (form cumulative sums) and broadcast the total weight (last entry on last process) - // to all other processes. - comm.process_at_rank(size - 1) .broadcast_into(&mut total_weight); diff --git a/src/tools.rs b/src/tools.rs index baa41a3..46c9514 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -1,6 +1,6 @@ //! Utility routines. -use itertools::Itertools; +use itertools::{izip, Itertools}; use mpi::{ collective::SystemOperation, datatype::{Partition, PartitionMut}, @@ -10,6 +10,7 @@ use mpi::{ Root, Source, }, }; +use num::traits::Zero; /// Gather array to all processes pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { @@ -187,6 +188,119 @@ pub fn redistribute( output } +/// Perform a global inclusive cumulative sum operation. +/// +/// For the array `[1, 3, 5, 7]` the output will be `[1, 4, 9, 16]`. 
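+/// The scan is global: each rank's local cumulative sums are offset by the totals of
+/// all lower ranks, so concatenating the per-rank results in rank order yields the
+/// inclusive cumulative sum of the whole distributed array.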
+pub fn global_inclusive_cumsum( + arr: &[T], + comm: &C, +) -> Vec { + let mut scan: Vec = arr + .iter() + .scan(::zero(), |state, x| { + *state = *x + *state; + Some(*state) + }) + .collect_vec(); + let scan_last = *scan.last().unwrap(); + let mut scan_result = T::zero(); + comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); + for elem in &mut scan { + *elem = *elem + scan_result; + } + + scan +} + +/// Distribute a sorted sequence into bins. +/// +/// For an array with n elements to be distributed into p bins, +/// the array `bins` has p elements. The bins are defined by half-open intervals +/// of the form [b_j, b_{j+1})). The final bin is the half-open interval [b_{p-1}, \infty). +/// It is assumed that the bins and the elements are both sorted sequences and that +/// every element has an associated bin. +/// The function returns a p element array with the counts of how many elements go to each bin. +/// Since the sequence is sorted this fully defines what element goes into which bin. +pub fn sort_to_bins(sorted_keys: &[T], bins: &[T]) -> Vec { + let nbins = bins.len(); + + // Make sure that the smallest element of the sorted keys fits into the bins. + assert!(bins.first().unwrap() <= sorted_keys.first().unwrap()); + + // Deal with the special case that there is only one bin. + // This means that all elements are in the one bin. + if nbins == 1 { + return vec![sorted_keys.len(); 1]; + } + + let mut bin_counts = vec![0 as usize; nbins]; + + // This iterates over each possible bin and returns also the associated rank. + // The last bin position is not iterated over since for an array with p elements + // there are p-1 tuple windows. + let mut bin_iter = izip!( + bin_counts.iter_mut(), + bins.iter().tuple_windows::<(&T, &T)>(), + ); + + // We take the first element of the bin iterator. There will always be at least one since + // there are at least two bins (an actual one, and the last half infinite one) + let mut r: &mut usize; + let mut bin_start: &T; + let mut bin_end: &T; + (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); + + let mut count = 0; + 'outer: for key in sorted_keys.iter() { + if bin_start <= key && key < bin_end { + *r += 1; + count += 1; + } else { + // Move the bin forward until it fits. There will always be a fitting bin. + loop { + if let Some((rn, (bsn, ben))) = bin_iter.next() { + if bsn <= key && key < ben { + // We have found the next fitting bin for our current element. + // Can register it and go back to the outer for loop. + *rn += 1; + r = rn; + bin_start = bsn; + bin_end = ben; + count += 1; + break; + } + } else { + // We have no more fitting bin. So break the outer loop. + break 'outer; + } + } + } + } + + // We now have everything but the last bin. Just bunch the remaining elements to + // the last count. + *bin_counts.last_mut().unwrap() = sorted_keys.len() - count; + + bin_counts +} + +/// Redistribute locally sorted keys with respect to bins. +/// +/// - The array `sorted_keys` is assumed to be sorted within each process. It needs not be globally sorted. +/// - If there are `r` ranks in the communicator, the size of `bins` must be `r`. +/// - The bins are defined through half-open intervals (bin[0], bin[1]), .... This defines r-1 bins. The +/// last bin is the half-open interval [bin[r-1], \infty). +/// - All array elements must be larger or equal bin[0]. This means that each element can be sorted into a bin. 
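+/// As an illustration, with three ranks and bins `[b0, b1, b2]`, keys in `[b0, b1)`
+/// end up on rank 0, keys in `[b1, b2)` on rank 1, and all keys `>= b2` on rank 2.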
+pub fn redistribute_by_bins( + sorted_keys: &[T], + bins: &[T], + comm: &C, +) -> Vec { + let counts = sort_to_bins(sorted_keys, bins); + let counts = counts.iter().map(|elem| *elem as i32).collect_vec(); + redistribute(sorted_keys, &counts, comm) +} + /// Compute displacements from a vector of counts. /// /// This is useful for global MPI varcount operations. Let @@ -202,3 +316,22 @@ pub fn displacements(counts: &[i32]) -> Vec { }) .collect() } + +#[cfg(test)] +mod test { + use itertools::Itertools; + + use super::sort_to_bins; + + #[test] + fn test_sort_to_bins() { + let elems = (0..100).collect_vec(); + let bins = [0, 17, 55]; + + let counts = sort_to_bins(&elems, &bins); + + assert_eq!(counts[0], 17); + assert_eq!(counts[1], 38); + assert_eq!(counts[2], 45); + } +} From 8a3d6e5274986eb25cd28d2d46fa517d3b178006 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 12:11:42 +0100 Subject: [PATCH 22/42] Custom min max for MPI --- src/morton.rs | 13 ---- src/parsort.rs | 176 +++++-------------------------------------------- src/tools.rs | 30 +++++++++ 3 files changed, 48 insertions(+), 171 deletions(-) diff --git a/src/morton.rs b/src/morton.rs index eb9d49e..43e4aa9 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -6,7 +6,6 @@ use crate::constants::{ Y_LOOKUP_ENCODE, Z_LOOKUP_DECODE, Z_LOOKUP_ENCODE, }; use crate::geometry::PhysicalBox; -use crate::parsort::{MaxValue, MinValue}; use itertools::izip; use itertools::Itertools; use mpi::traits::Equivalence; @@ -27,18 +26,6 @@ impl Default for MortonKey { } } -impl MinValue for MortonKey { - fn min_value() -> Self { - MortonKey::root() - } -} - -impl MaxValue for MortonKey { - fn max_value() -> Self { - MortonKey::deepest_last() - } -} - impl MortonKey { /// Create a new Morton key. Users should use `[MortonKey::from_index_and_level].` fn new(value: u64) -> Self { diff --git a/src/parsort.rs b/src/parsort.rs index 49d48ca..f38e469 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -9,41 +9,18 @@ use mpi::traits::CommunicatorCollectives; use mpi::traits::Equivalence; use rand::{seq::SliceRandom, Rng}; -use crate::tools::{gather_to_all, redistribute}; +use crate::tools::{gather_to_all, global_max, global_min, redistribute_by_bins}; const OVERSAMPLING: usize = 8; /// Sortable trait that each type fed into parsort needs to satisfy. pub trait ParallelSortable: - MinValue - + MaxValue - + Equivalence - + Copy - + Clone - + Default - + PartialEq - + Eq - + PartialOrd - + Ord - + Display - + Sized + Equivalence + Copy + Clone + PartialEq + Eq + PartialOrd + Ord + Display + Sized { } -impl< - T: MinValue - + MaxValue - + Equivalence - + Copy - + Clone - + Default - + PartialEq - + Eq - + PartialOrd - + Ord - + Display - + Sized, - > ParallelSortable for T +impl + ParallelSortable for T { } @@ -88,18 +65,6 @@ unsafe impl Equivalence for UniqueItem { } } -/// Return the minimum possible value of a type. -pub trait MinValue { - /// Return the min value. - fn min_value() -> Self; -} - -/// Return the maximum possible value of a type. -pub trait MaxValue { - /// Return the max value. 
- fn max_value() -> Self; -} - impl Display for UniqueItem { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -110,18 +75,6 @@ impl Display for UniqueItem { } } -impl MinValue for UniqueItem { - fn min_value() -> Self { - UniqueItem::new(::min_value(), 0, 0) - } -} - -impl MaxValue for UniqueItem { - fn max_value() -> Self { - UniqueItem::new(::max_value(), 0, 0) - } -} - impl UniqueItem { pub fn new(value: T, rank: usize, index: usize) -> Self { Self { value, rank, index } @@ -151,8 +104,13 @@ where OVERSAMPLING }; - // We are choosing unique splitters that neither contain - // zero nor u64::max. + // We get the global smallest and global largest element. We do not want those + // in the splitter so filter out their occurence. + + let global_min_elem = global_min(arr, comm); + let global_max_elem = global_max(arr, comm); + + // We do not want the global smallest element in the splitter. let splitters = arr .choose_multiple(rng, oversampling) @@ -171,91 +129,25 @@ where // We now insert the smallest and largest possible element if they are not already // in the splitter collection. - if *all_splitters.first().unwrap() != UniqueItem::min_value() { - all_splitters.insert(0, UniqueItem::min_value()) + if *all_splitters.first().unwrap() != global_min_elem { + all_splitters.insert(0, global_min_elem) } - if *all_splitters.last().unwrap() != UniqueItem::max_value() { - all_splitters.push(UniqueItem::max_value()); + if *all_splitters.last().unwrap() != global_max_elem { + all_splitters.push(global_max_elem); } // We now define p buckets (p is number of processors) and we return - // a p + 1 element array containing the first element of each bucket - // concluded with the largest possible element. + // a p element array containing the first element of each bucket all_splitters = split(&all_splitters, size) .map(|slice| slice.first().unwrap()) .copied() .collect::>(); - all_splitters.push(UniqueItem::max_value()); all_splitters } -fn get_counts(arr: &[UniqueItem], buckets: &[UniqueItem]) -> Vec { - // The following array will store the counts for each bucket. - - let mut counts = vec![0_usize; buckets.len() - 1]; - - // We are iterating through the array. Whenever an element is larger or equal than - // the current splitter we store the current position in `bin_displs` and advance `splitter_iter` - // by 1. - - // In the following iterator we skip the first bin displacement position as this must be the default - // zero (start of the bins). - - // Note that bucket iterator has as many elements as counts as the tuple_windows has length - // 1 smaller than the original array length. - let mut bucket_iter = buckets.iter().tuple_windows::<(_, _)>(); - - // We skip the first element as this is always zero. - let mut count_iter = counts.iter_mut(); - - let mut count: usize = 0; - let mut current_count = count_iter.next().unwrap(); - - let (mut first, mut last) = bucket_iter.next().unwrap(); - - for elem in arr { - // The test after the or sorts out the case that our set includes the maximum possible - // item and we are in the last bucket. The biggest item should be counted as belonging - // to the bucket. - if (first <= elem && elem < last) - || (*last == UniqueItem::max_value() && *elem == UniqueItem::max_value()) - { - // Element is in the right bucket. - count += 1; - continue; - } else { - // Element is not in the right bucket. - // Store counts and find the correct bucket. 
- *current_count = count; - loop { - (first, last) = bucket_iter.next().unwrap(); - current_count = count_iter.next().unwrap(); - if (first <= elem && elem < last) - || (*last == UniqueItem::max_value() && *elem == UniqueItem::max_value()) - { - break; - } - } - // Now have the right bucket. Reset count and continue. - count = 1; - } - } - - // Need to store the count for the last bucket in the iterator. - // This is always necessary as last iterator is half open interval. - // So we don't go into the else part of the for loop. - - *current_count = count; - - // We don't need to fill the remaining counts entries with zero - // since the array is already initialized with zero. - - counts -} - /// Parallel sort pub fn parsort( arr: &[T], @@ -287,16 +179,8 @@ pub fn parsort let buckets = get_buckets(&arr, comm, rng); - // We now compute how many elements of our array go into each bucket. - - let counts = get_counts(&arr, &buckets) - .iter() - .map(|&elem| elem as i32) - .collect::>(); - - // We can now redistribute the array across the processors. - - let mut recvbuffer = redistribute(&arr, &counts, comm); + // We now redistribute with respect to these buckets. + let mut recvbuffer = redistribute_by_bins(&arr, &buckets, comm); // We now have everything in the receive buffer. Now sort the local elements and return @@ -336,27 +220,3 @@ impl<'a, T> Iterator for Split<'a, T> { Some(chunk) } } - -macro_rules! impl_min_max_value { - ($type:ty) => { - impl MinValue for $type { - fn min_value() -> Self { - <$type>::MIN - } - } - - impl MaxValue for $type { - fn max_value() -> Self { - <$type>::MAX - } - } - }; -} - -impl_min_max_value!(usize); -impl_min_max_value!(i8); -impl_min_max_value!(i32); -impl_min_max_value!(i64); -impl_min_max_value!(u8); -impl_min_max_value!(u32); -impl_min_max_value!(u64); diff --git a/src/tools.rs b/src/tools.rs index 46c9514..46415fd 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -105,6 +105,36 @@ pub fn global_size(arr: &[T], comm: &C) -> usize global_size } +/// Get the maximum value across all ranks +pub fn global_max( + arr: &[T], + comm: &C, +) -> T { + let local_max = arr.iter().max().unwrap(); + + // Just need to initialize global_max with something. + let mut global_max = *local_max; + + comm.all_reduce_into(local_max, &mut global_max, SystemOperation::max()); + + global_max +} + +/// Get the minimum value across all ranks +pub fn global_min( + arr: &[T], + comm: &C, +) -> T { + let local_min = arr.iter().min().unwrap(); + + // Just need to initialize global_min with something. + let mut global_min = *local_min; + + comm.all_reduce_into(local_min, &mut global_min, SystemOperation::min()); + + global_min +} + /// Communicate the first element of each local array back to the previous rank. 
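 ///
 /// Intended contract, sketched from how `complete_tree` in `src/octree.rs` uses it:
 /// on rank `r` the call returns `Some` with the first element of rank `r + 1`, while
 /// the last rank receives `None`.
 /// ```ignore
 /// if let Some(next_first) = communicate_back(&local_sorted, &comm) {
 ///     // Not the last rank: `next_first` is the smallest element of the next rank.
 ///     local_sorted.push(next_first);
 /// }
 /// ```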
pub fn communicate_back( arr: &[T], From 4eed598e6972d30ebe92afafcfbbfcf97952b1da Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 13:55:35 +0100 Subject: [PATCH 23/42] WIP: Fixing global min max --- Cargo.toml | 1 + examples/mpi_cumsum.rs | 2 +- src/tools.rs | 39 ++++++++++++++++++++++++++++++--------- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bff73c7..a75d666 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ num = "0.4.*" bytemuck = "1.*" vtkio = "0.6.*" mpi = {version = "0.8.*", features = ["derive", "user-operations"] } +once_cell = "*" [profile.release] debug = 1 diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs index 0c59245..08f8c97 100644 --- a/examples/mpi_cumsum.rs +++ b/examples/mpi_cumsum.rs @@ -25,7 +25,7 @@ pub fn main() { // Generate random numbers - let mut elems = Vec::::with_capacity(3 * nelems); + let mut elems = Vec::::with_capacity(nelems); for _ in 0..nelems { elems.push(rng.gen_range(0..100)); diff --git a/src/tools.rs b/src/tools.rs index 46415fd..a55ee5c 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -2,12 +2,13 @@ use itertools::{izip, Itertools}; use mpi::{ - collective::SystemOperation, - datatype::{Partition, PartitionMut}, + collective::{SystemOperation, UserOperation}, + datatype::{DynBuffer, Partition, PartitionMut}, point_to_point as p2p, + raw::AsRaw, traits::{ - CommunicatorCollectives, Destination, Equivalence, PartitionedBuffer, PartitionedBufferMut, - Root, Source, + AsDatatype, CommunicatorCollectives, Destination, Equivalence, PartitionedBuffer, + PartitionedBufferMut, Root, Source, }, }; use num::traits::Zero; @@ -115,7 +116,17 @@ pub fn global_max( // Just need to initialize global_max with something. let mut global_max = *local_max; - comm.all_reduce_into(local_max, &mut global_max, SystemOperation::max()); + comm.all_reduce_into( + local_max, + &mut global_max, + &UserOperation::commutative(|x, y| { + let x: &[T] = x.downcast().unwrap(); + let y: &mut [T] = y.downcast().unwrap(); + for (&x_i, y_i) in x.iter().zip(y) { + *y_i = x_i.max(*y_i); + } + }), + ); global_max } @@ -125,12 +136,22 @@ pub fn global_min( arr: &[T], comm: &C, ) -> T { - let local_min = arr.iter().min().unwrap(); + let local_min = *arr.iter().min().unwrap(); // Just need to initialize global_min with something. - let mut global_min = *local_min; - - comm.all_reduce_into(local_min, &mut global_min, SystemOperation::min()); + let mut global_min = local_min; + + comm.all_reduce_into( + &local_min, + &mut global_min, + &UserOperation::commutative(|x, y| { + let x: &[T] = x.downcast().unwrap(); + let y: &mut [T] = y.downcast().unwrap(); + for (&x_i, y_i) in x.iter().zip(y) { + *y_i = x_i.min(*y_i); + } + }), + ); global_min } From a6a2cb7731a5eff09834ccd0568f0bcd47c922bd Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 14:13:18 +0100 Subject: [PATCH 24/42] Specialised parsort for Morton keys --- examples/parsort.rs | 13 ++++----- src/parsort.rs | 68 ++++++++------------------------------------- src/tools.rs | 24 ++++++++++++++++ 3 files changed, 42 insertions(+), 63 deletions(-) diff --git a/examples/parsort.rs b/examples/parsort.rs index 8a20706..5de9c07 100644 --- a/examples/parsort.rs +++ b/examples/parsort.rs @@ -1,5 +1,8 @@ //! Testing the hyksort component. 
-use bempp_octree::{parsort::parsort, tools::is_sorted_array}; +use bempp_octree::{ + parsort::parsort, + tools::{generate_random_keys, is_sorted_array}, +}; use mpi::traits::Communicator; use rand::prelude::*; @@ -10,13 +13,9 @@ pub fn main() { let mut rng = rand::rngs::StdRng::seed_from_u64(0); - let mut arr = Vec::::new(); + let keys = generate_random_keys(n_per_rank, &mut rng); - for _ in 0..n_per_rank { - arr.push(rng.gen()); - } - - let arr = parsort(&arr, &world, &mut rng); + let arr = parsort(&keys, &world, &mut rng); assert!(is_sorted_array(&arr, &world)); diff --git a/src/parsort.rs b/src/parsort.rs index f38e469..7191783 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -1,71 +1,28 @@ //! Implementation of a parallel samplesort. use std::fmt::Display; -use std::mem::offset_of; use itertools::Itertools; -use mpi::datatype::{UncommittedDatatypeRef, UncommittedUserDatatype, UserDatatype}; use mpi::traits::CommunicatorCollectives; use mpi::traits::Equivalence; use rand::{seq::SliceRandom, Rng}; +use crate::morton::MortonKey; use crate::tools::{gather_to_all, global_max, global_min, redistribute_by_bins}; const OVERSAMPLING: usize = 8; -/// Sortable trait that each type fed into parsort needs to satisfy. -pub trait ParallelSortable: - Equivalence + Copy + Clone + PartialEq + Eq + PartialOrd + Ord + Display + Sized -{ -} - -impl - ParallelSortable for T -{ -} - /// An internal struct. We convert every array element /// into this struct. The idea is that this is guaranteed to be unique /// as it encodes not only the element but also its rank and index. -#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord)] -struct UniqueItem { - pub value: T, +#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Equivalence)] +struct UniqueItem { + pub value: MortonKey, pub rank: usize, pub index: usize, } -unsafe impl Equivalence for UniqueItem { - type Out = UserDatatype; - - // Depending on the MPI implementation the below offset needs - // to be an i64 or isize. If it is an i64 Clippy warns about - // a useless conversion. But this warning is MPI implementation - // dependent. So switch off here. 
- - #[allow(clippy::useless_conversion)] - fn equivalent_datatype() -> Self::Out { - UserDatatype::structured::( - &[1, 1, 1], - &[ - (offset_of!(UniqueItem, value) as i64) - .try_into() - .unwrap(), - (offset_of!(UniqueItem, rank) as i64).try_into().unwrap(), - (offset_of!(UniqueItem, index) as i64) - .try_into() - .unwrap(), - ], - &[ - UncommittedUserDatatype::contiguous(1, &::equivalent_datatype()) - .as_ref(), - usize::equivalent_datatype().into(), - usize::equivalent_datatype().into(), - ], - ) - } -} - -impl Display for UniqueItem { +impl Display for UniqueItem { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, @@ -75,22 +32,21 @@ impl Display for UniqueItem { } } -impl UniqueItem { - pub fn new(value: T, rank: usize, index: usize) -> Self { +impl UniqueItem { + pub fn new(value: MortonKey, rank: usize, index: usize) -> Self { Self { value, rank, index } } } -fn to_unique_item(arr: &[T], rank: usize) -> Vec> { +fn to_unique_item(arr: &[MortonKey], rank: usize) -> Vec { arr.iter() .enumerate() .map(|(index, &item)| UniqueItem::new(item, rank, index)) .collect() } -fn get_buckets(arr: &[UniqueItem], comm: &C, rng: &mut R) -> Vec> +fn get_buckets(arr: &[UniqueItem], comm: &C, rng: &mut R) -> Vec where - T: ParallelSortable, C: CommunicatorCollectives, R: Rng + ?Sized, { @@ -149,11 +105,11 @@ where } /// Parallel sort -pub fn parsort( - arr: &[T], +pub fn parsort( + arr: &[MortonKey], comm: &C, rng: &mut R, -) -> Vec { +) -> Vec { let size = comm.size() as usize; let rank = comm.rank() as usize; // If we only have a single rank simply sort the local array and return diff --git a/src/tools.rs b/src/tools.rs index a55ee5c..8f76b4c 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -12,6 +12,12 @@ use mpi::{ }, }; use num::traits::Zero; +use rand::Rng; + +use crate::{ + constants::{DEEPEST_LEVEL, LEVEL_SIZE}, + morton::MortonKey, +}; /// Gather array to all processes pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { @@ -352,6 +358,24 @@ pub fn redistribute_by_bins( redistribute(sorted_keys, &counts, comm) } +/// Generate random keys for testing. +pub fn generate_random_keys(nkeys: usize, rng: &mut R) -> Vec { + let mut result = Vec::::with_capacity(nkeys); + + let xindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + let yindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + let zindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + + for (xval, yval, zval) in izip!(xindices.iter(), yindices.iter(), zindices.iter()) { + result.push(MortonKey::from_index_and_level( + [xval, yval, zval], + DEEPEST_LEVEL as usize, + )); + } + + result +} + /// Compute displacements from a vector of counts. /// /// This is useful for global MPI varcount operations. 
Let From 2c2ed174762404a2925a1604e5360b710990eb42 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 14:14:40 +0100 Subject: [PATCH 25/42] Fixed warnings --- examples/mpi_complete_tree.rs | 6 +----- examples/mpi_cumsum.rs | 6 +----- examples/parallel_tests.rs | 3 +-- src/tools.rs | 8 ++------ 4 files changed, 5 insertions(+), 18 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index d3bd214..60a095a 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -2,11 +2,7 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, - geometry::PhysicalBox, - octree::{ - complete_tree, compute_global_bounding_box, is_complete_linear_tree, points_to_morton, - }, - tools::gather_to_root, + octree::{complete_tree, is_complete_linear_tree, points_to_morton}, }; use mpi::traits::*; use rand::prelude::*; diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs index 08f8c97..ab9e1b1 100644 --- a/examples/mpi_cumsum.rs +++ b/examples/mpi_cumsum.rs @@ -1,10 +1,6 @@ //! Test the computation of a global bounding box across MPI ranks. -use bempp_octree::{ - geometry::PhysicalBox, - octree::compute_global_bounding_box, - tools::{gather_to_root, global_inclusive_cumsum}, -}; +use bempp_octree::tools::{gather_to_root, global_inclusive_cumsum}; use itertools::{izip, Itertools}; use mpi::traits::*; use rand::prelude::*; diff --git a/examples/parallel_tests.rs b/examples/parallel_tests.rs index 9d841ea..38e35cb 100644 --- a/examples/parallel_tests.rs +++ b/examples/parallel_tests.rs @@ -1,8 +1,7 @@ //! Testing the hyksort component. use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; use bempp_octree::morton::MortonKey; -use bempp_octree::octree::{block_partition, linearize, partition}; -use bempp_octree::parsort::parsort; +use bempp_octree::octree::{block_partition, linearize}; use bempp_octree::tools::gather_to_root; use itertools::{izip, Itertools}; use mpi::traits::*; diff --git a/src/tools.rs b/src/tools.rs index 8f76b4c..075804a 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -3,13 +3,9 @@ use itertools::{izip, Itertools}; use mpi::{ collective::{SystemOperation, UserOperation}, - datatype::{DynBuffer, Partition, PartitionMut}, + datatype::{Partition, PartitionMut}, point_to_point as p2p, - raw::AsRaw, - traits::{ - AsDatatype, CommunicatorCollectives, Destination, Equivalence, PartitionedBuffer, - PartitionedBufferMut, Root, Source, - }, + traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source}, }; use num::traits::Zero; use rand::Rng; From f354653051a016e47063f4e8ab07a8621212d955 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 16:30:17 +0100 Subject: [PATCH 26/42] More cleanup --- src/octree.rs | 227 +++++++++++++++++++++++--------------------------- 1 file changed, 104 insertions(+), 123 deletions(-) diff --git a/src/octree.rs b/src/octree.rs index 0371f3f..574350d 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,25 +1,17 @@ //! 
Parallel Octree structure -use std::collections::HashMap; - use crate::{ constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, parsort::parsort, - tools::{communicate_back, gather_to_all, global_inclusive_cumsum, redistribute}, + tools::{communicate_back, gather_to_all, global_inclusive_cumsum, redistribute, sort_to_bins}, }; -use mpi::{ - point_to_point as p2p, - traits::{Root, Source}, -}; +use mpi::traits::Root; use itertools::{izip, Itertools}; -use mpi::{ - collective::SystemOperation, - traits::{CommunicatorCollectives, Destination}, -}; +use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; use rand::Rng; /// Compute the global bounding box across all points on all processes. @@ -283,12 +275,7 @@ pub fn redistribute_with_respect_to_coarse_tree( let my_first = coarse_tree.first().unwrap(); - let mut global_bins = gather_to_all(std::slice::from_ref(my_first), comm); - - // We now have the first index from each process. We also want - // an upper bound for the last index of the tree to make the sorting into - // bins easier. - global_bins.push(MortonKey::upper_bound()); + let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); // We now have our bins. We go through our keys and store how // many keys are assigned to each rank. We are using here that @@ -306,41 +293,41 @@ pub fn redistribute_with_respect_to_coarse_tree( redistribute(&sorted_keys, &rank_counts, comm) } -/// Create bins from sorted keys. -pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { - let mut bin_counts = vec![0 as usize; bins.len() - 1]; - - // This iterates over each possible bin and returns also the associated rank. - let mut bin_iter = izip!( - bin_counts.iter_mut(), - bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), - ); - - // We take the first element of the bin iterator. There will always be at least one. - let mut r: &mut usize; - let mut bin_start: &MortonKey; - let mut bin_end: &MortonKey; - (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - - for &key in sorted_keys.iter() { - if *bin_start <= key && key < *bin_end { - *r += 1; - } else { - // Move the bin forward until it fits. There will always be a fitting bin. - while let Some((rn, (bsn, ben))) = bin_iter.next() { - if *bsn <= key && key < *ben { - *rn += 1; - r = rn; - bin_start = bsn; - bin_end = ben; - break; - } - } - } - } - - bin_counts -} +// /// Create bins from sorted keys. +// pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { +// let mut bin_counts = vec![0 as usize; bins.len() - 1]; + +// // This iterates over each possible bin and returns also the associated rank. +// let mut bin_iter = izip!( +// bin_counts.iter_mut(), +// bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), +// ); + +// // We take the first element of the bin iterator. There will always be at least one. +// let mut r: &mut usize; +// let mut bin_start: &MortonKey; +// let mut bin_end: &MortonKey; +// (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); + +// for &key in sorted_keys.iter() { +// if *bin_start <= key && key < *bin_end { +// *r += 1; +// } else { +// // Move the bin forward until it fits. There will always be a fitting bin. +// while let Some((rn, (bsn, ben))) = bin_iter.next() { +// if *bsn <= key && key < *ben { +// *rn += 1; +// r = rn; +// bin_start = bsn; +// bin_end = ben; +// break; +// } +// } +// } +// } + +// bin_counts +// } /// Return a complete tree generated from local keys and associated coarse keys. 
/// @@ -360,8 +347,7 @@ pub fn create_local_tree( // is associated with a coarse slice. For this we need to add an upper bound // coarse keys to ensure that we have suitable bins. - let mut bins = coarse_keys.to_vec(); - bins.push(MortonKey::upper_bound()); + let bins = coarse_keys.to_vec(); let counts = sort_to_bins(&sorted_fine_keys, &bins); @@ -390,7 +376,7 @@ pub fn create_local_tree( } } - coarse_keys.to_vec() + new_coarse_keys.to_vec() } /// Linearize a set of weighted Morton keys. @@ -485,53 +471,59 @@ pub fn partition( let w = total_weight / (size as usize); let k = total_weight % (size as usize); - let mut hash_map = HashMap::>::new(); - // Sort the elements into bins according to which process they should be sent. + // We do not need to sort the Morton keys themselves into bins but the scanned weights. + // The corresponding counts are the right counts for the Morton keys. + + let mut bins = Vec::::with_capacity(size as usize); for p in 1..=size as usize { - let q = if p <= k as usize { - izip!(sorted_keys, &scan) - .filter_map(|(&key, &s)| { - if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) - || (p == size as usize && (p - 1) * (1 + w) <= s) - { - Some(key) - } else { - None - } - }) - .collect_vec() + if p <= k { + bins.push((p - 1) * (1 + w)); } else { - izip!(sorted_keys, &scan) - .filter_map(|(&key, &s)| { - if ((p - 1) * w + k <= s && s < p * w + k) - || (p == size as usize && (p - 1) * w + k <= s) - { - Some(key) - } else { - None - } - }) - .collect_vec() - }; - hash_map.insert(p - 1, q); + bins.push((p - 1) * w + k); + } } + let counts = sort_to_bins(&scan, &bins) + .iter() + .map(|elem| *elem as i32) + .collect_vec(); + + // for p in 1..=size as usize { + // let q = if p <= k as usize { + // izip!(sorted_keys, &scan) + // .filter_map(|(&key, &s)| { + // if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) + // || (p == size as usize && (p - 1) * (1 + w) <= s) + // { + // Some(key) + // } else { + // None + // } + // }) + // .collect_vec() + // } else { + // izip!(sorted_keys, &scan) + // .filter_map(|(&key, &s)| { + // if ((p - 1) * w + k <= s && s < p * w + k) + // || (p == size as usize && (p - 1) * w + k <= s) + // { + // Some(key) + // } else { + // None + // } + // }) + // .collect_vec() + // }; + // hash_map.insert(p - 1, q); + // } + // Now distribute the data with an all to all v. // We create a vector of how many elements to send to each process and // then send the actual data. - let mut counts = vec![0 as i32; size as usize]; - - let mut all_elements = Vec::::new(); - for (index, c) in counts.iter_mut().enumerate() { - let elements = hash_map.get(&index).unwrap(); - *c = elements.len() as i32; - all_elements.extend(elements.iter()) - } - - let mut recvbuffer = redistribute(&all_elements, &counts, comm); + let mut recvbuffer = redistribute(&sorted_keys, &counts, comm); recvbuffer.sort_unstable(); recvbuffer @@ -560,42 +552,31 @@ pub fn complete_tree( // ancestor of the deepest first key and first element. Correspondingly on the last process // we need to insert the last child of the finest ancester of the deepest last key and last element. + let next_key = communicate_back(&linearized_keys, comm); + + if rank < size - 1 { + linearized_keys.push(next_key.unwrap()); + } + + // Now fix the first key on the first rank. 
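+    // (On rank 0 the completed region must reach the deepest first box of the domain:
+    // if the current first key is not an ancestor of that box, the first child of the
+    // finest common ancestor of the two keys is prepended below.)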
+ + if rank == 0 { + let first_key = linearized_keys.first().unwrap(); + let deepest_first = MortonKey::deepest_first(); + if !first_key.is_ancestor(deepest_first) { + let ancestor = deepest_first.finest_common_ancestor(*first_key); + linearized_keys.insert(0, ancestor.children()[0]); + } + } + if rank == size - 1 { - // On last process send first element to previous processes and insert last - // possible box from region into list. - comm.process_at_rank(rank - 1) - .send(linearized_keys.first().unwrap()); - let last_key = *linearized_keys.last().unwrap(); + let last_key = linearized_keys.last().unwrap(); let deepest_last = MortonKey::deepest_last(); if !last_key.is_ancestor(deepest_last) { - let ancestor = deepest_last.finest_common_ancestor(last_key); + let ancestor = deepest_last.finest_common_ancestor(*last_key); linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); } - } else { - let (other, _status) = if rank > 0 { - // On intermediate process receive from the next process - // and send first element to previous process. - p2p::send_receive( - linearized_keys.first().unwrap(), - &comm.process_at_rank(rank - 1), - &comm.process_at_rank(rank + 1), - ) - } else { - // On first process insert at the beginning the first possible - // box in the region and receive the key from next process. - let first_key = *linearized_keys.first().unwrap(); - let deepest_first = MortonKey::deepest_first(); - if !first_key.is_ancestor(deepest_first) { - let ancestor = deepest_first.finest_common_ancestor(first_key); - linearized_keys.insert(0, ancestor.children()[0]); - } - - comm.process_at_rank(1).receive::() - }; - // If we are not at the last process we need to introduce the received key - // into our list. - linearized_keys.push(other); - }; + } // Now complete the regions defined by the keys on each process. 
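For orientation, the completion step that this boundary exchange feeds into can be
sketched as follows (assuming `linearized_keys`, `comm` and the `itertools::Itertools`
trait in scope, as in `src/octree.rs`):

    // Every rank except the last has appended its right neighbour's first key, so
    // filling the gaps between consecutive local keys also closes the gap to the
    // neighbouring rank's region.
    let mut result = Vec::<MortonKey>::new();
    for (&key1, &key2) in linearized_keys.iter().tuple_windows() {
        result.push(key1);
        result.extend_from_slice(key1.fill_between_keys(key2).as_slice());
    }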
From d4ebe77849b8078b3bc2afc1a650a0f771702c99 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 19:55:04 +0100 Subject: [PATCH 27/42] WIP: Better testing --- examples/mpi_complete_tree.rs | 10 ++- examples/parallel_tests.rs | 2 +- src/octree.rs | 130 ++++++++++++++++++++++++---------- 3 files changed, 98 insertions(+), 44 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index 60a095a..052b214 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -2,7 +2,7 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, - octree::{complete_tree, is_complete_linear_tree, points_to_morton}, + octree::{complete_tree, is_complete_linear_tree, linearize, points_to_morton}, }; use mpi::traits::*; use rand::prelude::*; @@ -32,14 +32,12 @@ pub fn main() { // Compute the Morton keys on the deepest level let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); - assert!(!is_complete_linear_tree(&keys, &comm)); + let linear_keys = linearize(&keys, &mut rng, &comm); // Generate a complete tree - let distributed_complete_tree = complete_tree(&keys, &mut rng, &comm); + let distributed_complete_tree = complete_tree(&linear_keys, &comm); - let is_complete_linear = is_complete_linear_tree(&distributed_complete_tree, &comm); - - assert!(is_complete_linear); + assert!(is_complete_linear_tree(&distributed_complete_tree, &comm)); if comm.rank() == 0 { println!("Distributed tree is complete and linear."); diff --git a/examples/parallel_tests.rs b/examples/parallel_tests.rs index 38e35cb..8285d7c 100644 --- a/examples/parallel_tests.rs +++ b/examples/parallel_tests.rs @@ -122,7 +122,7 @@ pub fn test_coarse_partition(rng: &mut R, co println!("Rank {} has {} keys. ", rank, keys.len()); - let partitioned_tree = block_partition(&keys, rng, comm); + let partitioned_tree = block_partition(&keys, comm); println!( "Partitioned tree on rank {} has {} keys.", diff --git a/src/octree.rs b/src/octree.rs index 574350d..3185103 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -127,30 +127,27 @@ pub fn points_to_morton( (keys, bounding_box) } -/// Block partition of tree. -/// -/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned -/// keys and the associated coarse keys. -/// A necessary condition for the block partitioning is that -// all sorted keys are on the same level. -pub fn block_partition( - sorted_keys: &[MortonKey], - rng: &mut R, +/// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree. +pub fn compute_coarse_tree( + linear_keys: &[MortonKey], comm: &C, -) -> (Vec, Vec) { - let rank = comm.rank(); - if comm.size() == 1 { - // On a single node block partitioning should not do anything. - return (sorted_keys.to_vec(), vec![MortonKey::root()]); +) -> Vec { + let size = comm.size(); + + debug_assert!(is_linear_tree(linear_keys, comm)); + + // On a single node a complete coarse tree is simply the root. + if size == 1 { + return vec![MortonKey::root()]; } - let mut completed_region = sorted_keys + let mut completed_region = linear_keys .first() .unwrap() - .fill_between_keys(*sorted_keys.last().unwrap()); + .fill_between_keys(*linear_keys.last().unwrap()); - completed_region.insert(0, *sorted_keys.first().unwrap()); - completed_region.push(*sorted_keys.last().unwrap()); + completed_region.insert(0, *linear_keys.first().unwrap()); + completed_region.push(*linear_keys.last().unwrap()); // Get the smallest level members of the completed region. 
@@ -169,7 +166,28 @@ pub fn block_partition( .copied() .collect_vec(); - let coarse_tree = complete_tree(&largest_boxes, rng, comm); + debug_assert!(is_linear_tree(&largest_boxes, comm)); + + complete_tree(&largest_boxes, comm) +} + +/// Block partition of tree. +/// +/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned +/// keys and the associated coarse keys. +/// A necessary condition for the block partitioning is that +// all sorted keys are on the same level. +pub fn block_partition( + linear_keys: &[MortonKey], + comm: &C, +) -> (Vec, Vec) { + let rank = comm.rank(); + if comm.size() == 1 { + // On a single node block partitioning should not do anything. + return (linear_keys.to_vec(), vec![MortonKey::root()]); + } + + let coarse_tree = compute_coarse_tree(&linear_keys, comm); // We want to partition the coarse tree. But we need the correct weights. The idea // is that we use the number of original leafs that intersect with the coarse tree @@ -195,7 +213,7 @@ pub fn block_partition( // Let's find the start of our region. The start of our region is a coarse key that is an ancestor // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. - let first_key = *sorted_keys.first().unwrap(); + let first_key = *linear_keys.first().unwrap(); let first_coarse_index = global_coarse_tree .iter() @@ -204,7 +222,7 @@ pub fn block_partition( // Now we need to find the end index of our region. For this again we find the index of our coarse tree that // is an ancestor of our last key. - let last_key = *sorted_keys.last().unwrap(); + let last_key = *linear_keys.last().unwrap(); let last_coarse_index = global_coarse_tree .iter() @@ -218,7 +236,7 @@ pub fn block_partition( local_weights[first_coarse_index..=last_coarse_index].iter_mut(), global_coarse_tree[first_coarse_index..=last_coarse_index].iter() ) { - *w += sorted_keys + *w += linear_keys .iter() .filter(|&&key| global_coarse_key.is_ancestor(key)) .count(); @@ -246,7 +264,7 @@ pub fn block_partition( let coarse_tree = partition(&coarse_tree, &weights, comm); ( - redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm), + redistribute_with_respect_to_coarse_tree(&linear_keys, &coarse_tree, comm), coarse_tree, ) @@ -529,19 +547,20 @@ pub fn partition( recvbuffer } -/// Given a distributed set of keys, generate a complete linear Octree. -pub fn complete_tree( - keys: &[MortonKey], - rng: &mut R, +/// Given a distributed set of linear keys, generate a complete tree. +pub fn complete_tree( + linear_keys: &[MortonKey], comm: &C, ) -> Vec { - let mut linearized_keys = linearize(keys, rng, comm); + let mut linear_keys = linear_keys.to_vec(); + + debug_assert!(is_linear_tree(&linear_keys, comm)); let size = comm.size(); let rank = comm.rank(); if size == 1 { - return MortonKey::complete_tree(linearized_keys.as_slice()); + return MortonKey::complete_tree(linear_keys.as_slice()); } // Now insert on the first and last process the first and last child of the @@ -552,29 +571,29 @@ pub fn complete_tree( // ancestor of the deepest first key and first element. Correspondingly on the last process // we need to insert the last child of the finest ancester of the deepest last key and last element. - let next_key = communicate_back(&linearized_keys, comm); + let next_key = communicate_back(&linear_keys, comm); if rank < size - 1 { - linearized_keys.push(next_key.unwrap()); + linear_keys.push(next_key.unwrap()); } // Now fix the first key on the first rank. 
if rank == 0 { - let first_key = linearized_keys.first().unwrap(); + let first_key = linear_keys.first().unwrap(); let deepest_first = MortonKey::deepest_first(); if !first_key.is_ancestor(deepest_first) { let ancestor = deepest_first.finest_common_ancestor(*first_key); - linearized_keys.insert(0, ancestor.children()[0]); + linear_keys.insert(0, ancestor.children()[0]); } } if rank == size - 1 { - let last_key = linearized_keys.last().unwrap(); + let last_key = linear_keys.last().unwrap(); let deepest_last = MortonKey::deepest_last(); if !last_key.is_ancestor(deepest_last) { let ancestor = deepest_last.finest_common_ancestor(*last_key); - linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); + linear_keys.push(ancestor.children()[NSIBLINGS - 1]); } } @@ -582,18 +601,55 @@ pub fn complete_tree( let mut result = Vec::::new(); - for (&key1, &key2) in linearized_keys.iter().tuple_windows() { + for (&key1, &key2) in linear_keys.iter().tuple_windows() { result.push(key1); result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); } if rank == size - 1 { - result.push(*linearized_keys.last().unwrap()); + result.push(*linear_keys.last().unwrap()); } + debug_assert!(is_complete_linear_tree(&result, comm)); + result } +/// Return true if the keys are linear. +pub fn is_linear_tree(arr: &[MortonKey], comm: &C) -> bool { + let mut is_linear = true; + + for (&key1, &key2) in arr.iter().tuple_windows() { + if key1 >= key2 || key1.is_ancestor(key2) { + is_linear = false; + break; + } + } + + if comm.size() == 1 { + return is_linear; + } + + // Now check the interfaces + + if let Some(next_key) = communicate_back(arr, comm) { + let last = *arr.last().unwrap(); + if last >= next_key || last.is_ancestor(next_key) { + is_linear = false; + } + } + + let mut global_is_linear = false; + + comm.all_reduce_into( + &is_linear, + &mut global_is_linear, + SystemOperation::logical_and(), + ); + + global_is_linear +} + /// Return true on all ranks if distributed tree is complete. Otherwise, return false. pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { // First check that the local tree on each node is complete. From c2db50b0edcb105c1a4a8cba083b7ab83d13b4af Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 21:38:48 +0100 Subject: [PATCH 28/42] WIP: Tests involving coarse tree --- examples/mpi_coarse_tree.rs | 95 ++++++++++++++++++++++ examples/parallel_tests.rs | 152 ------------------------------------ src/octree.rs | 138 +++++++++----------------------- 3 files changed, 132 insertions(+), 253 deletions(-) create mode 100644 examples/mpi_coarse_tree.rs delete mode 100644 examples/parallel_tests.rs diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs new file mode 100644 index 0000000..ef0095c --- /dev/null +++ b/examples/mpi_coarse_tree.rs @@ -0,0 +1,95 @@ +//! Test the computation of a global bounding box across MPI ranks. + +use bempp_octree::{ + constants::DEEPEST_LEVEL, + octree::{ + complete_tree, compute_coarse_tree, compute_coarse_tree_weights, is_complete_linear_tree, + linearize, load_balance, points_to_morton, redistribute_with_respect_to_coarse_tree, + }, + tools::global_size, +}; +use mpi::{collective::SystemOperation, traits::*}; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. 
+ let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); + + // Create `npoints` per rank. + let npoints = 10000; + + // Generate random points. + + let mut points = Vec::::with_capacity(3 * npoints); + + for _ in 0..3 * npoints { + points.push(rng.gen()); + } + + // Compute the Morton keys on the deepest level + let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); + + // linearize the keys + let linear_keys = linearize(&keys, &mut rng, &comm); + + // Generate the coarse tree + let coarse_tree = compute_coarse_tree(&linear_keys, &comm); + assert!(is_complete_linear_tree(&coarse_tree, &comm)); + + // We now compute the weights for the coarse tree. + + let weights = compute_coarse_tree_weights(&linear_keys, &coarse_tree, &comm); + + // Assert that the global sum of the weights is identical to the number of linearized keys. + + let mut global_weight: usize = 0; + + comm.all_reduce_into( + &(weights.iter().sum::()), + &mut global_weight, + SystemOperation::sum(), + ); + + assert_eq!(global_weight, global_size(&linear_keys, &comm)); + + // Now load balance the coarse tree + + let balanced_keys = load_balance(&coarse_tree, &weights, &comm); + + // Compute the weights of the balanced keys + + let balanced_weights = compute_coarse_tree_weights(&linear_keys, &balanced_keys, &comm); + + let mut global_balanced_weight: usize = 0; + comm.all_reduce_into( + &(balanced_weights.iter().sum::()), + &mut global_balanced_weight, + SystemOperation::sum(), + ); + + // The global weight of the non-balanced keys should be identical + // to the global weigth of the balanced keys. + + assert_eq!(global_weight, global_balanced_weight); + + // Now compute the new fine keys. + + let redistributed_fine_keys = + redistribute_with_respect_to_coarse_tree(&linear_keys, &balanced_keys, &comm); + + assert_eq!( + global_size(&redistributed_fine_keys, &comm), + global_size(&linear_keys, &comm) + ); + + if comm.rank() == 0 { + println!("Coarse tree successfully created and weights computed."); + } +} diff --git a/examples/parallel_tests.rs b/examples/parallel_tests.rs deleted file mode 100644 index 8285d7c..0000000 --- a/examples/parallel_tests.rs +++ /dev/null @@ -1,152 +0,0 @@ -//! Testing the hyksort component. -use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; -use bempp_octree::morton::MortonKey; -use bempp_octree::octree::{block_partition, linearize}; -use bempp_octree::tools::gather_to_root; -use itertools::{izip, Itertools}; -use mpi::traits::*; -use rand::prelude::*; - -pub fn assert_linearized(arr: &Vec, comm: &C) { - // Check that the keys are still linearized. 
- let arr = gather_to_root(&arr, comm); - - if comm.rank() == 0 { - let arr = arr.unwrap(); - for (&elem1, &elem2) in arr.iter().tuple_windows() { - assert!(!elem1.is_ancestor(elem2)); - } - println!("{} keys are linearized.", &arr.len()); - } -} - -pub fn generate_random_keys(nkeys: usize, rng: &mut R) -> Vec { - let mut result = Vec::::with_capacity(nkeys); - - let xindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); - let yindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); - let zindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); - - for (xval, yval, zval) in izip!(xindices.iter(), yindices.iter(), zindices.iter()) { - result.push(MortonKey::from_index_and_level( - [xval, yval, zval], - DEEPEST_LEVEL as usize, - )); - } - - result -} - -pub fn generate_random_tree(max_level: usize, rng: &mut R) -> Vec { - pub fn add_level( - keys: &mut Vec, - current: MortonKey, - rng: &mut R, - max_level: usize, - ) { - keys.push(current); - - if current.level() >= max_level { - return; - } - - let mut children = current.children(); - - // This makes sure that the tree is not sorted. - children.shuffle(rng); - - for child in children { - if rng.gen_bool(0.9) { - add_level(keys, child, rng, max_level); - } - } - } - - let mut keys = Vec::::new(); - add_level(&mut keys, MortonKey::root(), rng, max_level); - - keys -} - -pub fn test_linearize(rng: &mut R, comm: &C) { - let max_level = 6; - let keys = generate_random_tree(max_level, rng); - let rank = comm.rank(); - - // We now linearize the keys. - - if rank == 0 { - println!("Linearizing keys."); - } - let sorted_keys = linearize(&keys, rng, comm); - - // Now check that the tree is properly linearized. - - assert_linearized(&sorted_keys, comm); - if rank == 0 { - println!("Linearization successful."); - } - - // Now form the coarse tree -} - -pub fn test_coarse_partition(rng: &mut R, comm: &C) { - let rank = comm.rank(); - let keys = if rank == 0 { - generate_random_keys(50, rng) - } else { - generate_random_keys(1000, rng) - }; - - // We now linearize the keys. - - let mut keys = linearize(&keys, rng, comm); - - // We move most keys over from rank 0 to rank 2 to check how the partitioning works. - - let nsend = 400; - // Send the last 200 keys from rank 0 to rank 1. - - if rank == 0 { - let send_keys = &keys[keys.len() - nsend..keys.len()]; - comm.process_at_rank(1).send(send_keys); - keys = keys[0..keys.len() - nsend].to_vec(); - } - - if rank == 1 { - let mut recv_keys = vec![MortonKey::default(); nsend]; - comm.process_at_rank(0).receive_into(&mut recv_keys); - recv_keys.extend(keys.iter()); - keys = recv_keys; - } - - println!("Rank {} has {} keys. 
", rank, keys.len()); - - let partitioned_tree = block_partition(&keys, comm); - - println!( - "Partitioned tree on rank {} has {} keys.", - rank, - partitioned_tree.0.len() - ); - - let arr = gather_to_root(&partitioned_tree.0, comm); - - if rank == 0 { - let arr = arr.unwrap(); - for (elem1, elem2) in arr.iter().tuple_windows() { - assert!(*elem1 <= *elem2); - } - println!("Keys are sorted."); - } -} - -pub fn main() { - let universe = mpi::initialize().unwrap(); - let comm = universe.world(); - let rank = comm.rank() as u64; - // Each process gets its own rng - let mut rng = rand::rngs::StdRng::seed_from_u64(rank as u64); - test_linearize(&mut rng, &comm); - test_coarse_partition(&mut rng, &comm); -} diff --git a/src/octree.rs b/src/octree.rs index 3185103..ca5261d 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -128,6 +128,7 @@ pub fn points_to_morton( } /// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree. +/// The returned coarse tree is load balanced according to the number of linear keys in each coarse block. pub fn compute_coarse_tree( linear_keys: &[MortonKey], comm: &C, @@ -171,24 +172,13 @@ pub fn compute_coarse_tree( complete_tree(&largest_boxes, comm) } -/// Block partition of tree. -/// -/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned -/// keys and the associated coarse keys. -/// A necessary condition for the block partitioning is that -// all sorted keys are on the same level. -pub fn block_partition( +/// Compute the weights of each coarse tree block as the number of linear keys associated with each coarse block. +pub fn compute_coarse_tree_weights( linear_keys: &[MortonKey], + coarse_tree: &[MortonKey], comm: &C, -) -> (Vec, Vec) { +) -> Vec { let rank = comm.rank(); - if comm.size() == 1 { - // On a single node block partitioning should not do anything. - return (linear_keys.to_vec(), vec![MortonKey::root()]); - } - - let coarse_tree = compute_coarse_tree(&linear_keys, comm); - // We want to partition the coarse tree. But we need the correct weights. The idea // is that we use the number of original leafs that intersect with the coarse tree // as leafs. In order to compute this we send the coarse tree around to all processes @@ -204,7 +194,7 @@ pub fn block_partition( let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); // We now compute the local weights. - let mut local_weights = vec![0 as usize; global_coarse_tree.len()]; + let mut local_weight_contribution = vec![0 as usize; global_coarse_tree.len()]; // In the following loop we want to be a bit smart. We do not iterate through all the local elements. // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region @@ -233,7 +223,7 @@ pub fn block_partition( // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). for (w, &global_coarse_key) in izip!( - local_weights[first_coarse_index..=last_coarse_index].iter_mut(), + local_weight_contribution[first_coarse_index..=last_coarse_index].iter_mut(), global_coarse_tree[first_coarse_index..=last_coarse_index].iter() ) { *w += linear_keys @@ -244,14 +234,18 @@ pub fn block_partition( // We now need to sum up the weights across all processes. 
- let mut weights = vec![0 as usize; global_coarse_tree.len()]; + let mut global_weights = vec![0 as usize; global_coarse_tree.len()]; - comm.all_reduce_into(&local_weights, &mut weights, SystemOperation::sum()); + comm.all_reduce_into( + &local_weight_contribution, + &mut global_weights, + SystemOperation::sum(), + ); // Each process now has all weights. However, we only need the ones for the current process. // So we just filter the rest out. - let weights = izip!(coarse_tree_ranks, weights) + izip!(coarse_tree_ranks, global_weights) .filter_map(|(r, weight)| { if r == rank as usize { Some(weight) @@ -259,28 +253,19 @@ pub fn block_partition( None } }) - .collect_vec(); - - let coarse_tree = partition(&coarse_tree, &weights, comm); - - ( - redistribute_with_respect_to_coarse_tree(&linear_keys, &coarse_tree, comm), - coarse_tree, - ) - - // We now need to redistribute the global tree according to the coarse tree. + .collect_vec() } /// Redistribute sorted keys with respect to a linear coarse tree. pub fn redistribute_with_respect_to_coarse_tree( - sorted_keys: &[MortonKey], + linear_keys: &[MortonKey], coarse_tree: &[MortonKey], comm: &C, ) -> Vec { let size = comm.size(); if size == 1 { - return sorted_keys.to_vec(); + return linear_keys.to_vec(); } // We want to globally redistribute keys so that the keys on each process are descendents @@ -301,51 +286,31 @@ pub fn redistribute_with_respect_to_coarse_tree( // This will store for each rank how many keys will be assigned to it. - let rank_counts = sort_to_bins(sorted_keys, &global_bins) + let rank_counts = sort_to_bins(linear_keys, &global_bins) .iter() .map(|&elem| elem as i32) .collect_vec(); // We now have the counts for each rank. Let's redistribute accordingly and return. - redistribute(&sorted_keys, &rank_counts, comm) -} + let result = redistribute(&linear_keys, &rank_counts, comm); + + #[cfg(debug_assertions)] + { + // Check through that the first and last key of result are descendents + // of the first and last coarse bloack. + debug_assert!(coarse_tree + .first() + .unwrap() + .is_ancestor(*result.first().unwrap())); + debug_assert!(coarse_tree + .last() + .unwrap() + .is_ancestor(*result.last().unwrap())); + } -// /// Create bins from sorted keys. -// pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { -// let mut bin_counts = vec![0 as usize; bins.len() - 1]; - -// // This iterates over each possible bin and returns also the associated rank. -// let mut bin_iter = izip!( -// bin_counts.iter_mut(), -// bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), -// ); - -// // We take the first element of the bin iterator. There will always be at least one. -// let mut r: &mut usize; -// let mut bin_start: &MortonKey; -// let mut bin_end: &MortonKey; -// (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - -// for &key in sorted_keys.iter() { -// if *bin_start <= key && key < *bin_end { -// *r += 1; -// } else { -// // Move the bin forward until it fits. There will always be a fitting bin. -// while let Some((rn, (bsn, ben))) = bin_iter.next() { -// if *bsn <= key && key < *ben { -// *rn += 1; -// r = rn; -// bin_start = bsn; -// bin_end = ben; -// break; -// } -// } -// } -// } - -// bin_counts -// } + result +} /// Return a complete tree generated from local keys and associated coarse keys. /// @@ -394,7 +359,7 @@ pub fn create_local_tree( } } - new_coarse_keys.to_vec() + new_coarse_keys } /// Linearize a set of weighted Morton keys. 
@@ -452,7 +417,7 @@ pub fn linearize( } /// Balance a sorted list of Morton keys across processors given an array of corresponding weights. -pub fn partition( +pub fn load_balance( sorted_keys: &[MortonKey], weights: &[usize], comm: &C, @@ -508,35 +473,6 @@ pub fn partition( .map(|elem| *elem as i32) .collect_vec(); - // for p in 1..=size as usize { - // let q = if p <= k as usize { - // izip!(sorted_keys, &scan) - // .filter_map(|(&key, &s)| { - // if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) - // || (p == size as usize && (p - 1) * (1 + w) <= s) - // { - // Some(key) - // } else { - // None - // } - // }) - // .collect_vec() - // } else { - // izip!(sorted_keys, &scan) - // .filter_map(|(&key, &s)| { - // if ((p - 1) * w + k <= s && s < p * w + k) - // || (p == size as usize && (p - 1) * w + k <= s) - // { - // Some(key) - // } else { - // None - // } - // }) - // .collect_vec() - // }; - // hash_map.insert(p - 1, q); - // } - // Now distribute the data with an all to all v. // We create a vector of how many elements to send to each process and // then send the actual data. From 887dfcf2bc85df8356717ca6c7e500e8c74adf1d Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 23:41:32 +0100 Subject: [PATCH 29/42] WIP: distributed balanced tree --- examples/mpi_coarse_tree.rs | 28 ++++++++++++++++++++-------- src/octree.rs | 25 +++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs index ef0095c..5874fe0 100644 --- a/examples/mpi_coarse_tree.rs +++ b/examples/mpi_coarse_tree.rs @@ -3,8 +3,9 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, octree::{ - complete_tree, compute_coarse_tree, compute_coarse_tree_weights, is_complete_linear_tree, - linearize, load_balance, points_to_morton, redistribute_with_respect_to_coarse_tree, + complete_tree, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, + is_complete_linear_tree, linearize, load_balance, points_to_morton, + redistribute_with_respect_to_coarse_tree, }, tools::global_size, }; @@ -61,15 +62,16 @@ pub fn main() { // Now load balance the coarse tree - let balanced_keys = load_balance(&coarse_tree, &weights, &comm); + let load_balanced_coarse_keys = load_balance(&coarse_tree, &weights, &comm); // Compute the weights of the balanced keys - let balanced_weights = compute_coarse_tree_weights(&linear_keys, &balanced_keys, &comm); + let load_balanced_weights = + compute_coarse_tree_weights(&linear_keys, &load_balanced_coarse_keys, &comm); let mut global_balanced_weight: usize = 0; comm.all_reduce_into( - &(balanced_weights.iter().sum::()), + &(load_balanced_weights.iter().sum::()), &mut global_balanced_weight, SystemOperation::sum(), ); @@ -81,15 +83,25 @@ pub fn main() { // Now compute the new fine keys. 
- let redistributed_fine_keys = - redistribute_with_respect_to_coarse_tree(&linear_keys, &balanced_keys, &comm); + let load_balanced_fine_keys = + redistribute_with_respect_to_coarse_tree(&linear_keys, &load_balanced_coarse_keys, &comm); assert_eq!( - global_size(&redistributed_fine_keys, &comm), + global_size(&load_balanced_fine_keys, &comm), global_size(&linear_keys, &comm) ); + let refined_tree = + create_local_tree(&load_balanced_fine_keys, &load_balanced_coarse_keys, 6, 100); + if comm.rank() == 0 { + println!("Coarse tree has {} keys.", load_balanced_coarse_keys.len()); + println!("Refined tree has {} keys.", refined_tree.len()); + } + + assert!(is_complete_linear_tree(&refined_tree, &comm)); + + if comm.rank() == 1 { println!("Coarse tree successfully created and weights computed."); } } diff --git a/src/octree.rs b/src/octree.rs index ca5261d..54d8168 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -316,6 +316,7 @@ pub fn redistribute_with_respect_to_coarse_tree( /// /// The coarse keys are refined until the maximum level is reached or until each coarse key /// is the ancestor of at most `max_keys` fine keys. +/// It is assumed that the level of the fine keys is at least as large as `max_level`. pub fn create_local_tree( sorted_fine_keys: &[MortonKey], coarse_keys: &[MortonKey], @@ -327,8 +328,7 @@ pub fn create_local_tree( } // We split the sorted fine keys into subslices so that each subslice - // is associated with a coarse slice. For this we need to add an upper bound - // coarse keys to ensure that we have suitable bins. + // is associated with a coarse slice. let bins = coarse_keys.to_vec(); @@ -413,6 +413,8 @@ pub fn linearize( } } + debug_assert!(is_linear_tree(&result, comm)); + result } @@ -655,3 +657,22 @@ pub fn is_complete_linear_tree(arr: &[MortonKey], co result } + +/// Return the deepest level of a distributed list of Morton keys. +pub fn deepest_level(keys: &[MortonKey], comm: &C) -> usize { + let local_deepest_level = keys.iter().map(|elem| elem.level()).max().unwrap(); + + if comm.size() == 1 { + return local_deepest_level; + } + + let mut global_deepest_level: usize = 0; + + comm.all_reduce_into( + &local_deepest_level, + &mut global_deepest_level, + SystemOperation::max(), + ); + + global_deepest_level +} From a39ef8e49ba44d4724bcd3ab078214989ee9ff74 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Mon, 23 Sep 2024 00:43:29 +0100 Subject: [PATCH 30/42] Balancing implemented --- examples/mpi_coarse_tree.rs | 18 +++++--- src/morton.rs | 4 ++ src/octree.rs | 83 ++++++++++++++++++++++++++++++++++++- 3 files changed, 98 insertions(+), 7 deletions(-) diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs index 5874fe0..db3c672 100644 --- a/examples/mpi_coarse_tree.rs +++ b/examples/mpi_coarse_tree.rs @@ -3,7 +3,7 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, octree::{ - complete_tree, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, + balance, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, is_complete_linear_tree, linearize, load_balance, points_to_morton, redistribute_with_respect_to_coarse_tree, }, @@ -94,13 +94,19 @@ pub fn main() { let refined_tree = create_local_tree(&load_balanced_fine_keys, &load_balanced_coarse_keys, 6, 100); - if comm.rank() == 0 { - println!("Coarse tree has {} keys.", load_balanced_coarse_keys.len()); - println!("Refined tree has {} keys.", refined_tree.len()); - } - assert!(is_complete_linear_tree(&refined_tree, &comm)); + // Now balance the tree. 
+ + let balanced_tree = balance(&refined_tree, &mut rng, &comm); + + // redistribute the balanced tree according to coarse tree + + let balanced_tree = + redistribute_with_respect_to_coarse_tree(&balanced_tree, &load_balanced_coarse_keys, &comm); + + assert!(is_complete_linear_tree(&balanced_tree, &comm)); + if comm.rank() == 1 { println!("Coarse tree successfully created and weights computed."); } diff --git a/src/morton.rs b/src/morton.rs index 43e4aa9..95ee563 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -389,6 +389,10 @@ impl MortonKey { let mut result = [MortonKey::default(); 26]; let (level, [x, y, z]) = self.decode(); + + if level == 0 { + return result; + } let level_size = 1 << level; for (direction, res) in izip!(DIRECTIONS, result.iter_mut()) { diff --git a/src/octree.rs b/src/octree.rs index 54d8168..ebc8ea1 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,11 +1,16 @@ //! Parallel Octree structure +use std::collections::HashSet; + use crate::{ constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, parsort::parsort, - tools::{communicate_back, gather_to_all, global_inclusive_cumsum, redistribute, sort_to_bins}, + tools::{ + communicate_back, gather_to_all, gather_to_root, global_inclusive_cumsum, redistribute, + sort_to_bins, + }, }; use mpi::traits::Root; @@ -553,6 +558,64 @@ pub fn complete_tree( result } +/// Balance a distributed tree. +pub fn balance( + linear_keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let deepest_level = deepest_level(linear_keys, comm); + + // Start with keys at deepest level + let mut work_list = linear_keys + .iter() + .copied() + .filter(|&key| key.level() == deepest_level) + .collect_vec(); + + let mut result = Vec::::new(); + + // Now go through and make sure that for each key siblings and neighbours of parents are added + + for level in (1..=deepest_level).rev() { + let mut parents = HashSet::::new(); + let mut new_work_list = Vec::::new(); + // We filter the work list by level and also make sure that + // only one sibling of each of the parents children is added to + // our current level list. + for key in work_list.iter() { + let parent = key.parent(); + if !parents.contains(&parent) { + parents.insert(parent); + result.extend_from_slice(key.siblings().as_slice()); + new_work_list.extend_from_slice( + parent + .neighbours() + .iter() + .copied() + .filter(|&key| key.is_valid()) + .collect_vec() + .as_slice(), + ); + } + } + new_work_list.extend( + linear_keys + .iter() + .copied() + .filter(|&key| key.level() == level - 1), + ); + + work_list = new_work_list; + // Now extend the work list with the + } + + let result = linearize(&result, rng, comm); + + debug_assert!(is_complete_linear_and_balanced(&result, comm)); + result +} + /// Return true if the keys are linear. pub fn is_linear_tree(arr: &[MortonKey], comm: &C) -> bool { let mut is_linear = true; @@ -676,3 +739,21 @@ pub fn deepest_level(keys: &[MortonKey], comm: &C) - global_deepest_level } + +/// Check if tree is balanced. +pub fn is_complete_linear_and_balanced( + arr: &[MortonKey], + comm: &C, +) -> bool { + // Send the tree to the root node and check there that it is balanced. 
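+    // `gather_to_root` returns `Some` only on the root rank; the root runs the serial
+    // check and the result is then broadcast from rank 0 so that all ranks agree.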
+ + let mut balanced = false; + + if let Some(arr) = gather_to_root(arr, comm) { + balanced = MortonKey::is_complete_linear_and_balanced(&arr); + } + + comm.process_at_rank(0).broadcast_into(&mut balanced); + + balanced +} From 5a8a8b75ab76733bb4d5f54e13d35a42839c14ce Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Mon, 23 Sep 2024 22:44:12 +0100 Subject: [PATCH 31/42] Added distribution of points to correct ranks. --- examples/mpi_coarse_tree.rs | 40 +++++++++++--- examples/mpi_complete_tree.rs | 7 +-- examples/mpi_global_bounding_box.rs | 10 ++-- src/geometry.rs | 36 ++++++++++--- src/morton.rs | 14 ++--- src/octree.rs | 83 ++++++++++++++++++++++++----- src/serial.rs | 20 ++++--- src/tools.rs | 20 +++++++ 8 files changed, 173 insertions(+), 57 deletions(-) diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs index db3c672..a79626d 100644 --- a/examples/mpi_coarse_tree.rs +++ b/examples/mpi_coarse_tree.rs @@ -2,12 +2,13 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, + morton::MortonKey, octree::{ balance, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, is_complete_linear_tree, linearize, load_balance, points_to_morton, - redistribute_with_respect_to_coarse_tree, + redistribute_points_with_respect_to_coarse_tree, redistribute_with_respect_to_coarse_tree, }, - tools::global_size, + tools::{communicate_back, generate_random_points, global_size, is_sorted_array}, }; use mpi::{collective::SystemOperation, traits::*}; use rand::prelude::*; @@ -28,11 +29,7 @@ pub fn main() { // Generate random points. - let mut points = Vec::::with_capacity(3 * npoints); - - for _ in 0..3 * npoints { - points.push(rng.gen()); - } + let points = generate_random_points(npoints, &mut rng, &comm); // Compute the Morton keys on the deepest level let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); @@ -107,7 +104,34 @@ pub fn main() { assert!(is_complete_linear_tree(&balanced_tree, &comm)); - if comm.rank() == 1 { + // Redistribute original keys and points with respect to balanced coarse tree. + + let (balanced_points, balanced_keys) = redistribute_points_with_respect_to_coarse_tree( + &points, + &keys, + &load_balanced_coarse_keys, + &comm, + ); + + let upper_bound; + + if let Some(next_key) = communicate_back(&load_balanced_coarse_keys, &comm) { + upper_bound = next_key; + } else { + upper_bound = MortonKey::upper_bound(); + } + + assert!(load_balanced_coarse_keys.first().unwrap() <= balanced_keys.first().unwrap()); + assert!(*balanced_keys.last().unwrap() < upper_bound); + assert!(is_sorted_array(&balanced_keys, &comm)); + + println!( + "Rank {} has {} balanced points.", + comm.rank(), + balanced_points.len(), + ); + + if comm.rank() == 0 { println!("Coarse tree successfully created and weights computed."); } } diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index 052b214..d47fc6a 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -3,6 +3,7 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, octree::{complete_tree, is_complete_linear_tree, linearize, points_to_morton}, + tools::generate_random_points, }; use mpi::traits::*; use rand::prelude::*; @@ -23,11 +24,7 @@ pub fn main() { // Generate random points. 
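    // `generate_random_points` returns `Point` values that carry both the coordinates and a
    // global id (`npoints * rank + index`).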
- let mut points = Vec::::with_capacity(3 * npoints); - - for _ in 0..3 * npoints { - points.push(rng.gen()); - } + let points = generate_random_points(npoints, &mut rng, &comm); // Compute the Morton keys on the deepest level let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); diff --git a/examples/mpi_global_bounding_box.rs b/examples/mpi_global_bounding_box.rs index 330a168..144d0e5 100644 --- a/examples/mpi_global_bounding_box.rs +++ b/examples/mpi_global_bounding_box.rs @@ -1,7 +1,9 @@ //! Test the computation of a global bounding box across MPI ranks. use bempp_octree::{ - geometry::PhysicalBox, octree::compute_global_bounding_box, tools::gather_to_root, + geometry::PhysicalBox, + octree::compute_global_bounding_box, + tools::{gather_to_root, generate_random_points}, }; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -21,11 +23,7 @@ pub fn main() { // Generate random points. - let mut points = Vec::::with_capacity(3 * npoints); - - for _ in 0..3 * npoints { - points.push(rng.gen()); - } + let points = generate_random_points(npoints, &mut rng, &comm); // Compute the distributed bounding box. diff --git a/src/geometry.rs b/src/geometry.rs index 303fdf3..544e4d5 100644 --- a/src/geometry.rs +++ b/src/geometry.rs @@ -1,9 +1,33 @@ //! Geometry information -use bytemuck; +use mpi::traits::Equivalence; use crate::constants::DEEPEST_LEVEL; +/// Definition of a point. +#[derive(Clone, Copy, Equivalence)] +pub struct Point { + coords: [f64; 3], + global_id: usize, +} + +impl Point { + /// Create a new point from coordinates and global id. + pub fn new(coords: [f64; 3], global_id: usize) -> Self { + Self { coords, global_id } + } + + /// Return the coordintes of a point. + pub fn coords(&self) -> [f64; 3] { + self.coords + } + + /// Return the global id of the point. + pub fn global_id(&self) -> usize { + self.global_id + } +} + /// A bounding box describes geometry in which an Octree lives. pub struct PhysicalBox { coords: [f64; 6], @@ -18,11 +42,9 @@ impl PhysicalBox { } /// Give a slice of points. Compute an associated bounding box. - pub fn from_points(points: &[f64]) -> PhysicalBox { + pub fn from_points(points: &[Point]) -> PhysicalBox { assert_eq!(points.len() % 3, 0); - let points: &[[f64; 3]] = bytemuck::cast_slice(points); - let mut xmin = f64::MAX; let mut xmax = f64::MIN; @@ -33,9 +55,9 @@ impl PhysicalBox { let mut zmax = f64::MIN; for point in points { - let x = point[0]; - let y = point[1]; - let z = point[2]; + let x = point.coords()[0]; + let y = point.coords()[1]; + let z = point.coords()[2]; xmin = f64::min(xmin, x); xmax = f64::max(xmax, x); diff --git a/src/morton.rs b/src/morton.rs index 95ee563..5fee9a3 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -5,7 +5,7 @@ use crate::constants::{ LEVEL_SIZE, NINE_BIT_MASK, NSIBLINGS, X_LOOKUP_DECODE, X_LOOKUP_ENCODE, Y_LOOKUP_DECODE, Y_LOOKUP_ENCODE, Z_LOOKUP_DECODE, Z_LOOKUP_ENCODE, }; -use crate::geometry::PhysicalBox; +use crate::geometry::{PhysicalBox, Point}; use itertools::izip; use itertools::Itertools; use mpi::traits::Equivalence; @@ -155,9 +155,9 @@ impl MortonKey { /// Map a physical point within a bounding box to a Morton key on a given level. /// It is assumed that points are strictly contained within the bounding box. 
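    /// The point is first mapped to reference coordinates in the unit cube and then scaled
    /// by 2^level to obtain the integer octant indices from which the key is built.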
- pub fn from_physical_point(point: [f64; 3], bounding_box: &PhysicalBox, level: usize) -> Self { + pub fn from_physical_point(point: Point, bounding_box: &PhysicalBox, level: usize) -> Self { let level_size = 1 << level; - let reference = bounding_box.physical_to_reference(point); + let reference = bounding_box.physical_to_reference(point.coords()); let x = (reference[0] * level_size as f64) as usize; let y = (reference[1] * level_size as f64) as usize; let z = (reference[2] * level_size as f64) as usize; @@ -1284,7 +1284,7 @@ mod test { pub fn test_from_physical_point() { let bounding_box = PhysicalBox::new([-2.0, -3.0, -1.0, 4.0, 5.0, 6.0]); - let point = [1.5, -2.5, 5.0]; + let point = Point::new([1.5, -2.5, 5.0], 0); let level = 10; let key = MortonKey::from_physical_point(point, &bounding_box, level); @@ -1293,9 +1293,9 @@ mod test { let coords = physical_box.coordinates(); - assert!(coords[0] <= point[0] && point[0] < coords[3]); - assert!(coords[1] <= point[1] && point[1] < coords[4]); - assert!(coords[2] <= point[2] && point[2] < coords[5]); + assert!(coords[0] <= point.coords()[0] && point.coords()[0] < coords[3]); + assert!(coords[1] <= point.coords()[1] && point.coords()[1] < coords[4]); + assert!(coords[2] <= point.coords()[2] && point.coords()[2] < coords[5]); // Now compute the box. } diff --git a/src/octree.rs b/src/octree.rs index ebc8ea1..ccae541 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -4,7 +4,7 @@ use std::collections::HashSet; use crate::{ constants::{DEEPEST_LEVEL, NSIBLINGS}, - geometry::PhysicalBox, + geometry::{PhysicalBox, Point}, morton::MortonKey, parsort::parsort, tools::{ @@ -21,12 +21,10 @@ use rand::Rng; /// Compute the global bounding box across all points on all processes. pub fn compute_global_bounding_box( - points: &[f64], + points: &[Point], comm: &C, ) -> PhysicalBox { // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); - let points: &[[f64; 3]] = bytemuck::cast_slice(points); // Now compute the minimum and maximum across each dimension. @@ -40,9 +38,9 @@ pub fn compute_global_bounding_box( let mut zmax = f64::MIN; for point in points { - let x = point[0]; - let y = point[1]; - let z = point[2]; + let x = point.coords()[0]; + let y = point.coords()[1]; + let z = point.coords()[2]; xmin = f64::min(xmin, x); xmax = f64::max(xmax, x); @@ -102,13 +100,10 @@ pub fn compute_global_bounding_box( /// Convert points to Morton keys on specified level. pub fn points_to_morton( - points: &[f64], + points: &[Point], max_level: usize, comm: &C, ) -> (Vec, PhysicalBox) { - // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); - // Make sure that max level never exceeds DEEPEST_LEVEL let max_level = if max_level > DEEPEST_LEVEL as usize { DEEPEST_LEVEL as usize @@ -122,8 +117,6 @@ pub fn points_to_morton( // Bunch the points in arrays of 3. 
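    // Each point is mapped independently to the Morton key of the box on `max_level`
    // that contains it.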
- let points: &[[f64; 3]] = bytemuck::cast_slice(points); - let keys = points .iter() .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) @@ -651,6 +644,70 @@ pub fn is_linear_tree(arr: &[MortonKey], comm: &C) - global_is_linear } +/// Redistribute points with respect to a given coarse tree +pub fn redistribute_points_with_respect_to_coarse_tree( + points: &[Point], + morton_keys_for_points: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> (Vec, Vec) { + pub fn argsort(arr: &[T]) -> Vec { + let mut sort_indices = (0..arr.len()).collect_vec(); + sort_indices.sort_unstable_by_key(|&index| arr[index]); + sort_indices + } + + pub fn reorder(arr: &[T], permutation: &[usize]) -> Vec { + let mut reordered = Vec::::with_capacity(arr.len()); + for &index in permutation.iter() { + reordered.push(arr[index]) + } + reordered + } + + assert_eq!(points.len(), morton_keys_for_points.len()); + + let size = comm.size(); + + if size == 1 { + return (points.to_vec(), morton_keys_for_points.to_vec()); + } + + let sort_indices = argsort(&morton_keys_for_points); + let sorted_keys = reorder(&morton_keys_for_points, &sort_indices); + let sorted_points = reorder(&points, &sort_indices); + + // Now get the bins + + let my_first = coarse_tree.first().unwrap(); + + let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); + + // We now sort the morton indices into the bins. + + // This will store for each rank how many keys will be assigned to it. + + let counts = sort_to_bins(&sorted_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); + + // We now redistribute the points and the corresponding keys. + + let (distributed_points, distributed_keys) = ( + redistribute(&sorted_points, &counts, comm), + redistribute(&sorted_keys, &counts, comm), + ); + + // Now sort the distributed points and keys internally again. + + let sort_indices = argsort(&distributed_keys); + let sorted_keys = reorder(&distributed_keys, &sort_indices); + let sorted_points = reorder(&distributed_points, &sort_indices); + + (sorted_points, sorted_keys) +} + /// Return true on all ranks if distributed tree is complete. Otherwise, return false. pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { // First check that the local tree on each node is complete. diff --git a/src/serial.rs b/src/serial.rs index 64cdb08..d32b3ac 100644 --- a/src/serial.rs +++ b/src/serial.rs @@ -2,10 +2,9 @@ use crate::{ constants::{DEEPEST_LEVEL, NLEVELS}, - geometry::PhysicalBox, + geometry::{PhysicalBox, Point}, morton::MortonKey, }; -use bytemuck; use std::collections::HashMap; use vtkio; @@ -22,7 +21,7 @@ pub struct Neighbour { /// An octree pub struct Octree { leaf_keys: Vec, - points: Vec<[f64; 3]>, + points: Vec, point_to_level_keys: [Vec; NLEVELS], bounding_box: PhysicalBox, key_counts: HashMap, @@ -32,7 +31,7 @@ pub struct Octree { impl Octree { /// Create octress from points - pub fn from_points(points: &[f64], max_level: usize, max_points_per_box: usize) -> Self { + pub fn from_points(points: &[Point], max_level: usize, max_points_per_box: usize) -> Self { // Make sure that the points array is a multiple of 3. assert_eq!(points.len() % 3, 0); @@ -49,7 +48,6 @@ impl Octree { // Bunch the points in arrays of 3. - let points: &[[f64; 3]] = bytemuck::cast_slice(points); let npoints = points.len(); // We create a vector of keys for each point on each level. 
We compute the @@ -160,7 +158,7 @@ impl Octree { } /// Points - pub fn points(&self) -> &Vec<[f64; 3]> { + pub fn points(&self) -> &Vec { &self.points } @@ -264,14 +262,16 @@ impl Octree { #[cfg(test)] mod test { + use crate::geometry::Point; + use super::Octree; use rand::prelude::*; - fn get_points_on_sphere(npoints: usize) -> Vec { + fn get_points_on_sphere(npoints: usize) -> Vec { let mut rng = rand::rngs::StdRng::seed_from_u64(0); let normal = rand_distr::Normal::new(0.0, 1.0).unwrap(); - let mut points = Vec::::with_capacity(3 * npoints); + let mut points = Vec::::with_capacity(npoints); for _ in 0..(npoints) { let x: f64 = normal.sample(&mut rng); let y: f64 = normal.sample(&mut rng); @@ -279,9 +279,7 @@ mod test { let norm = (x * x + y * y + z * z).sqrt(); - points.push(x / norm); - points.push(y / norm); - points.push(z / norm); + points.push(Point::new([x / norm, y / norm, z / norm], 0)); } points diff --git a/src/tools.rs b/src/tools.rs index 075804a..56e0905 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -12,6 +12,7 @@ use rand::Rng; use crate::{ constants::{DEEPEST_LEVEL, LEVEL_SIZE}, + geometry::Point, morton::MortonKey, }; @@ -372,6 +373,25 @@ pub fn generate_random_keys(nkeys: usize, rng: &mut R) -> Vec result } +/// Generate random points for testing. +pub fn generate_random_points( + npoints: usize, + rng: &mut R, + comm: &C, +) -> Vec { + let mut points = Vec::::with_capacity(npoints); + let rank = comm.rank() as usize; + + for index in 0..npoints { + points.push(Point::new( + [rng.gen(), rng.gen(), rng.gen()], + npoints * rank + index, + )); + } + + points +} + /// Compute displacements from a vector of counts. /// /// This is useful for global MPI varcount operations. Let From a9caa553e2a742ed8265f3a72753b929be6b10fd Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Tue, 24 Sep 2024 23:11:24 +0100 Subject: [PATCH 32/42] Interior node topology --- src/geometry.rs | 2 - src/octree.rs | 863 +++++--------------------------------- src/octree/parallel.rs | 912 +++++++++++++++++++++++++++++++++++++++++ src/serial.rs | 1 - src/tools.rs | 8 +- 5 files changed, 1016 insertions(+), 770 deletions(-) create mode 100644 src/octree/parallel.rs diff --git a/src/geometry.rs b/src/geometry.rs index 544e4d5..62b9753 100644 --- a/src/geometry.rs +++ b/src/geometry.rs @@ -43,8 +43,6 @@ impl PhysicalBox { /// Give a slice of points. Compute an associated bounding box. pub fn from_points(points: &[Point]) -> PhysicalBox { - assert_eq!(points.len() % 3, 0); - let mut xmin = f64::MAX; let mut xmax = f64::MIN; diff --git a/src/octree.rs b/src/octree.rs index ccae541..6f175cc 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,816 +1,147 @@ -//! Parallel Octree structure - +pub mod parallel; use std::collections::HashSet; +use mpi::traits::CommunicatorCollectives; +pub use parallel::*; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + use crate::{ - constants::{DEEPEST_LEVEL, NSIBLINGS}, + constants::DEEPEST_LEVEL, geometry::{PhysicalBox, Point}, morton::MortonKey, - parsort::parsort, - tools::{ - communicate_back, gather_to_all, gather_to_root, global_inclusive_cumsum, redistribute, - sort_to_bins, - }, }; -use mpi::traits::Root; - -use itertools::{izip, Itertools}; -use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; -use rand::Rng; - -/// Compute the global bounding box across all points on all processes. -pub fn compute_global_bounding_box( - points: &[Point], - comm: &C, -) -> PhysicalBox { - // Make sure that the points array is a multiple of 3. 
- - // Now compute the minimum and maximum across each dimension. - - let mut xmin = f64::MAX; - let mut xmax = f64::MIN; - - let mut ymin = f64::MAX; - let mut ymax = f64::MIN; - - let mut zmin = f64::MAX; - let mut zmax = f64::MIN; - - for point in points { - let x = point.coords()[0]; - let y = point.coords()[1]; - let z = point.coords()[2]; - - xmin = f64::min(xmin, x); - xmax = f64::max(xmax, x); - - ymin = f64::min(ymin, y); - ymax = f64::max(ymax, y); - - zmin = f64::min(zmin, z); - zmax = f64::max(zmax, z); - } - - let mut global_xmin = 0.0; - let mut global_xmax = 0.0; - - let mut global_ymin = 0.0; - let mut global_ymax = 0.0; - - let mut global_zmin = 0.0; - let mut global_zmax = 0.0; - - comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); - comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); - - comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); - comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); - - comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); - comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); - - let xdiam = global_xmax - global_xmin; - let ydiam = global_ymax - global_ymin; - let zdiam = global_zmax - global_zmin; - - let xmean = global_xmin + 0.5 * xdiam; - let ymean = global_ymin + 0.5 * ydiam; - let zmean = global_zmin + 0.5 * zdiam; - - // We increase diameters by box size on deepest level - // and use the maximum diameter to compute a - // cubic bounding box. - - let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; - - let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); - - let max_diam = max_diam * (1.0 + deepest_box_diam); - - PhysicalBox::new([ - xmean - 0.5 * max_diam, - ymean - 0.5 * max_diam, - zmean - 0.5 * max_diam, - xmean + 0.5 * max_diam, - ymean + 0.5 * max_diam, - zmean + 0.5 * max_diam, - ]) -} - -/// Convert points to Morton keys on specified level. -pub fn points_to_morton( - points: &[Point], - max_level: usize, - comm: &C, -) -> (Vec, PhysicalBox) { - // Make sure that max level never exceeds DEEPEST_LEVEL - let max_level = if max_level > DEEPEST_LEVEL as usize { - DEEPEST_LEVEL as usize - } else { - max_level - }; - - // Compute the physical bounding box. - - let bounding_box = compute_global_bounding_box(points, comm); - - // Bunch the points in arrays of 3. - - let keys = points - .iter() - .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) - .collect_vec(); - - (keys, bounding_box) +/// A general structure for octrees. +pub struct Octree<'o, C> { + points: Vec, + point_keys: Vec, + coarse_tree: Vec, + leaf_tree: Vec, + coarse_tree_bounds: Vec, + bounding_box: PhysicalBox, + comm: &'o C, } -/// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree. -/// The returned coarse tree is load balanced according to the number of linear keys in each coarse block. -pub fn compute_coarse_tree( - linear_keys: &[MortonKey], - comm: &C, -) -> Vec { - let size = comm.size(); +impl<'o, C: CommunicatorCollectives> Octree<'o, C> { + /// Create a new distributed Octree. + pub fn new(points: &[Point], max_level: usize, max_leaf_points: usize, comm: &'o C) -> Self { + // We need a random number generator for sorting. For simplicity we use a ChaCha8 random number generator + // seeded with the rank of the process. 
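+        // Seeding with the rank keeps runs reproducible while giving each rank its own
+        // random stream for use in `parsort`.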
+ let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); - debug_assert!(is_linear_tree(linear_keys, comm)); + // First compute the Morton keys of the points. + let (point_keys, bounding_box) = points_to_morton(points, DEEPEST_LEVEL as usize, comm); - // On a single node a complete coarse tree is simply the root. - if size == 1 { - return vec![MortonKey::root()]; - } - - let mut completed_region = linear_keys - .first() - .unwrap() - .fill_between_keys(*linear_keys.last().unwrap()); + // Generate the coarse tree - completed_region.insert(0, *linear_keys.first().unwrap()); - completed_region.push(*linear_keys.last().unwrap()); + let (coarse_tree, leaf_tree) = { + // Linearize the keys. + let linear_keys = linearize(&point_keys, &mut rng, comm); - // Get the smallest level members of the completed region. + // Compute the first version of the coarse tree without load balancing. + let coarse_tree = compute_coarse_tree(&linear_keys, comm); + debug_assert!(is_complete_linear_tree(&coarse_tree, comm)); - let min_level = completed_region - .iter() - .map(|elem| elem.level()) - .min() - .unwrap(); + // We now compute the weights for the initial coarse tree. - // Each process selects its largest boxes. These are used to create - // a coarse tree. + let weights = compute_coarse_tree_weights(&linear_keys, &coarse_tree, comm); - let largest_boxes = completed_region - .iter() - .filter(|elem| elem.level() == min_level) - .copied() - .collect_vec(); + // We now load balance the initial coarse tree. This forms our final coarse tree + // that is used from now on. - debug_assert!(is_linear_tree(&largest_boxes, comm)); + let coarse_tree = load_balance(&coarse_tree, &weights, comm); - complete_tree(&largest_boxes, comm) -} - -/// Compute the weights of each coarse tree block as the number of linear keys associated with each coarse block. -pub fn compute_coarse_tree_weights( - linear_keys: &[MortonKey], - coarse_tree: &[MortonKey], - comm: &C, -) -> Vec { - let rank = comm.rank(); - // We want to partition the coarse tree. But we need the correct weights. The idea - // is that we use the number of original leafs that intersect with the coarse tree - // as leafs. In order to compute this we send the coarse tree around to all processes - // so that each process computes for each coarse tree element how many of its keys - // intersect with each node of the coarse tree. We then sum up the local weight for each - // coarse tree node across all nodes to get the weight. - - let global_coarse_tree = gather_to_all(&coarse_tree, comm); - - // We also want to send around a corresponding array of ranks so that for each global coarse tree key - // we have the rank of where it originates from. - - let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); - - // We now compute the local weights. - let mut local_weight_contribution = vec![0 as usize; global_coarse_tree.len()]; - - // In the following loop we want to be a bit smart. We do not iterate through all the local elements. - // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region - // of our sorted keys that overlaps with the coarse tree region. - - // Let's find the start of our region. The start of our region is a coarse key that is an ancestor - // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. 
- - let first_key = *linear_keys.first().unwrap(); - - let first_coarse_index = global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) - .count(); - - // Now we need to find the end index of our region. For this again we find the index of our coarse tree that - // is an ancestor of our last key. - let last_key = *linear_keys.last().unwrap(); - - let last_coarse_index = global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) - .count(); - - // We now only need to iterate through between the first and last coarse index in the coarse tree. - // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). - - for (w, &global_coarse_key) in izip!( - local_weight_contribution[first_coarse_index..=last_coarse_index].iter_mut(), - global_coarse_tree[first_coarse_index..=last_coarse_index].iter() - ) { - *w += linear_keys - .iter() - .filter(|&&key| global_coarse_key.is_ancestor(key)) - .count(); - } - - // We now need to sum up the weights across all processes. - - let mut global_weights = vec![0 as usize; global_coarse_tree.len()]; - - comm.all_reduce_into( - &local_weight_contribution, - &mut global_weights, - SystemOperation::sum(), - ); - - // Each process now has all weights. However, we only need the ones for the current process. - // So we just filter the rest out. - - izip!(coarse_tree_ranks, global_weights) - .filter_map(|(r, weight)| { - if r == rank as usize { - Some(weight) - } else { - None - } - }) - .collect_vec() -} - -/// Redistribute sorted keys with respect to a linear coarse tree. -pub fn redistribute_with_respect_to_coarse_tree( - linear_keys: &[MortonKey], - coarse_tree: &[MortonKey], - comm: &C, -) -> Vec { - let size = comm.size(); - - if size == 1 { - return linear_keys.to_vec(); - } + // We also want to redistribute the fine keys with respect to the load balanced coarse trees. - // We want to globally redistribute keys so that the keys on each process are descendents - // of the local coarse tree keys. + let fine_keys = + redistribute_with_respect_to_coarse_tree(&linear_keys, &coarse_tree, comm); - // We are using here the fact that the coarse tree is complete and sorted. - // We are sending around to each process the first local index. This - // defines bins in which we sort our keys. The keys are then sent around to the correct - // processes via an alltoallv operation. + // We now create the refined tree by recursing the coarse tree until we are at max level + // or the fine tree keys per coarse tree box is small enough. + let refined_tree = + create_local_tree(&fine_keys, &coarse_tree, max_level, max_leaf_points); - let my_first = coarse_tree.first().unwrap(); + // We now need to 2:1 balance the refined tree and then redistribute again with respect to the coarse tree. - let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); - - // We now have our bins. We go through our keys and store how - // many keys are assigned to each rank. We are using here that - // our keys and the coarse tree are both sorted. - - // This will store for each rank how many keys will be assigned to it. - - let rank_counts = sort_to_bins(linear_keys, &global_bins) - .iter() - .map(|&elem| elem as i32) - .collect_vec(); - - // We now have the counts for each rank. Let's redistribute accordingly and return. 
- - let result = redistribute(&linear_keys, &rank_counts, comm); - - #[cfg(debug_assertions)] - { - // Check through that the first and last key of result are descendents - // of the first and last coarse bloack. - debug_assert!(coarse_tree - .first() - .unwrap() - .is_ancestor(*result.first().unwrap())); - debug_assert!(coarse_tree - .last() - .unwrap() - .is_ancestor(*result.last().unwrap())); - } - - result -} - -/// Return a complete tree generated from local keys and associated coarse keys. -/// -/// The coarse keys are refined until the maximum level is reached or until each coarse key -/// is the ancestor of at most `max_keys` fine keys. -/// It is assumed that the level of the fine keys is at least as large as `max_level`. -pub fn create_local_tree( - sorted_fine_keys: &[MortonKey], - coarse_keys: &[MortonKey], - mut max_level: usize, - max_keys: usize, -) -> Vec { - if max_level > DEEPEST_LEVEL as usize { - max_level = DEEPEST_LEVEL as usize; - } - - // We split the sorted fine keys into subslices so that each subslice - // is associated with a coarse slice. - - let bins = coarse_keys.to_vec(); - - let counts = sort_to_bins(&sorted_fine_keys, &bins); - - // We now know how many fine keys are associated with each coarse block. We iterate - // through and locally refine for each block that requires it. - - let mut remainder = sorted_fine_keys; - let mut new_coarse_keys = Vec::::new(); - - for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { - let current; - (current, remainder) = remainder.split_at(count); - if coarse_key.level() < max_level && current.len() > max_keys { - // We need to refine the current split. - new_coarse_keys.extend_from_slice( - create_local_tree( - current, - coarse_key.children().as_slice(), - max_level, - max_keys, - ) - .as_slice(), + let refined_tree = redistribute_with_respect_to_coarse_tree( + &balance(&refined_tree, &mut rng, comm), + &coarse_tree, + comm, ); - } else { - new_coarse_keys.push(coarse_key) - } - } - - new_coarse_keys -} - -/// Linearize a set of weighted Morton keys. -pub fn linearize( - keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> Vec { - let size = comm.size(); - let rank = comm.rank(); - - // If we only have one process we use the standard serial linearization. - - if size == 1 { - return MortonKey::linearize(keys); - } - - // We are first sorting the keys. Then in a linear process across all processors we - // go through the arrays and delete ancestors of nodes. - - let sorted_keys = parsort(&keys, comm, rng); - - // Each process needs to send its first element to the previous process. Each process - // then goes through its own list and retains elements that are not ancestors of the - // next element. - - let mut result = Vec::::new(); - - let next_key = communicate_back(&sorted_keys, comm); - - // Treat the local keys - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } - } - - // If we are at the last process simply push the last key. - // Otherwise check whether it might be the ancestor of `next_key`, - // the first key on the next process. If yes, don't push it. Otherwise do. 
- - if rank == size - 1 { - result.push(*sorted_keys.last().unwrap()); - } else { - let last = *sorted_keys.last().unwrap(); - if !last.is_ancestor(next_key.unwrap()) { - result.push(last); - } - } - - debug_assert!(is_linear_tree(&result, comm)); - - result -} - -/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. -pub fn load_balance( - sorted_keys: &[MortonKey], - weights: &[usize], - comm: &C, -) -> Vec { - assert_eq!(sorted_keys.len(), weights.len()); - - let size = comm.size(); - let rank = comm.rank(); - - // If we only have one process we simply return. - - if size == 1 { - return sorted_keys.to_vec(); - } - - // First scan the weight. - // We scan the local arrays, then use a global scan operation on the last element - // of each array to get the global sums and then we update the array of each rank - // with the sum from the previous ranks. - let scan = global_inclusive_cumsum(&weights, comm); + (coarse_tree, refined_tree) - // Now broadcast the total weight to all processes. + // redistribute the balanced tree according to coarse tree + }; - let mut total_weight = if rank == size - 1 { - *scan.last().unwrap() - } else { - 0 - }; - - comm.process_at_rank(size - 1) - .broadcast_into(&mut total_weight); - - let w = total_weight / (size as usize); - let k = total_weight % (size as usize); - - // Sort the elements into bins according to which process they should be sent. - // We do not need to sort the Morton keys themselves into bins but the scanned weights. - // The corresponding counts are the right counts for the Morton keys. - - let mut bins = Vec::::with_capacity(size as usize); - - for p in 1..=size as usize { - if p <= k { - bins.push((p - 1) * (1 + w)); - } else { - bins.push((p - 1) * w + k); - } - } - - let counts = sort_to_bins(&scan, &bins) - .iter() - .map(|elem| *elem as i32) - .collect_vec(); - - // Now distribute the data with an all to all v. - // We create a vector of how many elements to send to each process and - // then send the actual data. - - let mut recvbuffer = redistribute(&sorted_keys, &counts, comm); - - recvbuffer.sort_unstable(); - recvbuffer -} - -/// Given a distributed set of linear keys, generate a complete tree. -pub fn complete_tree( - linear_keys: &[MortonKey], - comm: &C, -) -> Vec { - let mut linear_keys = linear_keys.to_vec(); - - debug_assert!(is_linear_tree(&linear_keys, comm)); - - let size = comm.size(); - let rank = comm.rank(); - - if size == 1 { - return MortonKey::complete_tree(linear_keys.as_slice()); - } - - // Now insert on the first and last process the first and last child of the - // finest ancestor of first/last box on deepest level - - // Send first element to previous rank and insert into local keys. - // On the first process we also need to insert the first child of the finest - // ancestor of the deepest first key and first element. Correspondingly on the last process - // we need to insert the last child of the finest ancester of the deepest last key and last element. - - let next_key = communicate_back(&linear_keys, comm); - - if rank < size - 1 { - linear_keys.push(next_key.unwrap()); - } - - // Now fix the first key on the first rank. 
- - if rank == 0 { - let first_key = linear_keys.first().unwrap(); - let deepest_first = MortonKey::deepest_first(); - if !first_key.is_ancestor(deepest_first) { - let ancestor = deepest_first.finest_common_ancestor(*first_key); - linear_keys.insert(0, ancestor.children()[0]); - } - } - - if rank == size - 1 { - let last_key = linear_keys.last().unwrap(); - let deepest_last = MortonKey::deepest_last(); - if !last_key.is_ancestor(deepest_last) { - let ancestor = deepest_last.finest_common_ancestor(*last_key); - linear_keys.push(ancestor.children()[NSIBLINGS - 1]); - } - } - - // Now complete the regions defined by the keys on each process. - - let mut result = Vec::::new(); - - for (&key1, &key2) in linear_keys.iter().tuple_windows() { - result.push(key1); - result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); - } - - if rank == size - 1 { - result.push(*linear_keys.last().unwrap()); - } - - debug_assert!(is_complete_linear_tree(&result, comm)); - - result -} - -/// Balance a distributed tree. -pub fn balance( - linear_keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> Vec { - let deepest_level = deepest_level(linear_keys, comm); - - // Start with keys at deepest level - let mut work_list = linear_keys - .iter() - .copied() - .filter(|&key| key.level() == deepest_level) - .collect_vec(); - - let mut result = Vec::::new(); - - // Now go through and make sure that for each key siblings and neighbours of parents are added - - for level in (1..=deepest_level).rev() { - let mut parents = HashSet::::new(); - let mut new_work_list = Vec::::new(); - // We filter the work list by level and also make sure that - // only one sibling of each of the parents children is added to - // our current level list. - for key in work_list.iter() { - let parent = key.parent(); - if !parents.contains(&parent) { - parents.insert(parent); - result.extend_from_slice(key.siblings().as_slice()); - new_work_list.extend_from_slice( - parent - .neighbours() - .iter() - .copied() - .filter(|&key| key.is_valid()) - .collect_vec() - .as_slice(), - ); - } - } - new_work_list.extend( - linear_keys - .iter() - .copied() - .filter(|&key| key.level() == level - 1), + let (points, point_keys) = redistribute_points_with_respect_to_coarse_tree( + points, + &point_keys, + &coarse_tree, + comm, ); - work_list = new_work_list; - // Now extend the work list with the - } - - let result = linearize(&result, rng, comm); + let coarse_tree_bounds = get_tree_bins(&coarse_tree, comm); - debug_assert!(is_complete_linear_and_balanced(&result, comm)); - result -} - -/// Return true if the keys are linear. -pub fn is_linear_tree(arr: &[MortonKey], comm: &C) -> bool { - let mut is_linear = true; - - for (&key1, &key2) in arr.iter().tuple_windows() { - if key1 >= key2 || key1.is_ancestor(key2) { - is_linear = false; - break; + Self { + points: points.to_vec(), + point_keys, + coarse_tree, + leaf_tree, + coarse_tree_bounds, + bounding_box, + comm, } } - if comm.size() == 1 { - return is_linear; + /// Return the keys associated with the redistributed points. + pub fn point_keys(&self) -> &Vec { + &self.point_keys } - // Now check the interfaces - - if let Some(next_key) = communicate_back(arr, comm) { - let last = *arr.last().unwrap(); - if last >= next_key || last.is_ancestor(next_key) { - is_linear = false; - } + /// Return the bounding box. 
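+    /// This is the global cubic box, computed across all ranks, that encloses every point.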
+ pub fn bounding_box(&self) -> &PhysicalBox { + &self.bounding_box } - let mut global_is_linear = false; - - comm.all_reduce_into( - &is_linear, - &mut global_is_linear, - SystemOperation::logical_and(), - ); - - global_is_linear -} - -/// Redistribute points with respect to a given coarse tree -pub fn redistribute_points_with_respect_to_coarse_tree( - points: &[Point], - morton_keys_for_points: &[MortonKey], - coarse_tree: &[MortonKey], - comm: &C, -) -> (Vec, Vec) { - pub fn argsort(arr: &[T]) -> Vec { - let mut sort_indices = (0..arr.len()).collect_vec(); - sort_indices.sort_unstable_by_key(|&index| arr[index]); - sort_indices - } - - pub fn reorder(arr: &[T], permutation: &[usize]) -> Vec { - let mut reordered = Vec::::with_capacity(arr.len()); - for &index in permutation.iter() { - reordered.push(arr[index]) - } - reordered - } - - assert_eq!(points.len(), morton_keys_for_points.len()); - - let size = comm.size(); - - if size == 1 { - return (points.to_vec(), morton_keys_for_points.to_vec()); + /// Return the associated coarse tree. + pub fn coarse_tree(&self) -> &Vec { + &self.coarse_tree } - let sort_indices = argsort(&morton_keys_for_points); - let sorted_keys = reorder(&morton_keys_for_points, &sort_indices); - let sorted_points = reorder(&points, &sort_indices); - - // Now get the bins - - let my_first = coarse_tree.first().unwrap(); - - let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); - - // We now sort the morton indices into the bins. - - // This will store for each rank how many keys will be assigned to it. - - let counts = sort_to_bins(&sorted_keys, &global_bins) - .iter() - .map(|&elem| elem as i32) - .collect_vec(); - - // We now redistribute the points and the corresponding keys. - - let (distributed_points, distributed_keys) = ( - redistribute(&sorted_points, &counts, comm), - redistribute(&sorted_keys, &counts, comm), - ); - - // Now sort the distributed points and keys internally again. - - let sort_indices = argsort(&distributed_keys); - let sorted_keys = reorder(&distributed_keys, &sort_indices); - let sorted_points = reorder(&distributed_points, &sort_indices); - - (sorted_points, sorted_keys) -} - -/// Return true on all ranks if distributed tree is complete. Otherwise, return false. -pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { - // First check that the local tree on each node is complete. - - let mut complete_linear = true; - for (key1, key2) in arr.iter().tuple_windows() { - // Make sure that the keys are sorted and not duplicated. - if key1 >= key2 { - complete_linear = false; - break; - } - // The next key should be an ancestor of the next non-descendent key. - if let Some(expected_next) = key1.next_non_descendent_key() { - if !key2.is_ancestor(expected_next) { - complete_linear = false; - break; - } - } else { - // Only for the very last key there should not be a next non-descendent key. - complete_linear = false; - } + /// Return the points. + /// + /// Points are distributed across the nodes as part of the tree generation. + /// This function returns the redistributed points. + pub fn points(&self) -> &Vec { + &self.points } - // We now check the interfaces. - - if let Some(next_first) = communicate_back(arr, comm) { - // We are on any but the last rank - let last_key = arr.last().unwrap(); - - // Check that the keys are sorted and not duplicated. - if *last_key >= next_first { - complete_linear = false; - } - - // Check that the next key is an encestor of the next non-descendent. 
- if let Some(expected_next) = last_key.next_non_descendent_key() { - if !next_first.is_ancestor(expected_next) { - complete_linear = false; - } - } else { - complete_linear = false; - } - } else { - // We are on the last rank - // Check that the last key is ancestor of deepest last. - if !arr.last().unwrap().is_ancestor(MortonKey::deepest_last()) { - complete_linear = false; - } + /// Return the leaf tree. + pub fn leaf_tree(&self) -> &Vec { + &self.leaf_tree } - // Now check that at the first rank we include the deepest first. - - if comm.rank() == 0 { - if !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { - complete_linear = false; - } + /// Get the coarse tree bounds. + /// + /// This returns an array of size the number of ranks, + /// where each element consists of the smallest Morton key in + /// the corresponding rank. + pub fn coarse_tree_bounds(&self) -> &Vec { + &self.coarse_tree_bounds } - // Now communicate everything together. - - let mut result = false; - comm.all_reduce_into( - &complete_linear, - &mut result, - SystemOperation::logical_and(), - ); - - result -} - -/// Return the deepest level of a distributed list of Morton keys. -pub fn deepest_level(keys: &[MortonKey], comm: &C) -> usize { - let local_deepest_level = keys.iter().map(|elem| elem.level()).max().unwrap(); - - if comm.size() == 1 { - return local_deepest_level; + /// Return the communicator. + pub fn comm(&self) -> &C { + self.comm } - let mut global_deepest_level: usize = 0; + pub fn generate_status(&self) { + let mut keys_with_status = HashSet::::new(); - comm.all_reduce_into( - &local_deepest_level, - &mut global_deepest_level, - SystemOperation::max(), - ); + // Start from the leafs and work up the tree. - global_deepest_level -} - -/// Check if tree is balanced. -pub fn is_complete_linear_and_balanced( - arr: &[MortonKey], - comm: &C, -) -> bool { - // Send the tree to the root node and check there that it is balanced. - - let mut balanced = false; - - if let Some(arr) = gather_to_root(arr, comm) { - balanced = MortonKey::is_complete_linear_and_balanced(&arr); + for leaf in self.leaf_tree() {} } - - comm.process_at_rank(0).broadcast_into(&mut balanced); - - balanced } diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs new file mode 100644 index 0000000..2981a08 --- /dev/null +++ b/src/octree/parallel.rs @@ -0,0 +1,912 @@ +//! Parallel Octree structure + +use std::collections::{HashMap, HashSet}; + +use crate::{ + constants::{DEEPEST_LEVEL, NSIBLINGS}, + geometry::{PhysicalBox, Point}, + morton::MortonKey, + parsort::parsort, + tools::{ + communicate_back, gather_to_all, gather_to_root, global_inclusive_cumsum, redistribute, + sort_to_bins, + }, +}; + +use mpi::traits::{Equivalence, Root}; + +use itertools::{izip, Itertools}; +use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; +use rand::Rng; + +/// Structure to store ghost keys and their original ranks. +/// +/// The status is +/// - 0 for a local interior node. +/// - 1 for a local leaf node. +/// - 2 for a global node. +/// - 3 for a ghost node. +#[derive(Copy, Clone, Equivalence)] +pub struct KeyWithStatus { + key: MortonKey, + // Ideally we would use a typed enum that + // combines rank and status. But this is not + // supported by the rsmpi Equivalence Macro. + status: usize, + rank: usize, +} + +impl KeyWithStatus { + /// Create a new ghost. 
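+    /// `status` encodes the node type as listed above and `rank` stores the originating rank.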
+ pub fn new(key: MortonKey, status: usize, rank: usize) -> Self { + Self { key, status, rank } + } +} + +/// Compute the global bounding box across all points on all processes. +pub fn compute_global_bounding_box( + points: &[Point], + comm: &C, +) -> PhysicalBox { + // Make sure that the points array is a multiple of 3. + + // Now compute the minimum and maximum across each dimension. + + let mut xmin = f64::MAX; + let mut xmax = f64::MIN; + + let mut ymin = f64::MAX; + let mut ymax = f64::MIN; + + let mut zmin = f64::MAX; + let mut zmax = f64::MIN; + + for point in points { + let x = point.coords()[0]; + let y = point.coords()[1]; + let z = point.coords()[2]; + + xmin = f64::min(xmin, x); + xmax = f64::max(xmax, x); + + ymin = f64::min(ymin, y); + ymax = f64::max(ymax, y); + + zmin = f64::min(zmin, z); + zmax = f64::max(zmax, z); + } + + let mut global_xmin = 0.0; + let mut global_xmax = 0.0; + + let mut global_ymin = 0.0; + let mut global_ymax = 0.0; + + let mut global_zmin = 0.0; + let mut global_zmax = 0.0; + + comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); + comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); + + comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); + comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); + + comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); + comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); + + let xdiam = global_xmax - global_xmin; + let ydiam = global_ymax - global_ymin; + let zdiam = global_zmax - global_zmin; + + let xmean = global_xmin + 0.5 * xdiam; + let ymean = global_ymin + 0.5 * ydiam; + let zmean = global_zmin + 0.5 * zdiam; + + // We increase diameters by box size on deepest level + // and use the maximum diameter to compute a + // cubic bounding box. + + let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; + + let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); + + let max_diam = max_diam * (1.0 + deepest_box_diam); + + PhysicalBox::new([ + xmean - 0.5 * max_diam, + ymean - 0.5 * max_diam, + zmean - 0.5 * max_diam, + xmean + 0.5 * max_diam, + ymean + 0.5 * max_diam, + zmean + 0.5 * max_diam, + ]) +} + +/// Convert points to Morton keys on specified level. +pub fn points_to_morton( + points: &[Point], + max_level: usize, + comm: &C, +) -> (Vec, PhysicalBox) { + // Make sure that max level never exceeds DEEPEST_LEVEL + let max_level = if max_level > DEEPEST_LEVEL as usize { + DEEPEST_LEVEL as usize + } else { + max_level + }; + + // Compute the physical bounding box. + + let bounding_box = compute_global_bounding_box(points, comm); + + // Bunch the points in arrays of 3. + + let keys = points + .iter() + .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) + .collect_vec(); + + (keys, bounding_box) +} + +/// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree. +/// The returned coarse tree is load balanced according to the number of linear keys in each coarse block. +pub fn compute_coarse_tree( + linear_keys: &[MortonKey], + comm: &C, +) -> Vec { + let size = comm.size(); + + debug_assert!(is_linear_tree(linear_keys, comm)); + + // On a single node a complete coarse tree is simply the root. 
+ if size == 1 { + return vec![MortonKey::root()]; + } + + let mut completed_region = linear_keys + .first() + .unwrap() + .fill_between_keys(*linear_keys.last().unwrap()); + + completed_region.insert(0, *linear_keys.first().unwrap()); + completed_region.push(*linear_keys.last().unwrap()); + + // Get the smallest level members of the completed region. + + let min_level = completed_region + .iter() + .map(|elem| elem.level()) + .min() + .unwrap(); + + // Each process selects its largest boxes. These are used to create + // a coarse tree. + + let largest_boxes = completed_region + .iter() + .filter(|elem| elem.level() == min_level) + .copied() + .collect_vec(); + + debug_assert!(is_linear_tree(&largest_boxes, comm)); + + complete_tree(&largest_boxes, comm) +} + +/// Compute the weights of each coarse tree block as the number of linear keys associated with each coarse block. +pub fn compute_coarse_tree_weights( + linear_keys: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> Vec { + let rank = comm.rank(); + // We want to partition the coarse tree. But we need the correct weights. The idea + // is that we use the number of original leafs that intersect with the coarse tree + // as leafs. In order to compute this we send the coarse tree around to all processes + // so that each process computes for each coarse tree element how many of its keys + // intersect with each node of the coarse tree. We then sum up the local weight for each + // coarse tree node across all nodes to get the weight. + + let global_coarse_tree = gather_to_all(&coarse_tree, comm); + + // We also want to send around a corresponding array of ranks so that for each global coarse tree key + // we have the rank of where it originates from. + + let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); + + // We now compute the local weights. + let mut local_weight_contribution = vec![0 as usize; global_coarse_tree.len()]; + + // In the following loop we want to be a bit smart. We do not iterate through all the local elements. + // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region + // of our sorted keys that overlaps with the coarse tree region. + + // Let's find the start of our region. The start of our region is a coarse key that is an ancestor + // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. + + let first_key = *linear_keys.first().unwrap(); + + let first_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) + .count(); + + // Now we need to find the end index of our region. For this again we find the index of our coarse tree that + // is an ancestor of our last key. + let last_key = *linear_keys.last().unwrap(); + + let last_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) + .count(); + + // We now only need to iterate through between the first and last coarse index in the coarse tree. + // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). + + for (w, &global_coarse_key) in izip!( + local_weight_contribution[first_coarse_index..=last_coarse_index].iter_mut(), + global_coarse_tree[first_coarse_index..=last_coarse_index].iter() + ) { + *w += linear_keys + .iter() + .filter(|&&key| global_coarse_key.is_ancestor(key)) + .count(); + } + + // We now need to sum up the weights across all processes. 
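+    // The element-wise all-reduce gives every rank, for each global coarse key, the total
+    // number of fine keys across all processes that are descendants of that key.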
+ + let mut global_weights = vec![0 as usize; global_coarse_tree.len()]; + + comm.all_reduce_into( + &local_weight_contribution, + &mut global_weights, + SystemOperation::sum(), + ); + + // Each process now has all weights. However, we only need the ones for the current process. + // So we just filter the rest out. + + izip!(coarse_tree_ranks, global_weights) + .filter_map(|(r, weight)| { + if r == rank as usize { + Some(weight) + } else { + None + } + }) + .collect_vec() +} + +/// Redistribute sorted keys with respect to a linear coarse tree. +pub fn redistribute_with_respect_to_coarse_tree( + linear_keys: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> Vec { + let size = comm.size(); + + if size == 1 { + return linear_keys.to_vec(); + } + + // We want to globally redistribute keys so that the keys on each process are descendents + // of the local coarse tree keys. + + // We are using here the fact that the coarse tree is complete and sorted. + // We are sending around to each process the first local index. This + // defines bins in which we sort our keys. The keys are then sent around to the correct + // processes via an alltoallv operation. + + let my_first = coarse_tree.first().unwrap(); + + let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); + + // We now have our bins. We go through our keys and store how + // many keys are assigned to each rank. We are using here that + // our keys and the coarse tree are both sorted. + + // This will store for each rank how many keys will be assigned to it. + + let rank_counts = sort_to_bins(linear_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); + + // We now have the counts for each rank. Let's redistribute accordingly and return. + + let result = redistribute(&linear_keys, &rank_counts, comm); + + #[cfg(debug_assertions)] + { + // Check through that the first and last key of result are descendents + // of the first and last coarse bloack. + debug_assert!(coarse_tree + .first() + .unwrap() + .is_ancestor(*result.first().unwrap())); + debug_assert!(coarse_tree + .last() + .unwrap() + .is_ancestor(*result.last().unwrap())); + } + + result +} + +/// Return a complete tree generated from local keys and associated coarse keys. +/// +/// The coarse keys are refined until the maximum level is reached or until each coarse key +/// is the ancestor of at most `max_keys` fine keys. +/// It is assumed that the level of the fine keys is at least as large as `max_level`. +pub fn create_local_tree( + sorted_fine_keys: &[MortonKey], + coarse_keys: &[MortonKey], + mut max_level: usize, + max_keys: usize, +) -> Vec { + if max_level > DEEPEST_LEVEL as usize { + max_level = DEEPEST_LEVEL as usize; + } + + // We split the sorted fine keys into subslices so that each subslice + // is associated with a coarse slice. + + let bins = coarse_keys.to_vec(); + + let counts = sort_to_bins(&sorted_fine_keys, &bins); + + // We now know how many fine keys are associated with each coarse block. We iterate + // through and locally refine for each block that requires it. + + let mut remainder = sorted_fine_keys; + let mut new_coarse_keys = Vec::::new(); + + for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { + let current; + (current, remainder) = remainder.split_at(count); + if coarse_key.level() < max_level && current.len() > max_keys { + // We need to refine the current split. 
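+            // We recurse with the eight children of the coarse key as the new bins, so the
+            // fine keys in `current` are split one level deeper.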
+ new_coarse_keys.extend_from_slice( + create_local_tree( + current, + coarse_key.children().as_slice(), + max_level, + max_keys, + ) + .as_slice(), + ); + } else { + new_coarse_keys.push(coarse_key) + } + } + + new_coarse_keys +} + +/// Linearize a set of weighted Morton keys. +pub fn linearize( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let size = comm.size(); + let rank = comm.rank(); + + // If we only have one process we use the standard serial linearization. + + if size == 1 { + return MortonKey::linearize(keys); + } + + // We are first sorting the keys. Then in a linear process across all processors we + // go through the arrays and delete ancestors of nodes. + + let sorted_keys = parsort(&keys, comm, rng); + + // Each process needs to send its first element to the previous process. Each process + // then goes through its own list and retains elements that are not ancestors of the + // next element. + + let mut result = Vec::::new(); + + let next_key = communicate_back(&sorted_keys, comm); + + // Treat the local keys + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); + } + } + + // If we are at the last process simply push the last key. + // Otherwise check whether it might be the ancestor of `next_key`, + // the first key on the next process. If yes, don't push it. Otherwise do. + + if rank == size - 1 { + result.push(*sorted_keys.last().unwrap()); + } else { + let last = *sorted_keys.last().unwrap(); + if !last.is_ancestor(next_key.unwrap()) { + result.push(last); + } + } + + debug_assert!(is_linear_tree(&result, comm)); + + result +} + +/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. +pub fn load_balance( + sorted_keys: &[MortonKey], + weights: &[usize], + comm: &C, +) -> Vec { + assert_eq!(sorted_keys.len(), weights.len()); + + let size = comm.size(); + let rank = comm.rank(); + + // If we only have one process we simply return. + + if size == 1 { + return sorted_keys.to_vec(); + } + + // First scan the weight. + // We scan the local arrays, then use a global scan operation on the last element + // of each array to get the global sums and then we update the array of each rank + // with the sum from the previous ranks. + + let scan = global_inclusive_cumsum(&weights, comm); + + // Now broadcast the total weight to all processes. + + let mut total_weight = if rank == size - 1 { + *scan.last().unwrap() + } else { + 0 + }; + + comm.process_at_rank(size - 1) + .broadcast_into(&mut total_weight); + + let w = total_weight / (size as usize); + let k = total_weight % (size as usize); + + // Sort the elements into bins according to which process they should be sent. + // We do not need to sort the Morton keys themselves into bins but the scanned weights. + // The corresponding counts are the right counts for the Morton keys. + + let mut bins = Vec::::with_capacity(size as usize); + + for p in 1..=size as usize { + if p <= k { + bins.push((p - 1) * (1 + w)); + } else { + bins.push((p - 1) * w + k); + } + } + + let counts = sort_to_bins(&scan, &bins) + .iter() + .map(|elem| *elem as i32) + .collect_vec(); + + // Now distribute the data with an all to all v. + // We create a vector of how many elements to send to each process and + // then send the actual data. 
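+    // `counts[p]` is the number of local keys whose scanned weight falls into the bin of
+    // rank p; `redistribute` then performs the corresponding all-to-allv exchange.
+    // Hypothetical example: with total_weight = 10 and size = 3 we get w = 3, k = 1 and
+    // bins = [0, 4, 7], so each rank receives a weight of roughly total_weight / size.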
+ + let mut recvbuffer = redistribute(&sorted_keys, &counts, comm); + + recvbuffer.sort_unstable(); + recvbuffer +} + +/// Given a distributed set of linear keys, generate a complete tree. +pub fn complete_tree( + linear_keys: &[MortonKey], + comm: &C, +) -> Vec { + let mut linear_keys = linear_keys.to_vec(); + + debug_assert!(is_linear_tree(&linear_keys, comm)); + + let size = comm.size(); + let rank = comm.rank(); + + if size == 1 { + return MortonKey::complete_tree(linear_keys.as_slice()); + } + + // Now insert on the first and last process the first and last child of the + // finest ancestor of first/last box on deepest level + + // Send first element to previous rank and insert into local keys. + // On the first process we also need to insert the first child of the finest + // ancestor of the deepest first key and first element. Correspondingly on the last process + // we need to insert the last child of the finest ancester of the deepest last key and last element. + + let next_key = communicate_back(&linear_keys, comm); + + if rank < size - 1 { + linear_keys.push(next_key.unwrap()); + } + + // Now fix the first key on the first rank. + + if rank == 0 { + let first_key = linear_keys.first().unwrap(); + let deepest_first = MortonKey::deepest_first(); + if !first_key.is_ancestor(deepest_first) { + let ancestor = deepest_first.finest_common_ancestor(*first_key); + linear_keys.insert(0, ancestor.children()[0]); + } + } + + if rank == size - 1 { + let last_key = linear_keys.last().unwrap(); + let deepest_last = MortonKey::deepest_last(); + if !last_key.is_ancestor(deepest_last) { + let ancestor = deepest_last.finest_common_ancestor(*last_key); + linear_keys.push(ancestor.children()[NSIBLINGS - 1]); + } + } + + // Now complete the regions defined by the keys on each process. + + let mut result = Vec::::new(); + + for (&key1, &key2) in linear_keys.iter().tuple_windows() { + result.push(key1); + result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); + } + + if rank == size - 1 { + result.push(*linear_keys.last().unwrap()); + } + + debug_assert!(is_complete_linear_tree(&result, comm)); + + result +} + +/// Balance a distributed tree. +pub fn balance( + linear_keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let deepest_level = deepest_level(linear_keys, comm); + + // Start with keys at deepest level + let mut work_list = linear_keys + .iter() + .copied() + .filter(|&key| key.level() == deepest_level) + .collect_vec(); + + let mut result = Vec::::new(); + + // Now go through and make sure that for each key siblings and neighbours of parents are added + + for level in (1..=deepest_level).rev() { + let mut parents = HashSet::::new(); + let mut new_work_list = Vec::::new(); + // We filter the work list by level and also make sure that + // only one sibling of each of the parents children is added to + // our current level list. 
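        // Pushing each key's siblings into the result and queueing the neighbours of
        // its parent is what propagates the balance condition from the current level
        // to the next coarser one.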
+ for key in work_list.iter() { + let parent = key.parent(); + if !parents.contains(&parent) { + parents.insert(parent); + result.extend_from_slice(key.siblings().as_slice()); + new_work_list.extend_from_slice( + parent + .neighbours() + .iter() + .copied() + .filter(|&key| key.is_valid()) + .collect_vec() + .as_slice(), + ); + } + } + new_work_list.extend( + linear_keys + .iter() + .copied() + .filter(|&key| key.level() == level - 1), + ); + + work_list = new_work_list; + // Now extend the work list with the + } + + let result = linearize(&result, rng, comm); + + debug_assert!(is_complete_linear_and_balanced(&result, comm)); + result +} + +/// Return true if the keys are linear. +pub fn is_linear_tree(arr: &[MortonKey], comm: &C) -> bool { + let mut is_linear = true; + + for (&key1, &key2) in arr.iter().tuple_windows() { + if key1 >= key2 || key1.is_ancestor(key2) { + is_linear = false; + break; + } + } + + if comm.size() == 1 { + return is_linear; + } + + // Now check the interfaces + + if let Some(next_key) = communicate_back(arr, comm) { + let last = *arr.last().unwrap(); + if last >= next_key || last.is_ancestor(next_key) { + is_linear = false; + } + } + + let mut global_is_linear = false; + + comm.all_reduce_into( + &is_linear, + &mut global_is_linear, + SystemOperation::logical_and(), + ); + + global_is_linear +} + +/// Redistribute points with respect to a given coarse tree +pub fn redistribute_points_with_respect_to_coarse_tree( + points: &[Point], + morton_keys_for_points: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> (Vec, Vec) { + pub fn argsort(arr: &[T]) -> Vec { + let mut sort_indices = (0..arr.len()).collect_vec(); + sort_indices.sort_unstable_by_key(|&index| arr[index]); + sort_indices + } + + pub fn reorder(arr: &[T], permutation: &[usize]) -> Vec { + let mut reordered = Vec::::with_capacity(arr.len()); + for &index in permutation.iter() { + reordered.push(arr[index]) + } + reordered + } + + assert_eq!(points.len(), morton_keys_for_points.len()); + + let size = comm.size(); + + if size == 1 { + return (points.to_vec(), morton_keys_for_points.to_vec()); + } + + let sort_indices = argsort(&morton_keys_for_points); + let sorted_keys = reorder(&morton_keys_for_points, &sort_indices); + let sorted_points = reorder(&points, &sort_indices); + + // Now get the bins + + let my_first = coarse_tree.first().unwrap(); + + let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); + + // We now sort the morton indices into the bins. + + // This will store for each rank how many keys will be assigned to it. + + let counts = sort_to_bins(&sorted_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); + + // We now redistribute the points and the corresponding keys. + + let (distributed_points, distributed_keys) = ( + redistribute(&sorted_points, &counts, comm), + redistribute(&sorted_keys, &counts, comm), + ); + + // Now sort the distributed points and keys internally again. + + let sort_indices = argsort(&distributed_keys); + let sorted_keys = reorder(&distributed_keys, &sort_indices); + let sorted_points = reorder(&distributed_points, &sort_indices); + + (sorted_points, sorted_keys) +} + +/// Return true on all ranks if distributed tree is complete. Otherwise, return false. +pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { + // First check that the local tree on each node is complete. 
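    // Completeness means that there are no gaps: for consecutive keys the second one
    // must be an ancestor of (or equal to) the first key's next non-descendent key,
    // so that the region between them is fully covered.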
+ + let mut complete_linear = true; + for (key1, key2) in arr.iter().tuple_windows() { + // Make sure that the keys are sorted and not duplicated. + if key1 >= key2 { + complete_linear = false; + break; + } + // The next key should be an ancestor of the next non-descendent key. + if let Some(expected_next) = key1.next_non_descendent_key() { + if !key2.is_ancestor(expected_next) { + complete_linear = false; + break; + } + } else { + // Only for the very last key there should not be a next non-descendent key. + complete_linear = false; + } + } + + // We now check the interfaces. + + if let Some(next_first) = communicate_back(arr, comm) { + // We are on any but the last rank + let last_key = arr.last().unwrap(); + + // Check that the keys are sorted and not duplicated. + if *last_key >= next_first { + complete_linear = false; + } + + // Check that the next key is an encestor of the next non-descendent. + if let Some(expected_next) = last_key.next_non_descendent_key() { + if !next_first.is_ancestor(expected_next) { + complete_linear = false; + } + } else { + complete_linear = false; + } + } else { + // We are on the last rank + // Check that the last key is ancestor of deepest last. + if !arr.last().unwrap().is_ancestor(MortonKey::deepest_last()) { + complete_linear = false; + } + } + + // Now check that at the first rank we include the deepest first. + + if comm.rank() == 0 { + if !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { + complete_linear = false; + } + } + + // Now communicate everything together. + + let mut result = false; + comm.all_reduce_into( + &complete_linear, + &mut result, + SystemOperation::logical_and(), + ); + + result +} + +/// Return the deepest level of a distributed list of Morton keys. +pub fn deepest_level(keys: &[MortonKey], comm: &C) -> usize { + let local_deepest_level = keys.iter().map(|elem| elem.level()).max().unwrap(); + + if comm.size() == 1 { + return local_deepest_level; + } + + let mut global_deepest_level: usize = 0; + + comm.all_reduce_into( + &local_deepest_level, + &mut global_deepest_level, + SystemOperation::max(), + ); + + global_deepest_level +} + +/// Check if tree is balanced. +pub fn is_complete_linear_and_balanced( + arr: &[MortonKey], + comm: &C, +) -> bool { + // Send the tree to the root node and check there that it is balanced. + + let mut balanced = false; + + if let Some(arr) = gather_to_root(arr, comm) { + balanced = MortonKey::is_complete_linear_and_balanced(&arr); + } + + comm.process_at_rank(0).broadcast_into(&mut balanced); + + balanced +} + +/// For a complete linear bin get on each process the first key of all processes. +/// +/// This information can be used to query on which process a key is living. +pub fn get_tree_bins( + complete_linear_tree: &[MortonKey], + comm: &C, +) -> Vec { + gather_to_all( + std::slice::from_ref(complete_linear_tree.first().unwrap()), + comm, + ) +} + +/// For a sorted array return either position of the key or positioin directly before search key. +pub fn get_key_index(arr: &[MortonKey], key: MortonKey) -> usize { + // Does a binary search of the key. If the key is found with Ok(..) + // the exact index is returned of the found key. If the key is not found + // the closest larger index is returned. So we subtract one to get the closest + // smaller index. + + match arr.binary_search(&key) { + Ok(index) => index, + Err(index) => index - 1, + } +} + +/// Check if a key is associated with the current rank. +/// +/// Note that the key does not need to exist as leaf. 
It just needs +/// to be descendent of a coarse key on the current rank. +pub fn key_on_current_rank( + key: MortonKey, + coarse_tree_bounds: &[MortonKey], + rank: usize, + size: usize, +) -> bool { + if rank == size - 1 { + key >= *coarse_tree_bounds.last().unwrap() + } else { + coarse_tree_bounds[rank] <= key && key < coarse_tree_bounds[rank + 1] + } +} + +#[cfg(test)] +mod test { + use crate::{ + octree::get_key_index, + tools::{generate_random_keys, seeded_rng}, + }; + + #[test] + fn test_get_key_rank() { + let mut rng = seeded_rng(0); + + let mut keys = generate_random_keys(50, &mut rng); + + keys.sort_unstable(); + + let mid = keys[25]; + + assert_eq!(25, get_key_index(&keys, mid)); + + // Now remove the mid index and do the same again. + + keys.remove(25); + + // The result should be 24. + + assert_eq!(24, get_key_index(&keys, mid)); + } +} diff --git a/src/serial.rs b/src/serial.rs index d32b3ac..0249bb8 100644 --- a/src/serial.rs +++ b/src/serial.rs @@ -33,7 +33,6 @@ impl Octree { /// Create octress from points pub fn from_points(points: &[Point], max_level: usize, max_points_per_box: usize) -> Self { // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); // Make sure that max level never exceeds DEEPEST_LEVEL let max_level = if max_level > DEEPEST_LEVEL as usize { diff --git a/src/tools.rs b/src/tools.rs index 56e0905..dec8779 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -8,7 +8,8 @@ use mpi::{ traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source}, }; use num::traits::Zero; -use rand::Rng; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; use crate::{ constants::{DEEPEST_LEVEL, LEVEL_SIZE}, @@ -392,6 +393,11 @@ pub fn generate_random_points( points } +/// Get a seeded rng +pub fn seeded_rng(seed: usize) -> ChaCha8Rng { + ChaCha8Rng::seed_from_u64(seed as u64) +} + /// Compute displacements from a vector of counts. /// /// This is useful for global MPI varcount operations. Let From 31765dbed9eb207faaf7bb6c61a840acc9fb14e3 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Wed, 25 Sep 2024 12:05:23 +0100 Subject: [PATCH 33/42] WIP: Ghosts --- src/octree.rs | 95 +++++++++++++++++++++++++++++++++++++++--- src/octree/parallel.rs | 24 ----------- 2 files changed, 90 insertions(+), 29 deletions(-) diff --git a/src/octree.rs b/src/octree.rs index 6f175cc..b0ac95c 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,7 +1,7 @@ pub mod parallel; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; -use mpi::traits::CommunicatorCollectives; +use mpi::traits::{CommunicatorCollectives, Equivalence}; pub use parallel::*; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; @@ -10,8 +10,17 @@ use crate::{ constants::DEEPEST_LEVEL, geometry::{PhysicalBox, Point}, morton::MortonKey, + tools::gather_to_all, }; +#[derive(PartialEq, Eq, Hash, Copy, Clone)] +pub enum KeyStatus { + LocalLeaf, + LocalInterior, + Global, + Ghost(usize), +} + /// A general structure for octrees. pub struct Octree<'o, C> { points: Vec, @@ -84,6 +93,8 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { let coarse_tree_bounds = get_tree_bins(&coarse_tree, comm); + // Duplicate the coarse tree across all nodes + Self { points: points.to_vec(), point_keys, @@ -137,11 +148,85 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { self.comm } - pub fn generate_status(&self) { - let mut keys_with_status = HashSet::::new(); + /// Generate all leaf and interior keys. 
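    ///
    /// Each key is classified as a local leaf, a local interior key, a global key
    /// (an ancestor of the coarse tree that may exist on several ranks), or a ghost
    /// key originating from another rank.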
+ pub fn generate_all_keys(&self) -> HashMap { + let rank = self.comm().rank() as usize; + let size = self.comm().size() as usize; + + let mut all_keys = HashMap::::new(); // Start from the leafs and work up the tree. - for leaf in self.leaf_tree() {} + // First deal with the parents of the coarse tree. These are different + // as they may exist on multiple nodes, so receive a different label. + + let mut leaf_keys: HashSet = + HashSet::from_iter(self.leaf_tree().iter().copied()); + + for &key in self.coarse_tree() { + // Need to distingush if coarse tree node is already a leaf or not. + if leaf_keys.contains(&key) { + all_keys.insert(key, KeyStatus::LocalLeaf); + leaf_keys.remove(&key); + } else { + all_keys.insert(key, KeyStatus::LocalInterior); + } + + // We now iterate the parents of the coarse tree. There is no guarantee + // that the parents only exist on a single rank. Hence, they get the `Global` + // tag. + + let mut parent = key.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + all_keys.insert(parent, KeyStatus::Global); + parent = parent.parent(); + } + } + + // We now deal with the fine leafs and their ancestors. + + for leaf in leaf_keys { + debug_assert!(!all_keys.contains_key(&leaf)); + all_keys.insert(leaf, KeyStatus::LocalLeaf); + let mut parent = leaf.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + all_keys.insert(parent, KeyStatus::LocalInterior); + parent = parent.parent(); + } + } + + // This maps from rank to the keys that we want to send to the ranks + let mut rank_key = HashMap::>::new(); + for index in 0..size - 1 { + rank_key.insert(index, Vec::::new()); + } + + for (&key, &status) in all_keys.iter() { + // We need not send around global keys to neighbors. + if status == KeyStatus::Global { + continue; + } + for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { + // Get rank of the neighbour + let neighbour_rank = get_key_index(&self.coarse_tree_bounds(), neighbor); + rank_key.entry(neighbour); + } + } + + // We now know which key needs to be sent to which rank. + // Turn to array, get the counts and send around. + + let (arr, counts) = { + let mut arr = Vec::::new(); + let mut counts = Vec::::new(); + for index in 0..size - 1 { + let value = rank_key.get(&index).unwrap(); + arr.extend(value.iter()); + counts.push(value.len()); + } + (arr, counts) + } + + all_keys } } diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs index 2981a08..28949c3 100644 --- a/src/octree/parallel.rs +++ b/src/octree/parallel.rs @@ -19,30 +19,6 @@ use itertools::{izip, Itertools}; use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; use rand::Rng; -/// Structure to store ghost keys and their original ranks. -/// -/// The status is -/// - 0 for a local interior node. -/// - 1 for a local leaf node. -/// - 2 for a global node. -/// - 3 for a ghost node. -#[derive(Copy, Clone, Equivalence)] -pub struct KeyWithStatus { - key: MortonKey, - // Ideally we would use a typed enum that - // combines rank and status. But this is not - // supported by the rsmpi Equivalence Macro. - status: usize, - rank: usize, -} - -impl KeyWithStatus { - /// Create a new ghost. - pub fn new(key: MortonKey, status: usize, rank: usize) -> Self { - Self { key, status, rank } - } -} - /// Compute the global bounding box across all points on all processes. 
pub fn compute_global_bounding_box( points: &[Point], From 17d44c60e7eae0919ad57e7befac81f7ce82c166 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Tue, 1 Oct 2024 12:32:14 +0100 Subject: [PATCH 34/42] WIP: Neighbour keys --- src/octree.rs | 57 ++++++++++++++++++++++++++---------------- src/octree/parallel.rs | 8 +++--- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/src/octree.rs b/src/octree.rs index b0ac95c..8a40463 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,9 +1,10 @@ pub mod parallel; use std::collections::{HashMap, HashSet}; +use itertools::Itertools; use mpi::traits::{CommunicatorCollectives, Equivalence}; pub use parallel::*; -use rand::{Rng, SeedableRng}; +use rand::SeedableRng; use rand_chacha::ChaCha8Rng; use crate::{ @@ -95,6 +96,8 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { // Duplicate the coarse tree across all nodes + // let coarse_tree = gather_to_all(&coarse_tree, comm); + Self { points: points.to_vec(), point_keys, @@ -154,36 +157,40 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { let size = self.comm().size() as usize; let mut all_keys = HashMap::::new(); + let mut leaf_keys: HashSet = + HashSet::from_iter(self.leaf_tree().iter().copied()); - // Start from the leafs and work up the tree. + let mut global_keys = HashSet::::new(); // First deal with the parents of the coarse tree. These are different // as they may exist on multiple nodes, so receive a different label. - let mut leaf_keys: HashSet = - HashSet::from_iter(self.leaf_tree().iter().copied()); - for &key in self.coarse_tree() { - // Need to distingush if coarse tree node is already a leaf or not. - if leaf_keys.contains(&key) { - all_keys.insert(key, KeyStatus::LocalLeaf); - leaf_keys.remove(&key); - } else { - all_keys.insert(key, KeyStatus::LocalInterior); - } - - // We now iterate the parents of the coarse tree. There is no guarantee - // that the parents only exist on a single rank. Hence, they get the `Global` - // tag. - let mut parent = key.parent(); while parent.level() > 0 && !all_keys.contains_key(&parent) { - all_keys.insert(parent, KeyStatus::Global); + global_keys.insert(parent); parent = parent.parent(); } } + // We now send around the parents of the coarse tree to every node. These will + // be global keys. + + let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), self.comm()); + + // We can now insert the global keys into `all_keys` with the `Global` label. + // There may be duplicates in the `global_keys` array. So need to check for that. + + for &key in &global_keys { + if !all_keys.contains_key(&key) { + all_keys.insert(key, KeyStatus::Global); + } + } + // We now deal with the fine leafs and their ancestors. + // The leafs of the coarse tree will also be either part + // of the fine tree leafs or will be interior keys. In either + // case the following loop catches them. for leaf in leaf_keys { debug_assert!(!all_keys.contains_key(&leaf)); @@ -207,9 +214,17 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { continue; } for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { + // If the neighbour is a global key then continue. 
+ if let Some(&value) = all_keys.get(&neighbor) { + if value == KeyStatus::Global { + continue; + } + } // Get rank of the neighbour - let neighbour_rank = get_key_index(&self.coarse_tree_bounds(), neighbor); - rank_key.entry(neighbour); + let neighbor_rank = get_key_index(&self.coarse_tree_bounds(), neighbor); + rank_key + .entry(neighbor_rank) + .and_modify(|keys| keys.push(key)); } } @@ -225,7 +240,7 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { counts.push(value.len()); } (arr, counts) - } + }; all_keys } diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs index 28949c3..5f8348f 100644 --- a/src/octree/parallel.rs +++ b/src/octree/parallel.rs @@ -336,14 +336,14 @@ pub fn create_local_tree( // through and locally refine for each block that requires it. let mut remainder = sorted_fine_keys; - let mut new_coarse_keys = Vec::::new(); + let mut refined_keys = Vec::::new(); for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { let current; (current, remainder) = remainder.split_at(count); if coarse_key.level() < max_level && current.len() > max_keys { // We need to refine the current split. - new_coarse_keys.extend_from_slice( + refined_keys.extend_from_slice( create_local_tree( current, coarse_key.children().as_slice(), @@ -353,11 +353,11 @@ pub fn create_local_tree( .as_slice(), ); } else { - new_coarse_keys.push(coarse_key) + refined_keys.push(coarse_key) } } - new_coarse_keys + refined_keys } /// Linearize a set of weighted Morton keys. From baa5bb91062241e20ad808e7b3ffa8e560db5a0f Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 3 Oct 2024 21:02:34 +0100 Subject: [PATCH 35/42] Testing creation of all keys. --- examples/mpi_coarse_tree.rs | 5 +- examples/mpi_complete_tree.rs | 2 +- examples/mpi_cumsum.rs | 4 +- src/octree.rs | 113 ++++---------------------- src/octree/parallel.rs | 146 ++++++++++++++++++++++++++++++---- src/tools.rs | 10 +-- 6 files changed, 159 insertions(+), 121 deletions(-) diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs index a79626d..b8f1095 100644 --- a/examples/mpi_coarse_tree.rs +++ b/examples/mpi_coarse_tree.rs @@ -10,7 +10,10 @@ use bempp_octree::{ }, tools::{communicate_back, generate_random_points, global_size, is_sorted_array}, }; -use mpi::{collective::SystemOperation, traits::*}; +use mpi::{ + collective::SystemOperation, + traits::{Communicator, CommunicatorCollectives}, +}; use rand::prelude::*; use rand_chacha::ChaCha8Rng; diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index d47fc6a..b0a5f43 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -5,7 +5,7 @@ use bempp_octree::{ octree::{complete_tree, is_complete_linear_tree, linearize, points_to_morton}, tools::generate_random_points, }; -use mpi::traits::*; +use mpi::traits::Communicator; use rand::prelude::*; use rand_chacha::ChaCha8Rng; diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs index ab9e1b1..1b3fcc3 100644 --- a/examples/mpi_cumsum.rs +++ b/examples/mpi_cumsum.rs @@ -2,7 +2,7 @@ use bempp_octree::tools::{gather_to_root, global_inclusive_cumsum}; use itertools::{izip, Itertools}; -use mpi::traits::*; +use mpi::traits::Communicator; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -42,7 +42,7 @@ pub fn main() { let expected_cum_sum = original_array .iter() .scan(0, |state, x| { - *state = *x + *state; + *state += *x; Some(*state) }) .collect_vec(); diff --git a/src/octree.rs b/src/octree.rs index 8a40463..537fa1f 100644 --- a/src/octree.rs +++ b/src/octree.rs 
@@ -1,8 +1,8 @@ +//! Definition of Octree. pub mod parallel; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; -use itertools::Itertools; -use mpi::traits::{CommunicatorCollectives, Equivalence}; +use mpi::traits::CommunicatorCollectives; pub use parallel::*; use rand::SeedableRng; use rand_chacha::ChaCha8Rng; @@ -11,14 +11,18 @@ use crate::{ constants::DEEPEST_LEVEL, geometry::{PhysicalBox, Point}, morton::MortonKey, - tools::gather_to_all, }; +/// Stores what type of key it is. #[derive(PartialEq, Eq, Hash, Copy, Clone)] -pub enum KeyStatus { +pub enum KeyType { + /// A local leaf. LocalLeaf, + /// A local interior key. LocalInterior, + /// A global key. Global, + /// A ghost key from a specific process. Ghost(usize), } @@ -29,6 +33,7 @@ pub struct Octree<'o, C> { coarse_tree: Vec, leaf_tree: Vec, coarse_tree_bounds: Vec, + all_keys: HashMap, bounding_box: PhysicalBox, comm: &'o C, } @@ -98,12 +103,15 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { // let coarse_tree = gather_to_all(&coarse_tree, comm); + let all_keys = generate_all_keys(&leaf_tree, &coarse_tree, &coarse_tree_bounds, comm); + Self { points: points.to_vec(), point_keys, coarse_tree, leaf_tree, coarse_tree_bounds, + all_keys, bounding_box, comm, } @@ -151,97 +159,8 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { self.comm } - /// Generate all leaf and interior keys. - pub fn generate_all_keys(&self) -> HashMap { - let rank = self.comm().rank() as usize; - let size = self.comm().size() as usize; - - let mut all_keys = HashMap::::new(); - let mut leaf_keys: HashSet = - HashSet::from_iter(self.leaf_tree().iter().copied()); - - let mut global_keys = HashSet::::new(); - - // First deal with the parents of the coarse tree. These are different - // as they may exist on multiple nodes, so receive a different label. - - for &key in self.coarse_tree() { - let mut parent = key.parent(); - while parent.level() > 0 && !all_keys.contains_key(&parent) { - global_keys.insert(parent); - parent = parent.parent(); - } - } - - // We now send around the parents of the coarse tree to every node. These will - // be global keys. - - let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), self.comm()); - - // We can now insert the global keys into `all_keys` with the `Global` label. - // There may be duplicates in the `global_keys` array. So need to check for that. - - for &key in &global_keys { - if !all_keys.contains_key(&key) { - all_keys.insert(key, KeyStatus::Global); - } - } - - // We now deal with the fine leafs and their ancestors. - // The leafs of the coarse tree will also be either part - // of the fine tree leafs or will be interior keys. In either - // case the following loop catches them. - - for leaf in leaf_keys { - debug_assert!(!all_keys.contains_key(&leaf)); - all_keys.insert(leaf, KeyStatus::LocalLeaf); - let mut parent = leaf.parent(); - while parent.level() > 0 && !all_keys.contains_key(&parent) { - all_keys.insert(parent, KeyStatus::LocalInterior); - parent = parent.parent(); - } - } - - // This maps from rank to the keys that we want to send to the ranks - let mut rank_key = HashMap::>::new(); - for index in 0..size - 1 { - rank_key.insert(index, Vec::::new()); - } - - for (&key, &status) in all_keys.iter() { - // We need not send around global keys to neighbors. - if status == KeyStatus::Global { - continue; - } - for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { - // If the neighbour is a global key then continue. 
- if let Some(&value) = all_keys.get(&neighbor) { - if value == KeyStatus::Global { - continue; - } - } - // Get rank of the neighbour - let neighbor_rank = get_key_index(&self.coarse_tree_bounds(), neighbor); - rank_key - .entry(neighbor_rank) - .and_modify(|keys| keys.push(key)); - } - } - - // We now know which key needs to be sent to which rank. - // Turn to array, get the counts and send around. - - let (arr, counts) = { - let mut arr = Vec::::new(); - let mut counts = Vec::::new(); - for index in 0..size - 1 { - let value = rank_key.get(&index).unwrap(); - arr.extend(value.iter()); - counts.push(value.len()); - } - (arr, counts) - }; - - all_keys + /// Return a map of all keys. + pub fn all_keys(&self) -> &HashMap { + &self.all_keys } } diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs index 5f8348f..1b873fb 100644 --- a/src/octree/parallel.rs +++ b/src/octree/parallel.rs @@ -19,6 +19,8 @@ use itertools::{izip, Itertools}; use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; use rand::Rng; +use super::KeyType; + /// Compute the global bounding box across all points on all processes. pub fn compute_global_bounding_box( points: &[Point], @@ -184,7 +186,7 @@ pub fn compute_coarse_tree_weights( // intersect with each node of the coarse tree. We then sum up the local weight for each // coarse tree node across all nodes to get the weight. - let global_coarse_tree = gather_to_all(&coarse_tree, comm); + let global_coarse_tree = gather_to_all(coarse_tree, comm); // We also want to send around a corresponding array of ranks so that for each global coarse tree key // we have the rank of where it originates from. @@ -192,7 +194,7 @@ pub fn compute_coarse_tree_weights( let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); // We now compute the local weights. - let mut local_weight_contribution = vec![0 as usize; global_coarse_tree.len()]; + let mut local_weight_contribution = vec![0; global_coarse_tree.len()]; // In the following loop we want to be a bit smart. We do not iterate through all the local elements. // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region @@ -232,7 +234,7 @@ pub fn compute_coarse_tree_weights( // We now need to sum up the weights across all processes. - let mut global_weights = vec![0 as usize; global_coarse_tree.len()]; + let mut global_weights = vec![0; global_coarse_tree.len()]; comm.all_reduce_into( &local_weight_contribution, @@ -291,7 +293,7 @@ pub fn redistribute_with_respect_to_coarse_tree( // We now have the counts for each rank. Let's redistribute accordingly and return. - let result = redistribute(&linear_keys, &rank_counts, comm); + let result = redistribute(linear_keys, &rank_counts, comm); #[cfg(debug_assertions)] { @@ -330,7 +332,7 @@ pub fn create_local_tree( let bins = coarse_keys.to_vec(); - let counts = sort_to_bins(&sorted_fine_keys, &bins); + let counts = sort_to_bins(sorted_fine_keys, &bins); // We now know how many fine keys are associated with each coarse block. We iterate // through and locally refine for each block that requires it. @@ -378,7 +380,7 @@ pub fn linearize( // We are first sorting the keys. Then in a linear process across all processors we // go through the arrays and delete ancestors of nodes. - let sorted_keys = parsort(&keys, comm, rng); + let sorted_keys = parsort(keys, comm, rng); // Each process needs to send its first element to the previous process. 
Each process // then goes through its own list and retains elements that are not ancestors of the @@ -438,7 +440,7 @@ pub fn load_balance( // of each array to get the global sums and then we update the array of each rank // with the sum from the previous ranks. - let scan = global_inclusive_cumsum(&weights, comm); + let scan = global_inclusive_cumsum(weights, comm); // Now broadcast the total weight to all processes. @@ -477,7 +479,7 @@ pub fn load_balance( // We create a vector of how many elements to send to each process and // then send the actual data. - let mut recvbuffer = redistribute(&sorted_keys, &counts, comm); + let mut recvbuffer = redistribute(sorted_keys, &counts, comm); recvbuffer.sort_unstable(); recvbuffer @@ -673,9 +675,9 @@ pub fn redistribute_points_with_respect_to_coarse_tree(arr: &[MortonKey], co // Now check that at the first rank we include the deepest first. - if comm.rank() == 0 { - if !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { - complete_linear = false; - } + if comm.rank() == 0 && !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { + complete_linear = false; } // Now communicate everything together. @@ -858,6 +858,122 @@ pub fn key_on_current_rank( } } +/// Generate all leaf and interior keys. +pub fn generate_all_keys( + leaf_tree: &[MortonKey], + coarse_tree: &[MortonKey], + coarse_tree_bounds: &[MortonKey], + comm: &C, +) -> HashMap { + /// This struct combines rank and key information for sending ghosts to neighbors. + #[derive(Copy, Clone, Equivalence)] + struct KeyWithRank { + key: MortonKey, + rank: usize, + } + + let rank = comm.rank() as usize; + let size = comm.size() as usize; + + let mut all_keys = HashMap::::new(); + let leaf_keys: HashSet = HashSet::from_iter(leaf_tree.iter().copied()); + + let mut global_keys = HashSet::::new(); + + // First deal with the parents of the coarse tree. These are different + // as they may exist on multiple nodes, so receive a different label. + + for &key in coarse_tree { + let mut parent = key.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + global_keys.insert(parent); + parent = parent.parent(); + } + } + + // We now send around the parents of the coarse tree to every node. These will + // be global keys. + + let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), comm); + + // We can now insert the global keys into `all_keys` with the `Global` label. + + for &key in &global_keys { + all_keys.entry(key).or_insert(KeyType::Global); + } + + // We now deal with the fine leafs and their ancestors. + // The leafs of the coarse tree will also be either part + // of the fine tree leafs or will be interior keys. In either + // case the following loop catches them. + + for leaf in leaf_keys { + debug_assert!(!all_keys.contains_key(&leaf)); + all_keys.insert(leaf, KeyType::LocalLeaf); + let mut parent = leaf.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + all_keys.insert(parent, KeyType::LocalInterior); + parent = parent.parent(); + } + } + + // This maps from rank to the keys that we want to send to the ranks + let mut rank_send_ghost = HashMap::>::new(); + for index in 0..size - 1 { + rank_send_ghost.insert(index, Vec::::new()); + } + + for (&key, &status) in all_keys.iter() { + // We need not send around global keys to neighbors. + if status == KeyType::Global { + continue; + } + for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { + // If the neighbour is a global key then continue. 
+ if let Some(&value) = all_keys.get(&neighbor) { + if value == KeyType::Global { + continue; + } + } + // Get rank of the neighbour + let neighbor_rank = get_key_index(coarse_tree_bounds, neighbor); + rank_send_ghost + .entry(neighbor_rank) + .and_modify(|keys| keys.push(KeyWithRank { key, rank })); + } + } + + // We now know which key needs to be sent to which rank. + // Turn to array, get the counts and send around. + + let (arr, counts) = { + let mut arr = Vec::::new(); + let mut counts = Vec::::new(); + for index in 0..size - 1 { + let keys = rank_send_ghost.get(&index).unwrap(); + arr.extend(keys.iter()); + counts.push(keys.len() as i32); + } + (arr, counts) + }; + + // These are all the keys that are neighbors to our keys. We now go through + // and store those that do not live on our tree as into `all_keys` with a label + // of `Ghost`. + let ghost_keys = redistribute(&arr, &counts, comm); + + for key in &ghost_keys { + if key.rank == rank { + // Don't need to add the keys that are already on the rank. + continue; + } + debug_assert!(!all_keys.contains_key(&key.key)); + all_keys.insert(key.key, KeyType::Ghost(key.rank)); + } + + all_keys +} + #[cfg(test)] mod test { use crate::{ diff --git a/src/tools.rs b/src/tools.rs index dec8779..a404f88 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -25,7 +25,7 @@ pub fn gather_to_all(arr: &[T], comm let local_len = arr.len() as i32; - let mut sizes = vec![0 as i32; size as usize]; + let mut sizes = vec![0; size as usize]; comm.all_gather_into(&local_len, &mut sizes); @@ -170,7 +170,7 @@ pub fn communicate_back( if rank == size - 1 { comm.process_at_rank(rank - 1).send(arr.first().unwrap()); - return None; + None } else { let (new_last, _status) = if rank > 0 { p2p::send_receive( @@ -221,9 +221,9 @@ pub fn redistribute( // First send the counts around via an alltoall operation. - let mut recv_counts = vec![0 as i32; counts.len()]; + let mut recv_counts = vec![0; counts.len()]; - comm.all_to_all_into(&counts[..], &mut recv_counts); + comm.all_to_all_into(counts, &mut recv_counts); // We have the recv_counts. Allocate space and setup the partitions. @@ -288,7 +288,7 @@ pub fn sort_to_bins(sorted_keys: &[T], bins: &[T]) -> Vec { return vec![sorted_keys.len(); 1]; } - let mut bin_counts = vec![0 as usize; nbins]; + let mut bin_counts = vec![0; nbins]; // This iterates over each possible bin and returns also the associated rank. // The last bin position is not iterated over since for an array with p elements From 6f01f6c585a51b98cb7afd73a32796bb2e537dac Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 11:06:41 +0100 Subject: [PATCH 36/42] Testing parallel octree --- examples/mpi_complete_tree.rs | 75 ++++++++++++++--- src/octree.rs | 6 +- src/octree/parallel.rs | 151 ++++++++++++++++++++-------------- src/tools.rs | 4 + 4 files changed, 162 insertions(+), 74 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index b0a5f43..78a0c23 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -1,10 +1,11 @@ -//! Test the computation of a global bounding box across MPI ranks. +//! Test the computation of a complete octree. 
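//!
//! The example builds a distributed octree from random points and checks that every
//! interior key and all of its neighbours are available locally, that the root key is
//! present, and that all ranks agree on the number of global keys.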
use bempp_octree::{ - constants::DEEPEST_LEVEL, - octree::{complete_tree, is_complete_linear_tree, linearize, points_to_morton}, - tools::generate_random_points, + morton::MortonKey, + octree::{is_complete_linear_and_balanced, KeyType, Octree}, + tools::{gather_to_all, generate_random_points}, }; +use itertools::Itertools; use mpi::traits::Communicator; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -20,21 +21,73 @@ pub fn main() { let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); // Create `npoints` per rank. - let npoints = 10; + let npoints = 10000; // Generate random points. let points = generate_random_points(npoints, &mut rng, &comm); - // Compute the Morton keys on the deepest level - let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); + let tree = Octree::new(&points, 15, 50, &comm); - let linear_keys = linearize(&keys, &mut rng, &comm); + // We now check that each node of the tree has all its neighbors available. - // Generate a complete tree - let distributed_complete_tree = complete_tree(&linear_keys, &comm); + let leaf_tree = tree.leaf_tree(); + let all_keys = tree.all_keys(); - assert!(is_complete_linear_tree(&distributed_complete_tree, &comm)); + assert!(is_complete_linear_and_balanced(leaf_tree, &comm)); + for &key in leaf_tree { + let mut parent = key; + while parent.level() > 0 { + // Check that the key itself is there. + assert!(all_keys.contains_key(&key)); + // Check that all its neighbours are there. + for neighbor in parent.neighbours().iter().filter(|&key| key.is_valid()) { + if !all_keys.contains_key(neighbor) { + println!( + "Missing neighbor: {}. Key type {:#?}", + neighbor, + all_keys.get(&parent).unwrap() + ); + } + assert!(all_keys.contains_key(neighbor)); + } + parent = parent.parent(); + // Check that the parent is there. + assert!(all_keys.contains_key(&parent)); + } + } + + // At the end check that the root of the tree is also contained. + assert!(all_keys.contains_key(&MortonKey::root())); + + // Count the number of ghosts on each rank + + // Count the number of global keys on each rank. + + // Assert that all ghosts are from a different rank and count them. + + let nghosts = all_keys + .iter() + .filter_map(|(_, &value)| { + if let KeyType::Ghost(rank) = value { + assert!(rank != comm.size() as usize); + Some(rank) + } else { + None + } + }) + .count(); + + let nglobal = all_keys + .iter() + .filter(|(_, &value)| matches!(value, KeyType::Global)) + .count(); + + // Assert that all globals across all ranks have the same count. + + let nglobals = gather_to_all(std::slice::from_ref(&nglobal), &comm); + + assert_eq!(nglobals.iter().unique().count(), 1); if comm.rank() == 0 { println!("Distributed tree is complete and linear."); diff --git a/src/octree.rs b/src/octree.rs index 537fa1f..5dbd076 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -14,7 +14,7 @@ use crate::{ }; /// Stores what type of key it is. -#[derive(PartialEq, Eq, Hash, Copy, Clone)] +#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] pub enum KeyType { /// A local leaf. LocalLeaf, @@ -55,7 +55,10 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { let linear_keys = linearize(&point_keys, &mut rng, comm); // Compute the first version of the coarse tree without load balancing. + // We want to ensure that it is 2:1 balanced. 
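        // (2:1 balanced: adjacent leaf octants differ by at most one refinement level.)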
let coarse_tree = compute_coarse_tree(&linear_keys, comm); + + let coarse_tree = balance(&coarse_tree, &mut rng, comm); debug_assert!(is_complete_linear_tree(&coarse_tree, comm)); // We now compute the weights for the initial coarse tree. @@ -66,7 +69,6 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { // that is used from now on. let coarse_tree = load_balance(&coarse_tree, &weights, comm); - // We also want to redistribute the fine keys with respect to the load balanced coarse trees. let fine_keys = diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs index 1b873fb..4a8a80f 100644 --- a/src/octree/parallel.rs +++ b/src/octree/parallel.rs @@ -559,6 +559,13 @@ pub fn balance( rng: &mut R, comm: &C, ) -> Vec { + // Treat the case that the length of the keys is one and is only the root. + // This would lead to an empty output below as we only iterate up to level 1. + + if linear_keys.len() == 1 && *linear_keys.first().unwrap() == MortonKey::root() { + return vec![MortonKey::root()]; + } + let deepest_level = deepest_level(linear_keys, comm); // Start with keys at deepest level @@ -602,7 +609,6 @@ pub fn balance( ); work_list = new_work_list; - // Now extend the work list with the } let result = linearize(&result, rng, comm); @@ -653,6 +659,10 @@ pub fn redistribute_points_with_respect_to_coarse_tree (Vec, Vec) { + if comm.size() == 1 { + return (points.to_vec(), morton_keys_for_points.to_vec()); + } + pub fn argsort(arr: &[T]) -> Vec { let mut sort_indices = (0..arr.len()).collect_vec(); sort_indices.sort_unstable_by_key(|&index| arr[index]); @@ -878,28 +888,32 @@ pub fn generate_all_keys( let mut all_keys = HashMap::::new(); let leaf_keys: HashSet = HashSet::from_iter(leaf_tree.iter().copied()); - let mut global_keys = HashSet::::new(); + // If size == 1 we simply create locally the keys, so don't need to treat the global keys. - // First deal with the parents of the coarse tree. These are different - // as they may exist on multiple nodes, so receive a different label. + if size > 1 { + let mut global_keys = HashSet::::new(); - for &key in coarse_tree { - let mut parent = key.parent(); - while parent.level() > 0 && !all_keys.contains_key(&parent) { - global_keys.insert(parent); - parent = parent.parent(); + // First deal with the parents of the coarse tree. These are different + // as they may exist on multiple nodes, so receive a different label. + + for &key in coarse_tree { + let mut parent = key.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + global_keys.insert(parent); + parent = parent.parent(); + } } - } - // We now send around the parents of the coarse tree to every node. These will - // be global keys. + // We now send around the parents of the coarse tree to every node. These will + // be global keys. - let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), comm); + let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), comm); - // We can now insert the global keys into `all_keys` with the `Global` label. + // We can now insert the global keys into `all_keys` with the `Global` label. - for &key in &global_keys { - all_keys.entry(key).or_insert(KeyType::Global); + for &key in &global_keys { + all_keys.entry(key).or_insert(KeyType::Global); + } } // We now deal with the fine leafs and their ancestors. 
@@ -917,58 +931,73 @@ pub fn generate_all_keys( } } - // This maps from rank to the keys that we want to send to the ranks - let mut rank_send_ghost = HashMap::>::new(); - for index in 0..size - 1 { - rank_send_ghost.insert(index, Vec::::new()); - } + // Need to explicitly add the root at the end. + all_keys.entry(MortonKey::root()).or_insert(KeyType::Global); - for (&key, &status) in all_keys.iter() { - // We need not send around global keys to neighbors. - if status == KeyType::Global { - continue; - } - for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { - // If the neighbour is a global key then continue. - if let Some(&value) = all_keys.get(&neighbor) { - if value == KeyType::Global { - continue; - } - } - // Get rank of the neighbour - let neighbor_rank = get_key_index(coarse_tree_bounds, neighbor); - rank_send_ghost - .entry(neighbor_rank) - .and_modify(|keys| keys.push(KeyWithRank { key, rank })); - } - } + // We only need to deal with ghosts if the size is larger than 1. - // We now know which key needs to be sent to which rank. - // Turn to array, get the counts and send around. + if size > 1 { + // This maps from rank to the keys that we want to send to the ranks - let (arr, counts) = { - let mut arr = Vec::::new(); - let mut counts = Vec::::new(); - for index in 0..size - 1 { - let keys = rank_send_ghost.get(&index).unwrap(); - arr.extend(keys.iter()); - counts.push(keys.len() as i32); + let mut rank_send_ghost = HashMap::>::new(); + for index in 0..size { + rank_send_ghost.insert(index, Vec::::new()); } - (arr, counts) - }; - // These are all the keys that are neighbors to our keys. We now go through - // and store those that do not live on our tree as into `all_keys` with a label - // of `Ghost`. - let ghost_keys = redistribute(&arr, &counts, comm); + let mut send_to_all = Vec::::new(); - for key in &ghost_keys { - if key.rank == rank { - // Don't need to add the keys that are already on the rank. - continue; + for (&key, &status) in all_keys.iter() { + // We need not send around global keys to neighbors. + if status == KeyType::Global { + continue; + } + for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { + // If the neighbour is a global key then continue. + if all_keys + .get(&neighbor) + .is_some_and(|&value| value == KeyType::Global) + { + // Global keys exist on all nodes, so need to send their neighbors to all nodes. + send_to_all.push(KeyWithRank { key, rank }); + } else { + // Get rank of the neighbour + let neighbor_rank = get_key_index(coarse_tree_bounds, neighbor); + rank_send_ghost + .entry(neighbor_rank) + .and_modify(|keys| keys.push(KeyWithRank { key, rank })); + } + } + } + + let send_ghost_to_all = gather_to_all(&send_to_all, comm); + // We now know which key needs to be sent to which rank. + // Turn to array, get the counts and send around. + + let (arr, counts) = { + let mut arr = Vec::::new(); + let mut counts = Vec::::new(); + for index in 0..size { + let keys = rank_send_ghost.get(&index).unwrap(); + arr.extend(keys.iter()); + counts.push(keys.len() as i32); + } + (arr, counts) + }; + + // These are all the keys that are neighbors to our keys. We now go through + // and store those that do not live on our tree as into `all_keys` with a label + // of `Ghost`. + let mut ghost_keys = redistribute(&arr, &counts, comm); + // Add the neighbors of any global key. 
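        // These were gathered to all ranks above because a global key has a copy on
        // every rank, so its neighbours may be needed anywhere.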
+ ghost_keys.extend(send_ghost_to_all.iter()); + + for key in &ghost_keys { + if key.rank == rank { + // Don't need to add the keys that are already on the rank. + continue; + } + all_keys.insert(key.key, KeyType::Ghost(key.rank)); } - debug_assert!(!all_keys.contains_key(&key.key)); - all_keys.insert(key.key, KeyType::Ghost(key.rank)); } all_keys diff --git a/src/tools.rs b/src/tools.rs index a404f88..ad4d8b5 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -168,6 +168,10 @@ pub fn communicate_back( let rank = comm.rank(); let size = comm.size(); + if size == 1 { + return None; + } + if rank == size - 1 { comm.process_at_rank(rank - 1).send(arr.first().unwrap()); None From 5edcfb0d80955b974bc4837b386525075767cb0a Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 11:26:59 +0100 Subject: [PATCH 37/42] Dealing with neighbours --- examples/mpi_complete_tree.rs | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index 78a0c23..3c727ab 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -36,19 +36,14 @@ pub fn main() { assert!(is_complete_linear_and_balanced(leaf_tree, &comm)); for &key in leaf_tree { - let mut parent = key; + // We only check interior keys. Leaf keys may not have a neighbor + // on the same level. + let mut parent = key.parent(); while parent.level() > 0 { // Check that the key itself is there. assert!(all_keys.contains_key(&key)); // Check that all its neighbours are there. for neighbor in parent.neighbours().iter().filter(|&key| key.is_valid()) { - if !all_keys.contains_key(neighbor) { - println!( - "Missing neighbor: {}. Key type {:#?}", - neighbor, - all_keys.get(&parent).unwrap() - ); - } assert!(all_keys.contains_key(neighbor)); } parent = parent.parent(); From de1b130a4d36c754c8e67b9e8ca7a3ad5fc34d55 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 12:33:19 +0100 Subject: [PATCH 38/42] Fixed doc bugs --- examples/mpi_complete_tree.rs | 6 ++++++ src/tools.rs | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index 3c727ab..bcff3b8 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -73,6 +73,12 @@ pub fn main() { }) .count(); + if comm.size() == 0 { + assert_eq!(nghosts, 0); + } else { + assert!(nghosts > 0); + } + let nglobal = all_keys .iter() .filter(|(_, &value)| matches!(value, KeyType::Global)) diff --git a/src/tools.rs b/src/tools.rs index ad4d8b5..dbe8edd 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -347,9 +347,9 @@ pub fn sort_to_bins(sorted_keys: &[T], bins: &[T]) -> Vec { /// /// - The array `sorted_keys` is assumed to be sorted within each process. It needs not be globally sorted. /// - If there are `r` ranks in the communicator, the size of `bins` must be `r`. -/// - The bins are defined through half-open intervals (bin[0], bin[1]), .... This defines r-1 bins. The -/// last bin is the half-open interval [bin[r-1], \infty). -/// - All array elements must be larger or equal bin[0]. This means that each element can be sorted into a bin. +/// - The bins are defined through half-open intervals `(bin[0], bin[1])`, .... This defines r-1 bins. The +/// last bin is the half-open interval `[bin[r-1], \infty)`. +/// - All array elements must be larger or equal `bin[0]`. This means that each element can be sorted into a bin. 
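/// - For example, with three ranks and `bins = [0, 10, 20]`, keys in `[0, 10)` go to
///   rank 0, keys in `[10, 20)` to rank 1, and keys of at least `20` to rank 2.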
pub fn redistribute_by_bins( sorted_keys: &[T], bins: &[T], From ff8ecfe9a6eebc4cd3bab63ca757f03606344a24 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 16:31:24 +0100 Subject: [PATCH 39/42] Removed unnecessary dependency --- Cargo.toml | 9 +++++---- examples/mpi_complete_tree.rs | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a75d666..ffd3a3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,9 @@ battleship = [] name = "bempp-octree" version = "0.0.1-dev" edition = "2021" -authors = ["Srinath Kailasa , Timo Betcke "] +authors = [ + "Srinath Kailasa , Timo Betcke ", +] description = "A library to create Octrees" license = "BSD-3-Clause" homepage = "https://github.com/bempp/octree" @@ -27,12 +29,11 @@ rand_chacha = "0.3.*" num = "0.4.*" bytemuck = "1.*" vtkio = "0.6.*" -mpi = {version = "0.8.*", features = ["derive", "user-operations"] } -once_cell = "*" +mpi = { version = "0.8.*", features = ["derive", "user-operations"] } [profile.release] debug = 1 - + [dev-dependencies] rand_distr = "0.4.3" #criterion = { version = "0.5.*", features = ["html_reports"]} diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index bcff3b8..a99a5ca 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -56,7 +56,6 @@ pub fn main() { assert!(all_keys.contains_key(&MortonKey::root())); // Count the number of ghosts on each rank - // Count the number of global keys on each rank. // Assert that all ghosts are from a different rank and count them. From 70eebe7bc3e1898fe502d62a040948060bb28ecf Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 16:38:20 +0100 Subject: [PATCH 40/42] Removed bytemuck --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ffd3a3e..63eb3ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,6 @@ itertools = "0.13.*" rand = { version = "0.8.5", features = ["alloc"] } rand_chacha = "0.3.*" num = "0.4.*" -bytemuck = "1.*" vtkio = "0.6.*" mpi = { version = "0.8.*", features = ["derive", "user-operations"] } From c8e5ac7b1ee1623ee794bd81782fe2def62197f0 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 16:41:37 +0100 Subject: [PATCH 41/42] Fixed error --- examples/mpi_complete_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index a99a5ca..68633e1 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -72,7 +72,7 @@ pub fn main() { }) .count(); - if comm.size() == 0 { + if comm.size() == 1 { assert_eq!(nghosts, 0); } else { assert!(nghosts > 0); From eacb84899ce74ccc711f64c27989e472a19a03b5 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 16:45:27 +0100 Subject: [PATCH 42/42] Deleted coarse tree example --- examples/mpi_coarse_tree.rs | 140 ------------------------------------ 1 file changed, 140 deletions(-) delete mode 100644 examples/mpi_coarse_tree.rs diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs deleted file mode 100644 index b8f1095..0000000 --- a/examples/mpi_coarse_tree.rs +++ /dev/null @@ -1,140 +0,0 @@ -//! Test the computation of a global bounding box across MPI ranks. 
- -use bempp_octree::{ - constants::DEEPEST_LEVEL, - morton::MortonKey, - octree::{ - balance, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, - is_complete_linear_tree, linearize, load_balance, points_to_morton, - redistribute_points_with_respect_to_coarse_tree, redistribute_with_respect_to_coarse_tree, - }, - tools::{communicate_back, generate_random_points, global_size, is_sorted_array}, -}; -use mpi::{ - collective::SystemOperation, - traits::{Communicator, CommunicatorCollectives}, -}; -use rand::prelude::*; -use rand_chacha::ChaCha8Rng; - -pub fn main() { - // Initialise MPI - let universe = mpi::initialize().unwrap(); - - // Get the world communicator - let comm = universe.world(); - - // Initialise a seeded Rng. - let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); - - // Create `npoints` per rank. - let npoints = 10000; - - // Generate random points. - - let points = generate_random_points(npoints, &mut rng, &comm); - - // Compute the Morton keys on the deepest level - let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); - - // linearize the keys - let linear_keys = linearize(&keys, &mut rng, &comm); - - // Generate the coarse tree - let coarse_tree = compute_coarse_tree(&linear_keys, &comm); - assert!(is_complete_linear_tree(&coarse_tree, &comm)); - - // We now compute the weights for the coarse tree. - - let weights = compute_coarse_tree_weights(&linear_keys, &coarse_tree, &comm); - - // Assert that the global sum of the weights is identical to the number of linearized keys. - - let mut global_weight: usize = 0; - - comm.all_reduce_into( - &(weights.iter().sum::()), - &mut global_weight, - SystemOperation::sum(), - ); - - assert_eq!(global_weight, global_size(&linear_keys, &comm)); - - // Now load balance the coarse tree - - let load_balanced_coarse_keys = load_balance(&coarse_tree, &weights, &comm); - - // Compute the weights of the balanced keys - - let load_balanced_weights = - compute_coarse_tree_weights(&linear_keys, &load_balanced_coarse_keys, &comm); - - let mut global_balanced_weight: usize = 0; - comm.all_reduce_into( - &(load_balanced_weights.iter().sum::()), - &mut global_balanced_weight, - SystemOperation::sum(), - ); - - // The global weight of the non-balanced keys should be identical - // to the global weigth of the balanced keys. - - assert_eq!(global_weight, global_balanced_weight); - - // Now compute the new fine keys. - - let load_balanced_fine_keys = - redistribute_with_respect_to_coarse_tree(&linear_keys, &load_balanced_coarse_keys, &comm); - - assert_eq!( - global_size(&load_balanced_fine_keys, &comm), - global_size(&linear_keys, &comm) - ); - - let refined_tree = - create_local_tree(&load_balanced_fine_keys, &load_balanced_coarse_keys, 6, 100); - - assert!(is_complete_linear_tree(&refined_tree, &comm)); - - // Now balance the tree. - - let balanced_tree = balance(&refined_tree, &mut rng, &comm); - - // redistribute the balanced tree according to coarse tree - - let balanced_tree = - redistribute_with_respect_to_coarse_tree(&balanced_tree, &load_balanced_coarse_keys, &comm); - - assert!(is_complete_linear_tree(&balanced_tree, &comm)); - - // Redistribute original keys and points with respect to balanced coarse tree. 
- - let (balanced_points, balanced_keys) = redistribute_points_with_respect_to_coarse_tree( - &points, - &keys, - &load_balanced_coarse_keys, - &comm, - ); - - let upper_bound; - - if let Some(next_key) = communicate_back(&load_balanced_coarse_keys, &comm) { - upper_bound = next_key; - } else { - upper_bound = MortonKey::upper_bound(); - } - - assert!(load_balanced_coarse_keys.first().unwrap() <= balanced_keys.first().unwrap()); - assert!(*balanced_keys.last().unwrap() < upper_bound); - assert!(is_sorted_array(&balanced_keys, &comm)); - - println!( - "Rank {} has {} balanced points.", - comm.rank(), - balanced_points.len(), - ); - - if comm.rank() == 0 { - println!("Coarse tree successfully created and weights computed."); - } -}
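Taken together, these patches leave the crate with a distributed `Octree` type whose constructor linearizes, load balances, 2:1 balances and completes the leaf tree, and which exposes the resulting key classification through `all_keys()`. The sketch below is a minimal usage example assembled from the APIs exercised in the examples above (`Octree::new`, `leaf_tree`, `all_keys`, `KeyType`, `generate_random_points`); the point count and the tree parameters are illustrative choices, not values prescribed by the library.

//! Minimal usage sketch; run under MPI, e.g. `mpirun -n 4 ...`.
use bempp_octree::{
    octree::{KeyType, Octree},
    tools::generate_random_points,
};
use mpi::traits::Communicator;
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;

pub fn main() {
    let universe = mpi::initialize().unwrap();
    let comm = universe.world();

    // Seed the rng differently on each rank so that the point clouds differ.
    let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64);

    // 1000 points per rank; refine to at most level 15 with at most 50 points
    // per leaf box (illustrative parameters).
    let points = generate_random_points(1000, &mut rng, &comm);
    let tree = Octree::new(&points, 15, 50, &comm);

    // Count the locally owned leaves and the ghost keys received from other ranks.
    let nleaves = tree.leaf_tree().len();
    let nghosts = tree
        .all_keys()
        .values()
        .filter(|&&status| matches!(status, KeyType::Ghost(_)))
        .count();

    println!(
        "Rank {}: {} leaf keys, {} ghost keys.",
        comm.rank(),
        nleaves,
        nghosts
    );
}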