From 8a13a8db309009dd9382c9c0017bcfa3a7c08729 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Wed, 11 Sep 2024 23:19:48 +0100 Subject: [PATCH 01/42] WIP: Partitioning --- examples/mpi.rs | 49 +++++ src/lib.rs | 1 + src/morton.rs | 16 +- src/parallel_octree.rs | 397 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 462 insertions(+), 1 deletion(-) create mode 100644 examples/mpi.rs create mode 100644 src/parallel_octree.rs diff --git a/examples/mpi.rs b/examples/mpi.rs new file mode 100644 index 0000000..05a88b9 --- /dev/null +++ b/examples/mpi.rs @@ -0,0 +1,49 @@ +//! Testing the hyksort component. +use bempp_octree::morton::MortonKey; +use bempp_octree::parallel_octree::partition; +use bempp_octree::parsort::{array_to_root, parsort}; +use itertools::Itertools; +use mpi::traits::Communicator; +use rand::prelude::*; + +pub fn main() { + let universe = mpi::initialize().unwrap(); + let world = universe.world(); + let rank = world.rank() as u64; + let n_per_rank = 10; + + let mut rng = rand::rngs::StdRng::seed_from_u64(0); + + let mut arr = Vec::::new(); + let mut weights = Vec::::new(); + + for index in 0..n_per_rank { + arr.push(MortonKey::from_index_and_level([0, 0, 0], 0)); + weights.push(1); + } + + // let t = n_per_rank * rank as usize; + // let mut index_sum = if rank == 0 { 0 } else { (t * (t - 1)) / 2 }; + // for index in n_per_rank * (rank as usize)..(n_per_rank * (1 + rank as usize)) { + // arr.push(MortonKey::from_index_and_level([0, 0, 0], 0)); + // weights.push(index_sum); + // index_sum += index; + // // weights.push(rng.gen_range(1..20)); + // } + + let partitioned = partition(&arr, &weights, &world); + + println!("Rank: {}, Len: {}", rank, partitioned.len()); + + let arr = array_to_root(&partitioned, &world); + + if rank == 0 { + let arr = arr.unwrap(); + + for (elem1, elem2) in arr.iter().tuple_windows() { + assert!(elem1 <= elem2); + } + println!("{} elements are sorted.", arr.len()); + println!("Finished."); + } +} diff --git a/src/lib.rs b/src/lib.rs index 3f5fbcc..abc07bd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,5 +6,6 @@ pub mod constants; pub mod geometry; pub mod morton; pub mod octree; +pub mod parallel_octree; pub mod parsort; pub mod types; diff --git a/src/morton.rs b/src/morton.rs index 6701ead..0f93c66 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -6,15 +6,17 @@ use crate::constants::{ Y_LOOKUP_ENCODE, Z_LOOKUP_DECODE, Z_LOOKUP_ENCODE, }; use crate::geometry::PhysicalBox; +use crate::parsort::{MaxValue, MinValue}; use itertools::izip; use itertools::Itertools; +use mpi::traits::Equivalence; use std::collections::HashSet; /// A morton key /// /// This is a distinct type to distinguish from u64 /// numbers. -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Equivalence)] pub struct MortonKey { value: u64, } @@ -25,6 +27,18 @@ impl Default for MortonKey { } } +impl MinValue for MortonKey { + fn min_value() -> Self { + MortonKey::root() + } +} + +impl MaxValue for MortonKey { + fn max_value() -> Self { + MortonKey::deepest_last() + } +} + impl MortonKey { /// Create a new Morton key. Users should use `[MortonKey::from_index_and_level].` fn new(value: u64) -> Self { diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs new file mode 100644 index 0000000..1db18e0 --- /dev/null +++ b/src/parallel_octree.rs @@ -0,0 +1,397 @@ +//! 
Parallel Octree structure + +use std::{borrow::BorrowMut, collections::HashMap, fmt::Display}; + +use crate::{ + constants::{DEEPEST_LEVEL, NLEVELS}, + geometry::PhysicalBox, + morton::MortonKey, + parsort::{parsort, MaxValue, MinValue}, +}; + +use mpi::{ + datatype::{Partition, PartitionMut}, + point_to_point as p2p, + traits::{Root, Source}, +}; + +use itertools::{izip, Itertools}; +use mpi::{ + collective::SystemOperation, + datatype::UncommittedUserDatatype, + topology::Process, + traits::{CommunicatorCollectives, Destination, Equivalence}, +}; +use rand::Rng; + +// /// A weighted Mortonkey contains weights to enable load balancing. +// #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Equivalence)] +// pub struct WeightedMortonKey { +// /// The actual MortonKey. +// pub key: MortonKey, +// /// The weight of the key, typically the number of points in the corresponding octant. +// pub weight: usize, +// } + +// impl WeightedMortonKey { +// /// Get a new weighted Morton key +// pub fn new(key: MortonKey, weight: usize) -> Self { +// Self { key, weight } +// } +// } + +// impl MinValue for WeightedMortonKey { +// fn min_value() -> Self { +// WeightedMortonKey { +// key: MortonKey::from_index_and_level([0, 0, 0], 0), +// weight: 0, +// } +// } +// } + +// impl MaxValue for WeightedMortonKey { +// fn max_value() -> Self { +// WeightedMortonKey { +// key: MortonKey::deepest_last(), +// weight: usize::MAX, +// } +// } +// } + +// impl Default for WeightedMortonKey { +// fn default() -> Self { +// WeightedMortonKey::new(Default::default(), 0) +// } +// } + +// impl Display for WeightedMortonKey { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// write!(f, "(Key: {}, Weight: {}", self.key, self.weight) +// } +// } + +/// Compute the global bounding box across all points on all processes. +pub fn compute_global_bounding_box( + points: &[f64], + comm: &C, +) -> PhysicalBox { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + + // Now compute the minimum and maximum across each dimension. 
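Before the reduction below, the flat coordinate buffer has already been reinterpreted as point triples via `bytemuck::cast_slice` (the `bytemuck` crate is the one the patch itself uses). A minimal stand-alone illustration of that zero-copy cast, with made-up values, as a sketch rather than part of the patch:

    fn main() {
        // Six coordinates become two [x, y, z] points; the length must be a multiple of 3.
        let coords = vec![0.0_f64, 1.0, 2.0, 3.0, 4.0, 5.0];
        let points: &[[f64; 3]] = bytemuck::cast_slice(&coords);
        assert_eq!(points, &[[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]]);
    }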
+ + let mut xmin = f64::MAX; + let mut xmax = f64::MIN; + + let mut ymin = f64::MAX; + let mut ymax = f64::MIN; + + let mut zmin = f64::MAX; + let mut zmax = f64::MIN; + + for point in points { + let x = point[0]; + let y = point[1]; + let z = point[2]; + + xmin = f64::min(xmin, x); + xmax = f64::max(xmax, x); + + ymin = f64::min(ymin, y); + ymax = f64::max(ymax, y); + + zmin = f64::min(zmin, z); + zmax = f64::max(zmax, z); + } + + let mut global_xmin = 0.0; + let mut global_xmax = 0.0; + + let mut global_ymin = 0.0; + let mut global_ymax = 0.0; + + let mut global_zmin = 0.0; + let mut global_zmax = 0.0; + + comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); + comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); + + comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); + comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); + + comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); + comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); + + let xdiam = global_xmax - global_xmin; + let ydiam = global_ymax - global_ymin; + let zdiam = global_zmax - global_zmin; + + let xmean = global_xmin + 0.5 * xdiam; + let ymean = global_ymin + 0.5 * ydiam; + let zmean = global_zmin + 0.5 * zdiam; + + // We increase diameters by box size on deepest level + // and use the maximum diameter to compute a + // cubic bounding box. + + let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; + + let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); + + let max_diam = max_diam * (1.0 + deepest_box_diam); + + PhysicalBox::new([ + xmean - 0.5 * max_diam, + ymean - 0.5 * max_diam, + zmean - 0.5 * max_diam, + xmean + 0.5 * max_diam, + ymean + 0.5 * max_diam, + zmean + 0.5 * max_diam, + ]) +} + +/// Convert points to Morton keys on specified level. +pub fn points_to_morton( + points: &[f64], + max_level: usize, + comm: &C, +) -> (Vec, PhysicalBox) { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + + // Make sure that max level never exceeds DEEPEST_LEVEL + let max_level = if max_level > DEEPEST_LEVEL as usize { + DEEPEST_LEVEL as usize + } else { + max_level + }; + + // Compute the physical bounding box. + + let bounding_box = compute_global_bounding_box(points, comm); + + // Bunch the points in arrays of 3. + + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + + let keys = points + .iter() + .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) + .collect_vec(); + + // Now want to get weighted Morton keys. We use a HashMap. + + let mut value_counts = HashMap::::new(); + + for key in &keys { + *value_counts.entry(*key).or_insert(0) += 1; + } + + // let weighted_keys = value_counts + // .iter() + // .map(|(&key, &weight)| WeightedMortonKey::new(key, weight)) + // .collect_vec(); + + (keys, bounding_box) +} + +pub fn block_partition( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) { + // First we sort the array of weighted keys. + + let sorted_keys = parsort(&keys, comm, rng); + + let mut completed_region = + MortonKey::complete_region(&[*sorted_keys.first().unwrap(), *sorted_keys.last().unwrap()]); + + // Get the smallest level members of the completed region. + + let min_level = completed_region + .iter() + .map(|elem| elem.level()) + .min() + .unwrap(); + + let largest_boxes = completed_region + .iter() + .filter(|elem| elem.level() == min_level); +} + +/// Linearize a set of weighted Morton keys. 
+pub fn linearize( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + // We are first sorting the keys. Then in a linear process across all processors we + // go through the arrays and delete ancestors of nodes. + + let sorted_keys = parsort(&keys, comm, rng); + + let size = comm.size(); + let rank = comm.rank(); + + // Each process needs to send its first element to the previous process. Each process + // then goes through its own list and retains elements that are not ancestors of the + // next element. + + let mut result = Vec::::new(); + + if rank == size - 1 { + comm.process_at_rank(rank - 1) + .send(sorted_keys.first().unwrap()); + + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); + } + } + + result.push(*sorted_keys.last().unwrap()); + } else { + let (other, _status) = if rank > 0 { + p2p::send_receive( + sorted_keys.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + comm.any_process().receive::() + }; + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); + } + } + + let last = *sorted_keys.last().unwrap(); + + if !last.is_ancestor(other) { + result.push(last) + } + } + + result +} + +/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. +pub fn partition( + sorted_keys: &[MortonKey], + weights: &[usize], + comm: &C, +) -> Vec { + assert_eq!(sorted_keys.len(), weights.len()); + + let size = comm.size(); + let rank = comm.rank(); + + // First scan the weight. + + let mut scan: Vec = vec![0; sorted_keys.len()]; + comm.scan_into(weights, scan.as_mut_slice(), SystemOperation::sum()); + + let mut total_weight = if rank == size - 1 { + *scan.last().unwrap() + } else { + 0 + }; + + // Scan the weight (form cumulative sums) and broadcast the total weight (last entry on last process) + // to all other processes. + + comm.process_at_rank(size - 1) + .broadcast_into(&mut total_weight); + + let w = total_weight / (size as usize); + let k = total_weight % (size as usize); + + let mut hash_map = HashMap::>::new(); + + // Sort the elements into bins according to which process they should be sent. + + for p in 1..=size as usize { + let q = if p <= k as usize { + izip!(sorted_keys, &scan) + .filter_map(|(&key, &s)| { + if (p - 1) * (1 + w) <= s && s < p * (w + 1) { + Some(key) + } else { + None + } + }) + .collect_vec() + } else { + izip!(sorted_keys, &scan) + .filter_map(|(&key, &s)| { + if (p - 1) * w + k <= s && s < p * w + k { + Some(key) + } else { + None + } + }) + .collect_vec() + }; + hash_map.insert(p - 1, q); + } + + // Now distribute the data with an all to all v. + // We create a vector of how many elements to send to each process and + // then send the actual data. + + let mut counts = vec![0 as i32; size as usize]; + let mut counts_from_processor = vec![0 as i32; size as usize]; + let mut all_elements = Vec::::new(); + for (index, c) in counts.iter_mut().enumerate() { + let elements = hash_map.get(&index).unwrap(); + *c = elements.len() as i32; + all_elements.extend(elements.iter()) + } + + // Send around the number of elements for each process + + comm.all_to_all_into(&counts, &mut counts_from_processor); + + // We have the number of elements for each process now. Now send around + // the actual elements. 
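Besides the per-destination counts, the variable-count exchange that follows needs the offset of each destination's slice in the flat send buffer, and that offset is just the exclusive prefix sum of the counts. A small plain-Rust sketch of this computation, independent of MPI and with made-up counts:

    fn main() {
        // Number of elements destined for ranks 0..4.
        let counts: Vec<i32> = vec![3, 0, 2, 4];
        // Exclusive prefix sum: each rank's slice starts where the previous slices end.
        let displs: Vec<i32> = counts
            .iter()
            .scan(0, |acc, &x| {
                let tmp = *acc;
                *acc += x;
                Some(tmp)
            })
            .collect();
        assert_eq!(displs, vec![0, 3, 3, 5]);
    }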
+ + // We can now send around the actual elements with an alltoallv. + let send_displs: Vec = counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp as i32) + }) + .collect(); + + let send_partition = Partition::new(&all_elements, &counts[..], &send_displs[..]); + + let mut recvbuffer = + vec![MortonKey::default(); counts_from_processor.iter().sum::() as usize]; + + let recv_displs: Vec = counts_from_processor + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = + PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); + comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + + recvbuffer.sort_unstable(); + recvbuffer +} From b39e69e127128b445e9c5b7b07b78e72db785268 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 10:02:57 +0100 Subject: [PATCH 02/42] Partitioning is working. --- examples/mpi.rs | 11 ++++++++--- src/parallel_octree.rs | 28 +++++++++++++++++++++++----- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index 05a88b9..469d8d1 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -17,9 +17,14 @@ pub fn main() { let mut arr = Vec::::new(); let mut weights = Vec::::new(); - for index in 0..n_per_rank { - arr.push(MortonKey::from_index_and_level([0, 0, 0], 0)); - weights.push(1); + for index in n_per_rank * rank..n_per_rank * (rank + 1) { + arr.push(MortonKey::from_index_and_level([index as usize, 0, 0], 10)); + } + + let arr = parsort(&arr, &world, &mut rng); + + for index in 0..arr.len() { + weights.push((rank * n_per_rank) as usize + index); } // let t = n_per_rank * rank as usize; diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 1db18e0..d686394 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -296,9 +296,23 @@ pub fn partition( let rank = comm.rank(); // First scan the weight. + // We scan the local arrays, then use a global scan operation on the last element + // of each array to get the global sums and then we update the array of each rank + // with the sum from the previous ranks. 
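Simulated serially with plain numbers, the update described in the comment above looks as follows; three ranks are faked as a vector of weight vectors, and `offset` plays the role of the value each rank obtains from the exclusive scan over the local totals (a sketch only, not part of the patch):

    fn main() {
        // Three "ranks", each holding its local weights.
        let per_rank: Vec<Vec<usize>> = vec![vec![1, 2, 3], vec![4, 5], vec![6]];
        let mut offset = 0; // on rank r: the sum of all weights held by ranks 0..r
        let mut global_scans = Vec::new();
        for weights in &per_rank {
            let mut local = Vec::new();
            let mut acc = 0;
            for &w in weights {
                acc += w;
                local.push(acc + offset);
            }
            offset += acc;
            global_scans.push(local);
        }
        // Every entry is now the global cumulative weight up to and including that key.
        assert_eq!(global_scans, vec![vec![1, 3, 6], vec![10, 15], vec![21]]);
    }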
- let mut scan: Vec = vec![0; sorted_keys.len()]; - comm.scan_into(weights, scan.as_mut_slice(), SystemOperation::sum()); + let mut scan: Vec = weights + .iter() + .scan(0, |state, x| { + *state += *x; + Some(*state) + }) + .collect_vec(); + let scan_last = *scan.last().unwrap(); + let mut scan_result: usize = 0; + comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); + for elem in &mut scan { + *elem += scan_result; + } let mut total_weight = if rank == size - 1 { *scan.last().unwrap() @@ -323,7 +337,9 @@ pub fn partition( let q = if p <= k as usize { izip!(sorted_keys, &scan) .filter_map(|(&key, &s)| { - if (p - 1) * (1 + w) <= s && s < p * (w + 1) { + if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) + || (p == size as usize && (p - 1) * (1 + w) <= s) + { Some(key) } else { None @@ -333,7 +349,9 @@ pub fn partition( } else { izip!(sorted_keys, &scan) .filter_map(|(&key, &s)| { - if (p - 1) * w + k <= s && s < p * w + k { + if ((p - 1) * w + k <= s && s < p * w + k) + || (p == size as usize && (p - 1) * w + k <= s) + { Some(key) } else { None @@ -350,6 +368,7 @@ pub fn partition( let mut counts = vec![0 as i32; size as usize]; let mut counts_from_processor = vec![0 as i32; size as usize]; + let mut all_elements = Vec::::new(); for (index, c) in counts.iter_mut().enumerate() { let elements = hash_map.get(&index).unwrap(); @@ -358,7 +377,6 @@ pub fn partition( } // Send around the number of elements for each process - comm.all_to_all_into(&counts, &mut counts_from_processor); // We have the number of elements for each process now. Now send around From f9904241f4f80fdfda9b68ebb01049d3244923fd Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 11:49:57 +0100 Subject: [PATCH 03/42] Better testing of parallel code. --- examples/mpi.rs | 102 +++++++++++++++++++++++++++++------------ src/parallel_octree.rs | 18 ++++++-- 2 files changed, 88 insertions(+), 32 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index 469d8d1..d427fd5 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -1,54 +1,98 @@ //! Testing the hyksort component. use bempp_octree::morton::MortonKey; -use bempp_octree::parallel_octree::partition; +use bempp_octree::parallel_octree::{linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; use itertools::Itertools; -use mpi::traits::Communicator; +use mpi::traits::*; use rand::prelude::*; +pub fn assert_linearized(arr: &Vec, comm: &C) { + // Check that the keys are still linearized. + let arr = array_to_root(&arr, comm); + + if comm.rank() == 0 { + let arr = arr.unwrap(); + for (&elem1, &elem2) in arr.iter().tuple_windows() { + assert!(!elem1.is_ancestor(elem2)); + } + println!("{} keys are linearized.", &arr.len()); + } +} + pub fn main() { let universe = mpi::initialize().unwrap(); let world = universe.world(); let rank = world.rank() as u64; - let n_per_rank = 10; + let max_level = 6; + + // Each process gets its own rng + let mut rng = rand::rngs::StdRng::seed_from_u64(rank as u64); + + // We first create a non-uniform tree on rank 0. + + let mut keys = Vec::::new(); + + pub fn add_level( + keys: &mut Vec, + current: MortonKey, + rng: &mut R, + max_level: usize, + ) { + keys.push(current); + + if current.level() >= max_level { + return; + } - let mut rng = rand::rngs::StdRng::seed_from_u64(0); + let mut children = current.children(); - let mut arr = Vec::::new(); - let mut weights = Vec::::new(); + // This makes sure that the tree is not sorted. 
+ children.shuffle(rng); - for index in n_per_rank * rank..n_per_rank * (rank + 1) { - arr.push(MortonKey::from_index_and_level([index as usize, 0, 0], 10)); + for child in children { + if rng.gen_bool(0.9) { + add_level(keys, child, rng, max_level); + } + } } - let arr = parsort(&arr, &world, &mut rng); + add_level(&mut keys, MortonKey::root(), &mut rng, max_level); + + println!("Number of keys on rank {}: {}", rank, keys.len()); + + // We now linearize the keys. - for index in 0..arr.len() { - weights.push((rank * n_per_rank) as usize + index); + if rank == 0 { + println!("Linearizing keys."); } + let sorted_keys = linearize(&keys, &mut rng, &world); - // let t = n_per_rank * rank as usize; - // let mut index_sum = if rank == 0 { 0 } else { (t * (t - 1)) / 2 }; - // for index in n_per_rank * (rank as usize)..(n_per_rank * (1 + rank as usize)) { - // arr.push(MortonKey::from_index_and_level([0, 0, 0], 0)); - // weights.push(index_sum); - // index_sum += index; - // // weights.push(rng.gen_range(1..20)); - // } + println!( + "Number of linearized keys on rank {}: {}", + rank, + sorted_keys.len() + ); - let partitioned = partition(&arr, &weights, &world); + // Now check that the tree is properly linearized. - println!("Rank: {}, Len: {}", rank, partitioned.len()); + assert_linearized(&sorted_keys, &world); - let arr = array_to_root(&partitioned, &world); + // We now partition the keys equally across the processes. We give + // each leaf equal weights here. - if rank == 0 { - let arr = arr.unwrap(); + let weights = vec![1 as usize; sorted_keys.len()]; - for (elem1, elem2) in arr.iter().tuple_windows() { - assert!(elem1 <= elem2); - } - println!("{} elements are sorted.", arr.len()); - println!("Finished."); + if rank == 0 { + println!("Partitioning keys."); } + + let sorted_keys = partition(&sorted_keys, &weights, &world); + + println!( + "After partitioning have {} keys on rank {}", + sorted_keys.len(), + rank + ); + + assert_linearized(&sorted_keys, &world); } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index d686394..f6ab194 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -227,14 +227,20 @@ pub fn linearize( rng: &mut R, comm: &C, ) -> Vec { + let size = comm.size(); + let rank = comm.rank(); + + // If we only have one process we use the standard serial linearization. + + if size == 1 { + return MortonKey::linearize(keys); + } + // We are first sorting the keys. Then in a linear process across all processors we // go through the arrays and delete ancestors of nodes. let sorted_keys = parsort(&keys, comm, rng); - let size = comm.size(); - let rank = comm.rank(); - // Each process needs to send its first element to the previous process. Each process // then goes through its own list and retains elements that are not ancestors of the // next element. @@ -295,6 +301,12 @@ pub fn partition( let size = comm.size(); let rank = comm.rank(); + // If we only have one process we simply return. + + if size == 1 { + return sorted_keys.to_vec(); + } + // First scan the weight. 
// We scan the local arrays, then use a global scan operation on the last element // of each array to get the global sums and then we update the array of each rank From 902f08aea10a54d0a44ae8d08333d0a6358ad5f0 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 15:08:18 +0100 Subject: [PATCH 04/42] WIP: Parallel tree generation --- src/morton.rs | 15 ++------ src/octree.rs | 4 +- src/parallel_octree.rs | 86 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 86 insertions(+), 19 deletions(-) diff --git a/src/morton.rs b/src/morton.rs index 0f93c66..7babc72 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -491,6 +491,8 @@ impl MortonKey { } /// Complete a region ensuring that the given keys are part of the leafs. + /// + /// The given keys must not overlap. pub fn complete_region(keys: &[MortonKey]) -> Vec { // First make sure that the input sequence is sorted. let mut keys = keys.to_vec(); @@ -509,15 +511,8 @@ impl MortonKey { return result; } - let deepest_first = MortonKey::from_index_and_level([0, 0, 0], DEEPEST_LEVEL as usize); - let deepest_last = MortonKey::from_index_and_level( - [ - LEVEL_SIZE as usize - 1, - LEVEL_SIZE as usize - 1, - LEVEL_SIZE as usize - 1, - ], - DEEPEST_LEVEL as usize, - ); + let deepest_first = MortonKey::deepest_first(); + let deepest_last = MortonKey::deepest_last(); // If the first key is not an ancestor of the deepest possible first element in the // tree get the finest ancestor between the two and use the first child of that. @@ -975,8 +970,6 @@ mod test { let keys = children[1].fill_between_keys(children[2]); assert!(keys.is_empty()); - - // Correct result for two keys at deepest level } #[test] diff --git a/src/octree.rs b/src/octree.rs index e68297d..64cdb08 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -291,7 +291,7 @@ mod test { fn test_octree() { use std::time::Instant; - let npoints = 1000000; + let npoints = 10000; let points = get_points_on_sphere(npoints); let max_level = 7; let max_points_per_box = 100; @@ -308,7 +308,7 @@ mod test { #[test] fn test_export() { let fname = "_test_sphere.vtk"; - let npoints = 1000000; + let npoints = 10000; let points = get_points_on_sphere(npoints); let max_level = 7; let max_points_per_box = 100; diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index f6ab194..f6c29f0 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -1,12 +1,12 @@ //! Parallel Octree structure -use std::{borrow::BorrowMut, collections::HashMap, fmt::Display}; +use std::collections::HashMap; use crate::{ - constants::{DEEPEST_LEVEL, NLEVELS}, + constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, - parsort::{parsort, MaxValue, MinValue}, + parsort::parsort, }; use mpi::{ @@ -18,9 +18,7 @@ use mpi::{ use itertools::{izip, Itertools}; use mpi::{ collective::SystemOperation, - datatype::UncommittedUserDatatype, - topology::Process, - traits::{CommunicatorCollectives, Destination, Equivalence}, + traits::{CommunicatorCollectives, Destination}, }; use rand::Rng; @@ -425,3 +423,79 @@ pub fn partition( recvbuffer.sort_unstable(); recvbuffer } + +/// Given a distributed set of keys, generate a complete linear Octree. 
+pub fn complete_tree( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let mut linearized_keys = linearize(keys, rng, comm); + + let size = comm.size(); + let rank = comm.rank(); + + if size == 1 { + return MortonKey::complete_region(linearized_keys.as_slice()); + } + + // Now insert on the first and last process the first and last child of the + // finest ancestor of first/last box on deepest level + + // Send first element to previous rank and insert into local keys. + // On the first process we also need to insert the first child of the finest + // ancestor of the deepest first key and first element. Correspondingly on the last process + // we need to insert the last child of the finest ancester of the deepest last key and last element. + + if rank == size - 1 { + // On last process send first element to previous processes and insert last + // possible box from region into list. + comm.process_at_rank(rank - 1) + .send(linearized_keys.first().unwrap()); + let last_key = *linearized_keys.last().unwrap(); + let deepest_last = MortonKey::deepest_last(); + if !last_key.is_ancestor(deepest_last) { + let ancestor = deepest_last.finest_common_ancestor(last_key); + linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); + } + } else { + let (other, _status) = if rank > 0 { + // On intermediate process receive from the next process + // and send first element to previous process. + p2p::send_receive( + linearized_keys.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + // On first process insert at the beginning the first possible + // box in the region and receive the key from next process. + let first_key = *linearized_keys.first().unwrap(); + let deepest_first = MortonKey::deepest_first(); + if !first_key.is_ancestor(deepest_first) { + let ancestor = deepest_first.finest_common_ancestor(first_key); + linearized_keys.push(ancestor.children()[0]); + } + + comm.process_at_rank(1).receive::() + }; + // If we are not at the last process we need to introduce the received key + // into our list. + linearized_keys.push(other); + }; + + // Now complete the regions defined by the keys on each process. + + let mut result = Vec::::new(); + + for (&key1, &key2) in linearized_keys.iter().tuple_windows() { + result.push(key1); + result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); + } + + if rank == size - 1 { + result.push(*linearized_keys.last().unwrap()); + } + + result +} From 59e3f9a30b976f18930bb681e2958bcc78d871bf Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 16:37:03 +0100 Subject: [PATCH 05/42] WIP: Testing block partitioning --- examples/mpi.rs | 86 +++++++++++++++++++++++++----------------- src/morton.rs | 16 ++++---- src/parallel_octree.rs | 25 +++++++----- 3 files changed, 75 insertions(+), 52 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index d427fd5..d1c3c8e 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -1,6 +1,6 @@ //! Testing the hyksort component. 
use bempp_octree::morton::MortonKey; -use bempp_octree::parallel_octree::{linearize, partition}; +use bempp_octree::parallel_octree::{block_partition, linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; use itertools::Itertools; use mpi::traits::*; @@ -19,19 +19,7 @@ pub fn assert_linearized(arr: &Vec, comm: } } -pub fn main() { - let universe = mpi::initialize().unwrap(); - let world = universe.world(); - let rank = world.rank() as u64; - let max_level = 6; - - // Each process gets its own rng - let mut rng = rand::rngs::StdRng::seed_from_u64(rank as u64); - - // We first create a non-uniform tree on rank 0. - - let mut keys = Vec::::new(); - +pub fn generate_random_tree(max_level: usize, rng: &mut R) -> Vec { pub fn add_level( keys: &mut Vec, current: MortonKey, @@ -56,43 +44,71 @@ pub fn main() { } } - add_level(&mut keys, MortonKey::root(), &mut rng, max_level); + let mut keys = Vec::::new(); + add_level(&mut keys, MortonKey::root(), rng, max_level); - println!("Number of keys on rank {}: {}", rank, keys.len()); + keys +} + +pub fn test_linearize(rng: &mut R, comm: &C) { + let max_level = 6; + let keys = generate_random_tree(max_level, rng); + let rank = comm.rank(); // We now linearize the keys. if rank == 0 { println!("Linearizing keys."); } - let sorted_keys = linearize(&keys, &mut rng, &world); - - println!( - "Number of linearized keys on rank {}: {}", - rank, - sorted_keys.len() - ); + let sorted_keys = linearize(&keys, rng, comm); // Now check that the tree is properly linearized. - assert_linearized(&sorted_keys, &world); + assert_linearized(&sorted_keys, comm); + if rank == 0 { + println!("Linearization successful."); + } - // We now partition the keys equally across the processes. We give - // each leaf equal weights here. + // Now form the coarse tree +} - let weights = vec![1 as usize; sorted_keys.len()]; +pub fn test_coarse_partition(rng: &mut R, comm: &C) { + let max_level = 6; + let keys = generate_random_tree(max_level, rng); + let rank = comm.rank(); + + let arr = array_to_root(&keys, comm); if rank == 0 { - println!("Partitioning keys."); + let arr = arr.unwrap(); + println!("Fine tree has {} elements", arr.len()); } - let sorted_keys = partition(&sorted_keys, &weights, &world); + // We now linearize the keys. - println!( - "After partitioning have {} keys on rank {}", - sorted_keys.len(), - rank - ); + let keys = parsort(&keys, comm, rng); - assert_linearized(&sorted_keys, &world); + let coarse_tree = block_partition(&keys, rng, comm); + if rank == 1 { + println!("Length of coarse tree {}", coarse_tree.len()); + } + + let arr = array_to_root(&coarse_tree, comm); + + if rank == 0 { + let arr = arr.unwrap(); + assert!(MortonKey::is_complete_linear_octree(&arr)); + + println!("Coarse tree has {} keys", arr.len()); + } +} + +pub fn main() { + let universe = mpi::initialize().unwrap(); + let comm = universe.world(); + let rank = comm.rank() as u64; + // Each process gets its own rng + let mut rng = rand::rngs::StdRng::seed_from_u64(rank as u64); + test_linearize(&mut rng, &comm); + test_coarse_partition(&mut rng, &comm); } diff --git a/src/morton.rs b/src/morton.rs index 7babc72..1e1e633 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -490,10 +490,10 @@ impl MortonKey { result } - /// Complete a region ensuring that the given keys are part of the leafs. + /// Complete a tree ensuring that the given keys are part of the leafs. /// /// The given keys must not overlap. 
- pub fn complete_region(keys: &[MortonKey]) -> Vec { + pub fn complete_tree(keys: &[MortonKey]) -> Vec { // First make sure that the input sequence is sorted. let mut keys = keys.to_vec(); keys.sort_unstable(); @@ -506,9 +506,9 @@ impl MortonKey { return result; } - // If a single element is given then just return the result if it is the root of the tree. - if keys.len() == 1 && result[0] == MortonKey::from_index_and_level([0, 0, 0], 0) { - return result; + // If just the root is given return that. + if keys.len() == 1 && *keys.first().unwrap() == MortonKey::root() { + return keys.to_vec(); } let deepest_first = MortonKey::deepest_first(); @@ -1018,13 +1018,13 @@ mod test { let keys = [key1, key2, key3]; - let complete_region = MortonKey::complete_region(keys.as_slice()); + let complete_region = MortonKey::complete_tree(keys.as_slice()); sanity_checks(keys.as_slice(), complete_region.as_slice()); // For an empty slice the complete region method should just add the root of the tree. let keys = Vec::::new(); - let complete_region = MortonKey::complete_region(keys.as_slice()); + let complete_region = MortonKey::complete_tree(keys.as_slice()); assert_eq!(complete_region.len(), 1); sanity_checks(keys.as_slice(), complete_region.as_slice()); @@ -1033,7 +1033,7 @@ mod test { let keys = [MortonKey::deepest_first(), MortonKey::deepest_last()]; - let complete_region = MortonKey::complete_region(keys.as_slice()); + let complete_region = MortonKey::complete_tree(keys.as_slice()); sanity_checks(keys.as_slice(), complete_region.as_slice()); } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index f6c29f0..5a33f8c 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -194,17 +194,19 @@ pub fn points_to_morton( (keys, bounding_box) } +/// Block partition of tree pub fn block_partition( - keys: &[MortonKey], + sorted_keys: &[MortonKey], rng: &mut R, comm: &C, -) { - // First we sort the array of weighted keys. - - let sorted_keys = parsort(&keys, comm, rng); +) -> Vec { + let mut completed_region = sorted_keys + .first() + .unwrap() + .fill_between_keys(*sorted_keys.last().unwrap()); - let mut completed_region = - MortonKey::complete_region(&[*sorted_keys.first().unwrap(), *sorted_keys.last().unwrap()]); + completed_region.insert(0, *sorted_keys.first().unwrap()); + completed_region.push(*sorted_keys.last().unwrap()); // Get the smallest level members of the completed region. @@ -216,7 +218,12 @@ pub fn block_partition( let largest_boxes = completed_region .iter() - .filter(|elem| elem.level() == min_level); + .filter(|elem| elem.level() == min_level) + .copied() + .collect_vec(); + + let coarse_tree = complete_tree(&largest_boxes, rng, comm); + coarse_tree } /// Linearize a set of weighted Morton keys. @@ -436,7 +443,7 @@ pub fn complete_tree( let rank = comm.rank(); if size == 1 { - return MortonKey::complete_region(linearized_keys.as_slice()); + return MortonKey::complete_tree(linearized_keys.as_slice()); } // Now insert on the first and last process the first and last child of the From d86abf4804c40da9da0107f22db61fbb401c627d Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 12 Sep 2024 23:39:37 +0100 Subject: [PATCH 06/42] WIP: Block coarsening --- examples/mpi.rs | 9 +++------ src/parallel_octree.rs | 20 ++++++++++++++++++-- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index d1c3c8e..e8e0490 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -1,6 +1,6 @@ //! Testing the hyksort component. 
use bempp_octree::morton::MortonKey; -use bempp_octree::parallel_octree::{block_partition, linearize, partition}; +use bempp_octree::parallel_octree::{block_partition, is_sorted_array, linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; use itertools::Itertools; use mpi::traits::*; @@ -89,17 +89,14 @@ pub fn test_coarse_partition(rng: &mut R, co let keys = parsort(&keys, comm, rng); let coarse_tree = block_partition(&keys, rng, comm); - if rank == 1 { - println!("Length of coarse tree {}", coarse_tree.len()); - } let arr = array_to_root(&coarse_tree, comm); if rank == 0 { let arr = arr.unwrap(); - assert!(MortonKey::is_complete_linear_octree(&arr)); - println!("Coarse tree has {} keys", arr.len()); + assert!(MortonKey::is_complete_linear_octree(&arr)); + println!("Coarse tree is sorted and complete."); } } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 5a33f8c..0fb778e 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -6,7 +6,7 @@ use crate::{ constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, - parsort::parsort, + parsort::{array_to_root, parsort}, }; use mpi::{ @@ -481,7 +481,7 @@ pub fn complete_tree( let deepest_first = MortonKey::deepest_first(); if !first_key.is_ancestor(deepest_first) { let ancestor = deepest_first.finest_common_ancestor(first_key); - linearized_keys.push(ancestor.children()[0]); + linearized_keys.insert(0, ancestor.children()[0]); } comm.process_at_rank(1).receive::() @@ -506,3 +506,19 @@ pub fn complete_tree( result } + +/// Check if an array is sorted. +pub fn is_sorted_array(arr: &[MortonKey], comm: &C) -> Option { + let arr = array_to_root(arr, comm); + if comm.rank() == 0 { + let arr = arr.unwrap(); + for (&elem1, &elem2) in arr.iter().tuple_windows() { + if elem1 > elem2 { + return Some(false); + } + } + Some(true) + } else { + None + } +} From 695239fbf8a27cb9b31bed877531bcaa63e3abd3 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 13 Sep 2024 18:35:52 +0100 Subject: [PATCH 07/42] Testing the coarse partitioning. --- src/parallel_octree.rs | 131 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 130 insertions(+), 1 deletion(-) diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 0fb778e..388abef 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -12,7 +12,7 @@ use crate::{ use mpi::{ datatype::{Partition, PartitionMut}, point_to_point as p2p, - traits::{Root, Source}, + traits::{Equivalence, Root, Source}, }; use itertools::{izip, Itertools}; @@ -200,6 +200,8 @@ pub fn block_partition( rng: &mut R, comm: &C, ) -> Vec { + let rank = comm.rank(); + let mut completed_region = sorted_keys .first() .unwrap() @@ -216,6 +218,9 @@ pub fn block_partition( .min() .unwrap(); + // Each process selects its largest boxes. These are used to create + // a coarse tree. + let largest_boxes = completed_region .iter() .filter(|elem| elem.level() == min_level) @@ -223,6 +228,82 @@ pub fn block_partition( .collect_vec(); let coarse_tree = complete_tree(&largest_boxes, rng, comm); + + // We want to partition the coarse tree. But we need the correct weights. The idea + // is that we use the number of original leafs that intersect with the coarse tree + // as leafs. In order to compute this we send the coarse tree around to all processes + // so that each process computes for each coarse tree element how many of its keys + // intersect with each node of the coarse tree. 
We then sum up the local weight for each + // coarse tree node across all nodes to get the weight. + + let global_coarse_tree = gather_to_all(&coarse_tree, comm); + + // We also want to send around a corresponding array of ranks so that for each global coarse tree key + // we have the rank of where it originates from. + + let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); + + // We now compute the local weights. + let mut local_weights = vec![0 as usize; global_coarse_tree.len()]; + + // In the following loop we want to be a bit smart. We do not iterate through all the local elements. + // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region + // of our sorted keys that overlaps with the coarse tree region. + + // Let's find the start of our region. + + let first_key = *sorted_keys.first().unwrap(); + + let first_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) + .count(); + + // Now we need to find the end index of our region. + let last_key = *sorted_keys.last().unwrap(); + + let last_coarse_index = first_coarse_index + + global_coarse_tree + .iter() + .skip(first_coarse_index) + .take_while(|coarse_key| coarse_key.is_ancestor(last_key)) + .count(); + + // We now only need to iterate through between the first and last coarse index in the coarse tree. + + for (w, &global_coarse_key) in izip!( + local_weights[first_coarse_index..last_coarse_index].iter_mut(), + global_coarse_tree[first_coarse_index..last_coarse_index].iter() + ) { + *w += sorted_keys + .iter() + .filter(|&&key| global_coarse_key.is_ancestor(key)) + .count(); + } + + // We now need to sum up the weights across all processes. + + let mut weights = vec![0 as usize; global_coarse_tree.len()]; + + comm.all_reduce_into(&local_weights, &mut weights, SystemOperation::sum()); + + // Each process now has all weights. However, we only need the ones for the current process. + // So we just filter the rest out. + + let weights = izip!(coarse_tree_ranks, weights) + .filter_map(|(r, weight)| { + if r == rank as usize { + Some(weight) + } else { + None + } + }) + .collect_vec(); + + // We have now all the information we need to repartition the coarse tree (finally...). Let's just do it. + + let coarse_tree = partition(&coarse_tree, &weights, comm); + coarse_tree } @@ -522,3 +603,51 @@ pub fn is_sorted_array(arr: &[MortonKey], comm: &C) None } } + +/// Get global size of a distributed array. +pub fn global_size(arr: &[T], comm: &C) -> usize { + let local_size = arr.len(); + let mut global_size = 0; + + comm.all_reduce_into(&local_size, &mut global_size, SystemOperation::sum()); + + global_size +} + +/// Gather array to all processes +pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { + // First we need to broadcast the individual sizes on each process. + + let size = comm.size(); + + let local_len = arr.len(); + + let mut sizes = vec![0 as i32; size as usize]; + + comm.all_to_all_into(&local_len, &mut sizes); + + let recv_len = sizes.iter().sum::() as usize; + + // Now we have the size of each local contribution. 
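The receive buffer a few lines below is allocated without zero-initialisation: the `Vec`'s spare capacity is handed to MPI as the receive area, and the length is only published once every slot has been written. The same pattern in isolation, with `MaybeUninit::write` in a plain loop standing in for the MPI receive (a sketch, not part of the patch):

    fn main() {
        let n = 4;
        let mut buf: Vec<u64> = Vec::with_capacity(n);
        // `spare_capacity_mut` exposes the uninitialised tail as `&mut [MaybeUninit<u64>]`.
        for (slot, value) in buf.spare_capacity_mut().iter_mut().zip(0..n as u64) {
            slot.write(value); // here the MPI call would fill the buffer instead
        }
        // Only after every element has been written is it sound to set the length.
        unsafe { buf.set_len(n) };
        assert_eq!(buf, vec![0, 1, 2, 3]);
    }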
+ // let mut recvbuffer = + // vec![T: Default; counts_from_processor.iter().sum::() as usize]; + let mut recvbuffer = Vec::::with_capacity(recv_len); + let buf: &mut [T] = unsafe { std::mem::transmute(recvbuffer.spare_capacity_mut()) }; + + let recv_displs: Vec = sizes + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = PartitionMut::new(buf, sizes, &recv_displs[..]); + + comm.all_gather_varcount_into(arr, &mut receiv_partition); + + unsafe { recvbuffer.set_len(recv_len) }; + + recvbuffer +} From 0cb3c776f59f85d0decb9f63bd4cf6057ecfa2bd Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 13 Sep 2024 18:46:41 +0100 Subject: [PATCH 08/42] Fix in coarse partitioning --- src/parallel_octree.rs | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 388abef..fd01bac 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -194,7 +194,10 @@ pub fn points_to_morton( (keys, bounding_box) } -/// Block partition of tree +/// Block partition of tree. +/// +/// A necessary condition for the block partitioning is that +// all sorted keys are on the same level. pub fn block_partition( sorted_keys: &[MortonKey], rng: &mut R, @@ -250,7 +253,8 @@ pub fn block_partition( // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region // of our sorted keys that overlaps with the coarse tree region. - // Let's find the start of our region. + // Let's find the start of our region. The start of our region is a coarse key that is an ancestor + // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. let first_key = *sorted_keys.first().unwrap(); @@ -259,21 +263,22 @@ pub fn block_partition( .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) .count(); - // Now we need to find the end index of our region. + // Now we need to find the end index of our region. For this again we find the index of our coarse tree that + // is an ancestor of our last key. let last_key = *sorted_keys.last().unwrap(); let last_coarse_index = first_coarse_index + global_coarse_tree .iter() - .skip(first_coarse_index) - .take_while(|coarse_key| coarse_key.is_ancestor(last_key)) + .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) .count(); // We now only need to iterate through between the first and last coarse index in the coarse tree. + // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). for (w, &global_coarse_key) in izip!( - local_weights[first_coarse_index..last_coarse_index].iter_mut(), - global_coarse_tree[first_coarse_index..last_coarse_index].iter() + local_weights[first_coarse_index..=last_coarse_index].iter_mut(), + global_coarse_tree[first_coarse_index..=last_coarse_index].iter() ) { *w += sorted_keys .iter() From 6532b499a75fd2744f026b6f139d9c2fc5f7961c Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 13 Sep 2024 19:12:35 +0100 Subject: [PATCH 09/42] Testing block partitioning --- examples/mpi.rs | 8 +++++++- src/parallel_octree.rs | 13 ++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index e8e0490..cdfc38b 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -86,10 +86,16 @@ pub fn test_coarse_partition(rng: &mut R, co // We now linearize the keys. 
- let keys = parsort(&keys, comm, rng); + let keys = linearize(&keys, rng, comm); let coarse_tree = block_partition(&keys, rng, comm); + println!( + "Coarse tree on rank {} has {} keys.", + rank, + coarse_tree.len() + ); + let arr = array_to_root(&coarse_tree, comm); if rank == 0 { diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index fd01bac..5aee2d9 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -267,11 +267,10 @@ pub fn block_partition( // is an ancestor of our last key. let last_key = *sorted_keys.last().unwrap(); - let last_coarse_index = first_coarse_index - + global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) - .count(); + let last_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) + .count(); // We now only need to iterate through between the first and last coarse index in the coarse tree. // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). @@ -625,11 +624,11 @@ pub fn gather_to_all(arr: &[T], comm let size = comm.size(); - let local_len = arr.len(); + let local_len = arr.len() as i32; let mut sizes = vec![0 as i32; size as usize]; - comm.all_to_all_into(&local_len, &mut sizes); + comm.all_gather_into(&local_len, &mut sizes); let recv_len = sizes.iter().sum::() as usize; From ce53f8c7d0e9b1304f6909f111b2b85b7e19f751 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 14 Sep 2024 17:24:24 +0100 Subject: [PATCH 10/42] WIP: Block partitioning --- Cargo.toml | 2 +- examples/mpi.rs | 33 +++++++++++------ src/morton.rs | 7 ++++ src/parallel_octree.rs | 82 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 110 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e089311..a44ac3c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ crate-type = ["cdylib", "lib"] [dependencies] itertools = "0.13.*" -rand = "0.8.5" +rand = { version = "0.8.5", features = ["alloc"] } bytemuck = "1.*" vtkio = "0.6.*" mpi = {version = "0.8.*", features = ["derive", "user-operations"] } diff --git a/examples/mpi.rs b/examples/mpi.rs index cdfc38b..5b46df2 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -1,8 +1,9 @@ //! Testing the hyksort component. 
+use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; use bempp_octree::morton::MortonKey; use bempp_octree::parallel_octree::{block_partition, is_sorted_array, linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; -use itertools::Itertools; +use itertools::{izip, Itertools}; use mpi::traits::*; use rand::prelude::*; @@ -19,6 +20,23 @@ pub fn assert_linearized(arr: &Vec, comm: } } +pub fn generate_random_keys(nkeys: usize, rng: &mut R) -> Vec { + let mut result = Vec::::with_capacity(nkeys); + + let xindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + let yindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + let zindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + + for (xval, yval, zval) in izip!(xindices.iter(), yindices.iter(), zindices.iter()) { + result.push(MortonKey::from_index_and_level( + [xval, yval, zval], + DEEPEST_LEVEL as usize, + )); + } + + result +} + pub fn generate_random_tree(max_level: usize, rng: &mut R) -> Vec { pub fn add_level( keys: &mut Vec, @@ -73,20 +91,13 @@ pub fn test_linearize(rng: &mut R, comm: &C) } pub fn test_coarse_partition(rng: &mut R, comm: &C) { - let max_level = 6; - let keys = generate_random_tree(max_level, rng); + let keys = generate_random_keys(10000, rng); let rank = comm.rank(); - let arr = array_to_root(&keys, comm); - - if rank == 0 { - let arr = arr.unwrap(); - println!("Fine tree has {} elements", arr.len()); - } - // We now linearize the keys. let keys = linearize(&keys, rng, comm); + println!("There are {} keys on rank {}", keys.len(), rank); let coarse_tree = block_partition(&keys, rng, comm); @@ -102,7 +113,7 @@ pub fn test_coarse_partition(rng: &mut R, co let arr = arr.unwrap(); println!("Coarse tree has {} keys", arr.len()); assert!(MortonKey::is_complete_linear_octree(&arr)); - println!("Coarse tree is sorted and complete."); + println!("Coarse tree is sorted, linear and complete."); } } diff --git a/src/morton.rs b/src/morton.rs index 1e1e633..9601e4d 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -48,6 +48,13 @@ impl MortonKey { key } + /// A key that is not valid or well formed but guaranteed to be larger than any valid key. + /// + /// This is useful when a guaranteed upper bound is needed. + pub fn upper_bound() -> Self { + Self { value: u64::MAX } + } + /// Check if a key is invalid. pub fn invalid_key() -> Self { Self { value: 1 << 63 } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 5aee2d9..6095cd1 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -304,11 +304,89 @@ pub fn block_partition( }) .collect_vec(); - // We have now all the information we need to repartition the coarse tree (finally...). Let's just do it. - let coarse_tree = partition(&coarse_tree, &weights, comm); coarse_tree + + // We now need to redistribute the global tree according to the coarse tree. +} + +pub fn redistribute_with_respect_to_coarse_tree( + sorted_keys: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> Vec { + let rank = comm.rank(); + let size = comm.size(); + + if size == 1 { + return sorted_keys.to_vec(); + } + + // We want to globally redistribute keys so that the keys on each process are descendents + // of the local coarse tree keys. + + // We are using here the fact that the coarse tree is complete and sorted. + // We are sending around to each process the first local index. This + // defines bins in which we sort our keys. 
The keys are then sent around to the correct + // processes via an alltoallv operation. + + let my_first = *coarse_tree.first().unwrap(); + + let mut global_bins = Vec::::with_capacity(size as usize); + let global_bins_buff: &mut [MortonKey] = + unsafe { std::mem::transmute(global_bins.spare_capacity_mut()) }; + + comm.all_gather_into(&my_first, global_bins_buff); + + unsafe { global_bins.set_len(size as usize) }; + + // // We now have the first index from each process. We also want the last index from the last + // // process everywhere to make sorting into bins easier. + + // let mut last_coarse_key = MortonKey::default(); + + // if rank == size - 1 { + // last_coarse_key = *coarse_tree.last().unwrap(); + // } + + // comm.process_at_rank(size - 1) + // .broadcast_into(&mut last_coarse_key); + + global_bins.push(MortonKey::upper_bound()); + let mut ranks = vec![0 as usize; size as usize]; + + // We now have our bins. We go through our keys and assign to each key the + // rank it should be sent to. For this we are using the fact that both our + // keys and the coarse tree are sorted. + + // This iterates over each possible bin and returns also the associated rank. + let mut bin_iter = global_bins + .iter() + .tuple_windows::<(&MortonKey, &MortonKey)>() + .enumerate(); + + // We take the first element of the bin iterator. There will always be at least one. + let (mut rank, (mut bin_start, mut bin_end)) = bin_iter.next().unwrap(); + + for (&key, r) in izip!(sorted_keys.iter(), ranks.iter_mut()) { + if *bin_start <= key && key < *bin_end { + *r = rank + } else { + // Move the bin forward until it fits. There will always be a fitting bin. + while let Some((rn, (bsn, ben))) = bin_iter.next() { + if *bsn <= key && key < *ben { + *r = rn; + rank = rn; + bin_start = bsn; + bin_end = ben; + } else { + continue; + } + } + } + } + sorted_keys.to_vec() } /// Linearize a set of weighted Morton keys. From 6d4c14e975783f14711ba44e3dab12f88c391e79 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 14 Sep 2024 17:24:55 +0100 Subject: [PATCH 11/42] WIP: Block partitioning --- src/parallel_octree.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 6095cd1..8824489 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -386,6 +386,10 @@ pub fn redistribute_with_respect_to_coarse_tree( } } } + + // We have now the necessary rank for each key element. + // We do a stable sort of the sorted_keys to sort them by rank + sorted_keys.to_vec() } From 7b684ff71fe36ef2fc1192f5d85b9223e700bd6d Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 15 Sep 2024 18:53:35 +0100 Subject: [PATCH 12/42] Block partitioning works. --- examples/mpi.rs | 44 +++++++++++++++++------ src/parallel_octree.rs | 82 +++++++++++++++++++++++++++++++----------- 2 files changed, 96 insertions(+), 30 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index 5b46df2..5c2f779 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -91,29 +91,53 @@ pub fn test_linearize(rng: &mut R, comm: &C) } pub fn test_coarse_partition(rng: &mut R, comm: &C) { - let keys = generate_random_keys(10000, rng); let rank = comm.rank(); + let keys = if rank == 0 { + generate_random_keys(50, rng) + } else { + generate_random_keys(1000, rng) + }; // We now linearize the keys. 
- let keys = linearize(&keys, rng, comm); - println!("There are {} keys on rank {}", keys.len(), rank); + let mut keys = linearize(&keys, rng, comm); - let coarse_tree = block_partition(&keys, rng, comm); + // We move most keys over from rank 0 to rank 2 to check how the partitioning works. + + let nsend = 400; + // Send the last 200 keys from rank 0 to rank 1. + + if rank == 0 { + let send_keys = &keys[keys.len() - nsend..keys.len()]; + comm.process_at_rank(1).send(send_keys); + keys = keys[0..keys.len() - nsend].to_vec(); + } + + if rank == 1 { + let mut recv_keys = vec![MortonKey::default(); nsend]; + comm.process_at_rank(0).receive_into(&mut recv_keys); + recv_keys.extend(keys.iter()); + keys = recv_keys; + } + + println!("Rank {} has {} keys. ", rank, keys.len()); + + let partitioned_tree = block_partition(&keys, rng, comm); println!( - "Coarse tree on rank {} has {} keys.", + "Partitioned tree on rank {} has {} keys.", rank, - coarse_tree.len() + partitioned_tree.len() ); - let arr = array_to_root(&coarse_tree, comm); + let arr = array_to_root(&partitioned_tree, comm); if rank == 0 { let arr = arr.unwrap(); - println!("Coarse tree has {} keys", arr.len()); - assert!(MortonKey::is_complete_linear_octree(&arr)); - println!("Coarse tree is sorted, linear and complete."); + for (elem1, elem2) in arr.iter().tuple_windows() { + assert!(*elem1 <= *elem2); + } + println!("Keys are sorted."); } } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index 8824489..cb72ea4 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -204,6 +204,10 @@ pub fn block_partition( comm: &C, ) -> Vec { let rank = comm.rank(); + if comm.size() == 1 { + // On a single node block partitioning should not do anything. + return sorted_keys.to_vec(); + } let mut completed_region = sorted_keys .first() @@ -306,18 +310,19 @@ pub fn block_partition( let coarse_tree = partition(&coarse_tree, &weights, comm); - coarse_tree + redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm) // We now need to redistribute the global tree according to the coarse tree. } +/// Redistribute sorted keys with respect to a linear coarse tree. pub fn redistribute_with_respect_to_coarse_tree( sorted_keys: &[MortonKey], coarse_tree: &[MortonKey], comm: &C, ) -> Vec { - let rank = comm.rank(); let size = comm.size(); + let rank = comm.rank(); if size == 1 { return sorted_keys.to_vec(); @@ -354,43 +359,80 @@ pub fn redistribute_with_respect_to_coarse_tree( // .broadcast_into(&mut last_coarse_key); global_bins.push(MortonKey::upper_bound()); - let mut ranks = vec![0 as usize; size as usize]; - // We now have our bins. We go through our keys and assign to each key the - // rank it should be sent to. For this we are using the fact that both our - // keys and the coarse tree are sorted. + // We now have our bins. We go through our keys and store how + // many keys are assigned to each rank. We are using here that + // our keys and the coarse tree are both sorted. + + // This will store for each rank how many keys will be assigned to it. + let mut rank_counts = vec![0 as i32; size as usize]; // This iterates over each possible bin and returns also the associated rank. - let mut bin_iter = global_bins - .iter() - .tuple_windows::<(&MortonKey, &MortonKey)>() - .enumerate(); + let mut bin_iter = izip!( + rank_counts.iter_mut(), + global_bins + .iter() + .tuple_windows::<(&MortonKey, &MortonKey)>(), + ); // We take the first element of the bin iterator. There will always be at least one. 
- let (mut rank, (mut bin_start, mut bin_end)) = bin_iter.next().unwrap(); + let mut r: &mut i32; + let mut bin_start: &MortonKey; + let mut bin_end: &MortonKey; + (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - for (&key, r) in izip!(sorted_keys.iter(), ranks.iter_mut()) { + for &key in sorted_keys.iter() { if *bin_start <= key && key < *bin_end { - *r = rank + *r += 1; } else { // Move the bin forward until it fits. There will always be a fitting bin. while let Some((rn, (bsn, ben))) = bin_iter.next() { if *bsn <= key && key < *ben { - *r = rn; - rank = rn; + *rn += 1; + r = rn; bin_start = bsn; bin_end = ben; - } else { - continue; + break; } } } } - // We have now the necessary rank for each key element. - // We do a stable sort of the sorted_keys to sort them by rank + // We now have the counts for each rank. Let's send it around via alltoallv. + + let mut counts_from_proc = vec![0 as i32; size as usize]; + + comm.all_to_all_into(&rank_counts, &mut counts_from_proc); + // Now compute the send and receive displacements. + + // We can now send around the actual elements with an alltoallv. + let send_displs: Vec = rank_counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp as i32) + }) + .collect(); + + let send_partition = Partition::new(&sorted_keys[..], &rank_counts[..], &send_displs[..]); + + let mut recvbuffer = vec![MortonKey::default(); counts_from_proc.iter().sum::() as usize]; + + let recv_displs: Vec = counts_from_proc + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = + PartitionMut::new(&mut recvbuffer[..], counts_from_proc, &recv_displs[..]); + comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); - sorted_keys.to_vec() + recvbuffer } /// Linearize a set of weighted Morton keys. From 2946e6b18b31119304209caeef730ccf1a56baa4 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Mon, 16 Sep 2024 15:29:13 +0100 Subject: [PATCH 13/42] Implementing finest out subblocks --- examples/mpi.rs | 4 +- src/morton.rs | 8 +-- src/parallel_octree.rs | 149 ++++++++++++++++++++++++++++------------- 3 files changed, 105 insertions(+), 56 deletions(-) diff --git a/examples/mpi.rs b/examples/mpi.rs index 5c2f779..1837b8c 100644 --- a/examples/mpi.rs +++ b/examples/mpi.rs @@ -127,10 +127,10 @@ pub fn test_coarse_partition(rng: &mut R, co println!( "Partitioned tree on rank {} has {} keys.", rank, - partitioned_tree.len() + partitioned_tree.0.len() ); - let arr = array_to_root(&partitioned_tree, comm); + let arr = array_to_root(&partitioned_tree.0, comm); if rank == 0 { let arr = arr.unwrap(); diff --git a/src/morton.rs b/src/morton.rs index 9601e4d..fccf871 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -662,13 +662,7 @@ impl MortonKey { ); } } - new_work_list.extend_from_slice( - keys.iter() - .copied() - .filter(|&key| key.level() == level - 1) - .collect_vec() - .as_slice(), - ); + new_work_list.extend(keys.iter().copied().filter(|&key| key.level() == level - 1)); work_list = new_work_list; // Now extend the work list with the diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs index cb72ea4..e77cdf1 100644 --- a/src/parallel_octree.rs +++ b/src/parallel_octree.rs @@ -196,17 +196,19 @@ pub fn points_to_morton( /// Block partition of tree. /// +/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned +/// keys and the associated coarse keys. /// A necessary condition for the block partitioning is that // all sorted keys are on the same level. 
pub fn block_partition( sorted_keys: &[MortonKey], rng: &mut R, comm: &C, -) -> Vec { +) -> (Vec, Vec) { let rank = comm.rank(); if comm.size() == 1 { // On a single node block partitioning should not do anything. - return sorted_keys.to_vec(); + return (sorted_keys.to_vec(), vec![MortonKey::root()]); } let mut completed_region = sorted_keys @@ -310,7 +312,10 @@ pub fn block_partition( let coarse_tree = partition(&coarse_tree, &weights, comm); - redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm) + ( + redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm), + coarse_tree, + ) // We now need to redistribute the global tree according to the coarse tree. } @@ -322,7 +327,6 @@ pub fn redistribute_with_respect_to_coarse_tree( comm: &C, ) -> Vec { let size = comm.size(); - let rank = comm.rank(); if size == 1 { return sorted_keys.to_vec(); @@ -346,18 +350,9 @@ pub fn redistribute_with_respect_to_coarse_tree( unsafe { global_bins.set_len(size as usize) }; - // // We now have the first index from each process. We also want the last index from the last - // // process everywhere to make sorting into bins easier. - - // let mut last_coarse_key = MortonKey::default(); - - // if rank == size - 1 { - // last_coarse_key = *coarse_tree.last().unwrap(); - // } - - // comm.process_at_rank(size - 1) - // .broadcast_into(&mut last_coarse_key); - + // We now have the first index from each process. We also want + // an upper bound for the last index of the tree to make the sorting into + // bins easier. global_bins.push(MortonKey::upper_bound()); // We now have our bins. We go through our keys and store how @@ -365,38 +360,11 @@ pub fn redistribute_with_respect_to_coarse_tree( // our keys and the coarse tree are both sorted. // This will store for each rank how many keys will be assigned to it. - let mut rank_counts = vec![0 as i32; size as usize]; - // This iterates over each possible bin and returns also the associated rank. - let mut bin_iter = izip!( - rank_counts.iter_mut(), - global_bins - .iter() - .tuple_windows::<(&MortonKey, &MortonKey)>(), - ); - - // We take the first element of the bin iterator. There will always be at least one. - let mut r: &mut i32; - let mut bin_start: &MortonKey; - let mut bin_end: &MortonKey; - (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - - for &key in sorted_keys.iter() { - if *bin_start <= key && key < *bin_end { - *r += 1; - } else { - // Move the bin forward until it fits. There will always be a fitting bin. - while let Some((rn, (bsn, ben))) = bin_iter.next() { - if *bsn <= key && key < *ben { - *rn += 1; - r = rn; - bin_start = bsn; - bin_end = ben; - break; - } - } - } - } + let rank_counts = sort_to_bins(sorted_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); // We now have the counts for each rank. Let's send it around via alltoallv. @@ -435,6 +403,93 @@ pub fn redistribute_with_respect_to_coarse_tree( recvbuffer } +/// Create bins from sorted keys. +pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { + let mut bin_counts = vec![0 as usize; bins.len() - 1]; + + // This iterates over each possible bin and returns also the associated rank. + let mut bin_iter = izip!( + bin_counts.iter_mut(), + bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), + ); + + // We take the first element of the bin iterator. There will always be at least one. 
+ let mut r: &mut usize; + let mut bin_start: &MortonKey; + let mut bin_end: &MortonKey; + (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); + + for &key in sorted_keys.iter() { + if *bin_start <= key && key < *bin_end { + *r += 1; + } else { + // Move the bin forward until it fits. There will always be a fitting bin. + while let Some((rn, (bsn, ben))) = bin_iter.next() { + if *bsn <= key && key < *ben { + *rn += 1; + r = rn; + bin_start = bsn; + bin_end = ben; + break; + } + } + } + } + + bin_counts +} + +/// Return a complete tree generated from local keys and associated coarse keys. +/// +/// The coarse keys are refined until the maximum level is reached or until each coarse key +/// is the ancestor of at most `max_keys` fine keys. +pub fn create_local_tree( + sorted_fine_keys: &[MortonKey], + coarse_keys: &[MortonKey], + mut max_level: usize, + max_keys: usize, +) -> Vec { + if max_level > DEEPEST_LEVEL as usize { + max_level = DEEPEST_LEVEL as usize; + } + + // We split the sorted fine keys into subslices so that each subslice + // is associated with a coarse slice. For this we need to add an upper bound + // coarse keys to ensure that we have suitable bins. + + let mut bins = coarse_keys.to_vec(); + bins.push(MortonKey::upper_bound()); + + let counts = sort_to_bins(&sorted_fine_keys, &bins); + + // We now know how many fine keys are associated with each coarse block. We iterate + // through and locally refine for each block that requires it. + + let mut remainder = sorted_fine_keys; + let mut new_coarse_keys = Vec::::new(); + + for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { + let current; + (current, remainder) = remainder.split_at(count); + if coarse_key.level() < max_level && current.len() > max_keys { + // We need to refine the current split. + new_coarse_keys.extend_from_slice( + create_local_tree( + current, + coarse_key.children().as_slice(), + max_level, + max_keys, + ) + .as_slice(), + ); + } else { + new_coarse_keys.push(coarse_key) + } + } + + coarse_keys.to_vec() +} + /// Linearize a set of weighted Morton keys. pub fn linearize( keys: &[MortonKey], From 805666be26b70f0c3d2e5d20a35bc12230c5765c Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Mon, 16 Sep 2024 22:50:07 +0100 Subject: [PATCH 14/42] Added outer key generation --- src/morton.rs | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/src/morton.rs b/src/morton.rs index fccf871..8e66bfb 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -430,6 +430,42 @@ impl MortonKey { result } + /// Return the index of the key as a child of the parent, i.e. 0, 1, ..., 7. + #[inline(always)] + pub fn child_index(&self) -> usize { + if *self == MortonKey::root() { + return 0; + } + let level = self.level() as u64; + + let shift = LEVEL_DISPLACEMENT + 3 * (DEEPEST_LEVEL - level); + + ((self.value >> shift) % 8) as usize + } + + /// Return the finest descendent that is opposite to the joint corner with the siblings. + pub fn finest_outer_descendent(&self) -> MortonKey { + // First find out which child the current key is. 
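As a quick usage sketch of the `child_index` method added above (the concrete key is arbitrary; this mirrors the unit test added further down):

    use bempp_octree::morton::MortonKey;

    fn main() {
        // The i-th child of any key reports i as its child index.
        let parent = MortonKey::from_index_and_level([2, 3, 5], 7);
        for (i, child) in parent.children().iter().enumerate() {
            assert_eq!(child.child_index(), i);
        }
    }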
+ + let level = self.level() as u64; + + if level == DEEPEST_LEVEL { + return *self; + } + + let mut child_level = 1 + level; + let mut key = *self; + let outer_index = self.child_index() as u64; + + while child_level <= DEEPEST_LEVEL { + let shift = LEVEL_DISPLACEMENT + 3 * (DEEPEST_LEVEL - child_level); + key = MortonKey::new(1 + (key.value | outer_index << shift)); + child_level += 1; + } + + key + } + /// Linearize by sorting and removing overlaps. pub fn linearize(keys: &[MortonKey]) -> Vec { let mut new_keys = Vec::::new(); @@ -1245,4 +1281,38 @@ mod test { // Now compute the box. } + + #[test] + pub fn test_child_index() { + let key = MortonKey::from_index_and_level([1, 501, 718], 10); + + let children = key.children(); + + for (index, child) in children.iter().enumerate() { + assert_eq!(index, child.child_index()); + } + } + + #[test] + pub fn test_finest_outer_descendent() { + let key = MortonKey::from_index_and_level([0, 0, 0], 1); + + let finest_outer_descendent = key.finest_outer_descendent(); + + assert_eq!( + finest_outer_descendent, + MortonKey::from_index_and_level([0, 0, 0], DEEPEST_LEVEL as usize) + ); + + let key = MortonKey::from_index_and_level([1, 1, 0], 1); + let finest_outer_descendent = key.finest_outer_descendent(); + + assert_eq!( + finest_outer_descendent, + MortonKey::from_index_and_level( + [LEVEL_SIZE as usize - 1, LEVEL_SIZE as usize - 1, 0], + DEEPEST_LEVEL as usize + ) + ); + } } From be5d08329772b2a8cefce5a543f9d0535cb4ff5a Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Tue, 17 Sep 2024 09:41:24 +0100 Subject: [PATCH 15/42] WIP: Refactor and cleanup --- Cargo.toml | 1 + examples/battleship.rs | 2 +- examples/mpi_global_bounding_box.rs | 36 + examples/{mpi.rs => parallel_tests.rs} | 2 +- src/lib.rs | 3 +- src/octree.rs | 904 ++++++++++++++++++------- src/parallel_octree.rs | 836 ----------------------- src/parsort.rs | 45 -- src/serial.rs | 325 +++++++++ src/tools.rs | 135 ++++ 10 files changed, 1154 insertions(+), 1135 deletions(-) create mode 100644 examples/mpi_global_bounding_box.rs rename examples/{mpi.rs => parallel_tests.rs} (97%) delete mode 100644 src/parallel_octree.rs create mode 100644 src/serial.rs create mode 100644 src/tools.rs diff --git a/Cargo.toml b/Cargo.toml index a44ac3c..3c3fc1a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ crate-type = ["cdylib", "lib"] [dependencies] itertools = "0.13.*" rand = { version = "0.8.5", features = ["alloc"] } +rand_chacha = "0.3.*" bytemuck = "1.*" vtkio = "0.6.*" mpi = {version = "0.8.*", features = ["derive", "user-operations"] } diff --git a/examples/battleship.rs b/examples/battleship.rs index 97f7c81..50587d9 100644 --- a/examples/battleship.rs +++ b/examples/battleship.rs @@ -4,7 +4,7 @@ use std::time::Instant; #[cfg(feature = "battleship")] -use bempp_octree::octree::Octree; +use bempp_octree::serial::Octree; #[cfg(feature = "battleship")] use vtkio::model::*; diff --git a/examples/mpi_global_bounding_box.rs b/examples/mpi_global_bounding_box.rs new file mode 100644 index 0000000..28748b2 --- /dev/null +++ b/examples/mpi_global_bounding_box.rs @@ -0,0 +1,36 @@ +//! Test the computation of a global bounding box across MPI ranks. + +use bempp_octree::octree::compute_global_bounding_box; +use mpi::traits::*; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. 
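Seeding the generator keeps runs reproducible; ranks that use the same seed draw the same sequence. A minimal sketch of that property (assuming only the `rand` and `rand_chacha` dependencies added above):

    use rand::prelude::*;
    use rand_chacha::ChaCha8Rng;

    fn main() {
        // Two generators with the same seed produce identical sequences.
        let mut a = ChaCha8Rng::seed_from_u64(2);
        let mut b = ChaCha8Rng::seed_from_u64(2);
        let xs: Vec<f64> = (0..5).map(|_| a.gen()).collect();
        let ys: Vec<f64> = (0..5).map(|_| b.gen()).collect();
        assert_eq!(xs, ys);
    }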
+ let mut rng = ChaCha8Rng::seed_from_u64(2); + + // Get the rank and size + let rank = comm.rank(); + let size = comm.size(); + + // Create `npoints` per rank. + let npoints = 10; + + // Generate random points. + + let mut points = Vec::::with_capacity(3 * npoints); + + for _ in 0..3 * npoints { + points.push(rng.gen()); + } + + // Compute the distributed bounding box. + + let bounding_box = compute_global_bounding_box(&points, &comm); +} diff --git a/examples/mpi.rs b/examples/parallel_tests.rs similarity index 97% rename from examples/mpi.rs rename to examples/parallel_tests.rs index 1837b8c..e3555e9 100644 --- a/examples/mpi.rs +++ b/examples/parallel_tests.rs @@ -1,7 +1,7 @@ //! Testing the hyksort component. use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; use bempp_octree::morton::MortonKey; -use bempp_octree::parallel_octree::{block_partition, is_sorted_array, linearize, partition}; +use bempp_octree::octree::{block_partition, is_sorted_array, linearize, partition}; use bempp_octree::parsort::{array_to_root, parsort}; use itertools::{izip, Itertools}; use mpi::traits::*; diff --git a/src/lib.rs b/src/lib.rs index abc07bd..c94ad07 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,7 @@ pub mod constants; pub mod geometry; pub mod morton; pub mod octree; -pub mod parallel_octree; pub mod parsort; +pub mod serial; +pub mod tools; pub mod types; diff --git a/src/octree.rs b/src/octree.rs index 64cdb08..a3caa64 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,325 +1,727 @@ -//! Definition of a linear octree +//! Parallel Octree structure + +use std::collections::HashMap; use crate::{ - constants::{DEEPEST_LEVEL, NLEVELS}, + constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, + parsort::parsort, + tools::gather_to_all, }; -use bytemuck; -use std::collections::HashMap; -use vtkio; - -/// A neighbour -pub struct Neighbour { - /// Direction - pub direction: [i64; 3], - /// Level - pub level: usize, - /// Morton key - pub key: MortonKey, + +use mpi::{ + datatype::{Partition, PartitionMut}, + point_to_point as p2p, + traits::{Root, Source}, +}; + +use itertools::{izip, Itertools}; +use mpi::{ + collective::SystemOperation, + traits::{CommunicatorCollectives, Destination}, +}; +use rand::Rng; + +/// Compute the global bounding box across all points on all processes. +pub fn compute_global_bounding_box( + points: &[f64], + comm: &C, +) -> PhysicalBox { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + + // Now compute the minimum and maximum across each dimension. 
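The `bytemuck::cast_slice` call above reinterprets the flat coordinate buffer as packed 3-vectors without copying; it panics unless the length is a multiple of 3, which the preceding assert guards. A standalone sketch (illustration only):

    fn main() {
        // Flat x y z x y z ... buffer holding two points.
        let flat: Vec<f64> = vec![0.0, 0.5, 1.0, 2.0, 2.5, 3.0];
        assert_eq!(flat.len() % 3, 0);

        // Zero-copy reinterpretation as a slice of [f64; 3].
        let points: &[[f64; 3]] = bytemuck::cast_slice(&flat);
        assert_eq!(points.len(), 2);
        assert_eq!(points[1], [2.0, 2.5, 3.0]);
    }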
+ + let mut xmin = f64::MAX; + let mut xmax = f64::MIN; + + let mut ymin = f64::MAX; + let mut ymax = f64::MIN; + + let mut zmin = f64::MAX; + let mut zmax = f64::MIN; + + for point in points { + let x = point[0]; + let y = point[1]; + let z = point[2]; + + xmin = f64::min(xmin, x); + xmax = f64::max(xmax, x); + + ymin = f64::min(ymin, y); + ymax = f64::max(ymax, y); + + zmin = f64::min(zmin, z); + zmax = f64::max(zmax, z); + } + + let mut global_xmin = 0.0; + let mut global_xmax = 0.0; + + let mut global_ymin = 0.0; + let mut global_ymax = 0.0; + + let mut global_zmin = 0.0; + let mut global_zmax = 0.0; + + comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); + comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); + + comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); + comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); + + comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); + comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); + + let xdiam = global_xmax - global_xmin; + let ydiam = global_ymax - global_ymin; + let zdiam = global_zmax - global_zmin; + + let xmean = global_xmin + 0.5 * xdiam; + let ymean = global_ymin + 0.5 * ydiam; + let zmean = global_zmin + 0.5 * zdiam; + + // We increase diameters by box size on deepest level + // and use the maximum diameter to compute a + // cubic bounding box. + + let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; + + let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); + + let max_diam = max_diam * (1.0 + deepest_box_diam); + + PhysicalBox::new([ + xmean - 0.5 * max_diam, + ymean - 0.5 * max_diam, + zmean - 0.5 * max_diam, + xmean + 0.5 * max_diam, + ymean + 0.5 * max_diam, + zmean + 0.5 * max_diam, + ]) } -/// An octree -pub struct Octree { - leaf_keys: Vec, - points: Vec<[f64; 3]>, - point_to_level_keys: [Vec; NLEVELS], - bounding_box: PhysicalBox, - key_counts: HashMap, - max_leaf_level: usize, - max_points_in_leaf: usize, +/// Convert points to Morton keys on specified level. +pub fn points_to_morton( + points: &[f64], + max_level: usize, + comm: &C, +) -> (Vec, PhysicalBox) { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + + // Make sure that max level never exceeds DEEPEST_LEVEL + let max_level = if max_level > DEEPEST_LEVEL as usize { + DEEPEST_LEVEL as usize + } else { + max_level + }; + + // Compute the physical bounding box. + + let bounding_box = compute_global_bounding_box(points, comm); + + // Bunch the points in arrays of 3. + + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + + let keys = points + .iter() + .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) + .collect_vec(); + + // Now want to get weighted Morton keys. We use a HashMap. + + let mut value_counts = HashMap::::new(); + + for key in &keys { + *value_counts.entry(*key).or_insert(0) += 1; + } + + // let weighted_keys = value_counts + // .iter() + // .map(|(&key, &weight)| WeightedMortonKey::new(key, weight)) + // .collect_vec(); + + (keys, bounding_box) } -impl Octree { - /// Create octress from points - pub fn from_points(points: &[f64], max_level: usize, max_points_per_box: usize) -> Self { - // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); +/// Block partition of tree. +/// +/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned +/// keys and the associated coarse keys. 
+/// A necessary condition for the block partitioning is that +// all sorted keys are on the same level. +pub fn block_partition( + sorted_keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> (Vec, Vec) { + let rank = comm.rank(); + if comm.size() == 1 { + // On a single node block partitioning should not do anything. + return (sorted_keys.to_vec(), vec![MortonKey::root()]); + } - // Make sure that max level never exceeds DEEPEST_LEVEL - let max_level = if max_level > DEEPEST_LEVEL as usize { - DEEPEST_LEVEL as usize - } else { - max_level - }; + let mut completed_region = sorted_keys + .first() + .unwrap() + .fill_between_keys(*sorted_keys.last().unwrap()); + + completed_region.insert(0, *sorted_keys.first().unwrap()); + completed_region.push(*sorted_keys.last().unwrap()); + + // Get the smallest level members of the completed region. + + let min_level = completed_region + .iter() + .map(|elem| elem.level()) + .min() + .unwrap(); + + // Each process selects its largest boxes. These are used to create + // a coarse tree. + + let largest_boxes = completed_region + .iter() + .filter(|elem| elem.level() == min_level) + .copied() + .collect_vec(); + + let coarse_tree = complete_tree(&largest_boxes, rng, comm); + + // We want to partition the coarse tree. But we need the correct weights. The idea + // is that we use the number of original leafs that intersect with the coarse tree + // as leafs. In order to compute this we send the coarse tree around to all processes + // so that each process computes for each coarse tree element how many of its keys + // intersect with each node of the coarse tree. We then sum up the local weight for each + // coarse tree node across all nodes to get the weight. + + let global_coarse_tree = gather_to_all(&coarse_tree, comm); - // Compute the physical bounding box. + // We also want to send around a corresponding array of ranks so that for each global coarse tree key + // we have the rank of where it originates from. - let bounding_box = PhysicalBox::from_points(points); + let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); - // Bunch the points in arrays of 3. + // We now compute the local weights. + let mut local_weights = vec![0 as usize; global_coarse_tree.len()]; - let points: &[[f64; 3]] = bytemuck::cast_slice(points); - let npoints = points.len(); + // In the following loop we want to be a bit smart. We do not iterate through all the local elements. + // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region + // of our sorted keys that overlaps with the coarse tree region. - // We create a vector of keys for each point on each level. We compute the - // keys on the deepest level and fill the other levels by going from - // parent to parent. + // Let's find the start of our region. The start of our region is a coarse key that is an ancestor + // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. - let mut point_to_level_keys: [Vec; NLEVELS] = Default::default(); - point_to_level_keys[DEEPEST_LEVEL as usize] = points + let first_key = *sorted_keys.first().unwrap(); + + let first_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) + .count(); + + // Now we need to find the end index of our region. For this again we find the index of our coarse tree that + // is an ancestor of our last key. 
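The `take_while(..).count()` idiom used here yields the position of the first element that fails the predicate, i.e. the index of the first coarse key that is an ancestor. A toy illustration on integers (hypothetical values):

    fn main() {
        let sorted = [2u64, 4, 6, 9, 11];
        // Index of the first element that is >= 7: count the elements before it.
        let idx = sorted.iter().take_while(|&&x| x < 7).count();
        assert_eq!(idx, 3);
        assert_eq!(sorted[idx], 9);
        // If every element passed the predicate, `idx` would equal `sorted.len()`.
    }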
+ let last_key = *sorted_keys.last().unwrap(); + + let last_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) + .count(); + + // We now only need to iterate through between the first and last coarse index in the coarse tree. + // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). + + for (w, &global_coarse_key) in izip!( + local_weights[first_coarse_index..=last_coarse_index].iter_mut(), + global_coarse_tree[first_coarse_index..=last_coarse_index].iter() + ) { + *w += sorted_keys .iter() - .map(|&point| { - MortonKey::from_physical_point(point, &bounding_box, DEEPEST_LEVEL as usize) - }) - .collect::>(); - - for index in (1..=DEEPEST_LEVEL as usize).rev() { - let mut new_vec = Vec::::with_capacity(npoints); - for &key in &point_to_level_keys[index] { - new_vec.push(key.parent()); - } - point_to_level_keys[index - 1] = new_vec; - } + .filter(|&&key| global_coarse_key.is_ancestor(key)) + .count(); + } - // We now have to create level keys. We are starting at the root and recursing - // down until each box has fewer than max_points_per_box keys. + // We now need to sum up the weights across all processes. - // First we compute the counts of each key on each level. For that we create - // for each level a Hashmap for the keys and then add up. + let mut weights = vec![0 as usize; global_coarse_tree.len()]; - let mut key_counts: HashMap = Default::default(); + comm.all_reduce_into(&local_weights, &mut weights, SystemOperation::sum()); - for keys in &point_to_level_keys { - for key in keys { - *key_counts.entry(*key).or_default() += 1; - } - } + // Each process now has all weights. However, we only need the ones for the current process. + // So we just filter the rest out. - // We can now easily create an adaptive tree by subdividing. We do this by - // a recursive function. - - let mut leaf_keys = Vec::::new(); - - fn recurse_keys( - key: MortonKey, - key_counts: &HashMap, - leaf_keys: &mut Vec, - max_points_per_box: usize, - max_level: usize, - ) { - let level = key.level(); - // A key may have not be associated with points. This happens if one of the children on - // the previous level has no points in its physical box. However, we want to create a - // complete tree. So we still add this one empty child. - if let Some(&count) = key_counts.get(&key) { - if count > max_points_per_box && level < max_level { - for child in key.children() { - recurse_keys(child, key_counts, leaf_keys, max_points_per_box, max_level); - } - } else { - leaf_keys.push(key) - } + let weights = izip!(coarse_tree_ranks, weights) + .filter_map(|(r, weight)| { + if r == rank as usize { + Some(weight) } else { - leaf_keys.push(key) + None } - } + }) + .collect_vec(); - // Now execute the recursion starting from root + let coarse_tree = partition(&coarse_tree, &weights, comm); - recurse_keys( - MortonKey::root(), - &key_counts, - &mut leaf_keys, - max_points_per_box, - max_level, - ); + ( + redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm), + coarse_tree, + ) - // The leaf keys are now a complete linear tree. But they are not yet balanced. - // In the final step we balance the leafs. + // We now need to redistribute the global tree according to the coarse tree. +} - let leaf_keys = MortonKey::balance(&leaf_keys, MortonKey::root()); +/// Redistribute sorted keys with respect to a linear coarse tree. 
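Before the redistribution helper below, here is a hedged end-to-end sketch of how the pieces introduced so far are meant to compose. It assumes the `bempp_octree::octree` signatures defined in this patch and an initialised MPI world, and mirrors the flow of the parallel test example; level 6 is an arbitrary choice that keeps all keys on one level, as `block_partition` requires:

    use bempp_octree::octree::{block_partition, linearize, points_to_morton};
    use mpi::traits::*;
    use rand::prelude::*;
    use rand_chacha::ChaCha8Rng;

    fn main() {
        let universe = mpi::initialize().unwrap();
        let world = universe.world();
        let mut rng = ChaCha8Rng::seed_from_u64(0);

        // Hypothetical input: 100 local points in [0, 1)^3, flattened as x y z x y z ...
        let points: Vec<f64> = (0..300).map(|_| rng.gen::<f64>()).collect();

        // Points -> Morton keys on level 6.
        let (keys, _bounding_box) = points_to_morton(&points, 6, &world);

        // Globally sort and remove duplicates/ancestors, then block-partition.
        let sorted_keys = linearize(&keys, &mut rng, &world);
        let (partitioned_keys, coarse_keys) = block_partition(&sorted_keys, &mut rng, &world);

        println!(
            "rank {}: {} fine keys, {} coarse keys",
            world.rank(),
            partitioned_keys.len(),
            coarse_keys.len()
        );
    }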
+pub fn redistribute_with_respect_to_coarse_tree( + sorted_keys: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> Vec { + let size = comm.size(); - let mut max_leaf_level = 0; - let mut max_points_in_leaf = 0; + if size == 1 { + return sorted_keys.to_vec(); + } - for key in &leaf_keys { - max_leaf_level = max_leaf_level.max(key.level()); - max_points_in_leaf = - max_points_in_leaf.max(if let Some(&count) = key_counts.get(key) { - count - } else { - 0 - }); - } + // We want to globally redistribute keys so that the keys on each process are descendents + // of the local coarse tree keys. - Self { - leaf_keys, - points: points.to_vec(), - point_to_level_keys, - bounding_box, - key_counts, - max_leaf_level, - max_points_in_leaf, - } - } + // We are using here the fact that the coarse tree is complete and sorted. + // We are sending around to each process the first local index. This + // defines bins in which we sort our keys. The keys are then sent around to the correct + // processes via an alltoallv operation. - /// Leaf keys - pub fn leaf_keys(&self) -> &Vec { - &self.leaf_keys - } + let my_first = *coarse_tree.first().unwrap(); - /// Points - pub fn points(&self) -> &Vec<[f64; 3]> { - &self.points - } + let mut global_bins = Vec::::with_capacity(size as usize); + let global_bins_buff: &mut [MortonKey] = + unsafe { std::mem::transmute(global_bins.spare_capacity_mut()) }; - /// Get level keys for each point - pub fn point_to_level_keys(&self) -> &[Vec; NLEVELS] { - &self.point_to_level_keys - } + comm.all_gather_into(&my_first, global_bins_buff); - /// Bounding box - pub fn bounding_box(&self) -> &PhysicalBox { - &self.bounding_box - } + unsafe { global_bins.set_len(size as usize) }; + + // We now have the first index from each process. We also want + // an upper bound for the last index of the tree to make the sorting into + // bins easier. + global_bins.push(MortonKey::upper_bound()); + + // We now have our bins. We go through our keys and store how + // many keys are assigned to each rank. We are using here that + // our keys and the coarse tree are both sorted. + + // This will store for each rank how many keys will be assigned to it. + + let rank_counts = sort_to_bins(sorted_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); - /// Maximum leaf level - pub fn maximum_leaf_level(&self) -> usize { - self.max_leaf_level + // We now have the counts for each rank. Let's send it around via alltoallv. + + let mut counts_from_proc = vec![0 as i32; size as usize]; + + comm.all_to_all_into(&rank_counts, &mut counts_from_proc); + // Now compute the send and receive displacements. + + // We can now send around the actual elements with an alltoallv. + let send_displs: Vec = rank_counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp as i32) + }) + .collect(); + + let send_partition = Partition::new(&sorted_keys[..], &rank_counts[..], &send_displs[..]); + + let mut recvbuffer = vec![MortonKey::default(); counts_from_proc.iter().sum::() as usize]; + + let recv_displs: Vec = counts_from_proc + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = + PartitionMut::new(&mut recvbuffer[..], counts_from_proc, &recv_displs[..]); + comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + + recvbuffer +} + +/// Create bins from sorted keys. 
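The redistribution above leans on MPI's variable-count all-to-all. A self-contained sketch of that exchange pattern with integer payloads, using the same rsmpi `Partition`/`PartitionMut` types (toy counts, illustration only):

    use mpi::datatype::{Partition, PartitionMut};
    use mpi::traits::*;

    fn main() {
        let universe = mpi::initialize().unwrap();
        let world = universe.world();
        let size = world.size() as usize;
        let rank = world.rank();

        // Toy payload: every rank sends `r + 1` copies of its own rank id to rank r.
        let send_counts: Vec<i32> = (0..size).map(|r| r as i32 + 1).collect();
        let send_buffer: Vec<i32> = (0..size)
            .flat_map(|r| std::iter::repeat(rank).take(r + 1))
            .collect();

        // Exchange the counts so every rank knows how much it will receive.
        let mut recv_counts = vec![0i32; size];
        world.all_to_all_into(&send_counts, &mut recv_counts);

        // Displacements are exclusive prefix sums of the counts.
        let send_displs: Vec<i32> = send_counts
            .iter()
            .scan(0, |acc, &c| {
                let d = *acc;
                *acc += c;
                Some(d)
            })
            .collect();
        let recv_displs: Vec<i32> = recv_counts
            .iter()
            .scan(0, |acc, &c| {
                let d = *acc;
                *acc += c;
                Some(d)
            })
            .collect();

        let mut recv_buffer = vec![0i32; recv_counts.iter().sum::<i32>() as usize];

        let send_partition = Partition::new(&send_buffer[..], &send_counts[..], &send_displs[..]);
        let mut recv_partition =
            PartitionMut::new(&mut recv_buffer[..], &recv_counts[..], &recv_displs[..]);
        world.all_to_all_varcount_into(&send_partition, &mut recv_partition);

        // Rank r now holds `r + 1` entries from every rank, including itself.
        assert_eq!(recv_buffer.len(), size * (rank as usize + 1));
    }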
+pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { + let mut bin_counts = vec![0 as usize; bins.len() - 1]; + + // This iterates over each possible bin and returns also the associated rank. + let mut bin_iter = izip!( + bin_counts.iter_mut(), + bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), + ); + + // We take the first element of the bin iterator. There will always be at least one. + let mut r: &mut usize; + let mut bin_start: &MortonKey; + let mut bin_end: &MortonKey; + (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); + + for &key in sorted_keys.iter() { + if *bin_start <= key && key < *bin_end { + *r += 1; + } else { + // Move the bin forward until it fits. There will always be a fitting bin. + while let Some((rn, (bsn, ben))) = bin_iter.next() { + if *bsn <= key && key < *ben { + *rn += 1; + r = rn; + bin_start = bsn; + bin_end = ben; + break; + } + } + } } - /// Maximum number of points in a leaf box - pub fn max_points_in_leaf_box(&self) -> usize { - self.max_points_in_leaf + bin_counts +} + +/// Return a complete tree generated from local keys and associated coarse keys. +/// +/// The coarse keys are refined until the maximum level is reached or until each coarse key +/// is the ancestor of at most `max_keys` fine keys. +pub fn create_local_tree( + sorted_fine_keys: &[MortonKey], + coarse_keys: &[MortonKey], + mut max_level: usize, + max_keys: usize, +) -> Vec { + if max_level > DEEPEST_LEVEL as usize { + max_level = DEEPEST_LEVEL as usize; } - /// Number of points in the box indexed by a key - pub fn number_of_points_in_key(&self, key: MortonKey) -> usize { - if let Some(&count) = self.key_counts.get(&key) { - count + // We split the sorted fine keys into subslices so that each subslice + // is associated with a coarse slice. For this we need to add an upper bound + // coarse keys to ensure that we have suitable bins. + + let mut bins = coarse_keys.to_vec(); + bins.push(MortonKey::upper_bound()); + + let counts = sort_to_bins(&sorted_fine_keys, &bins); + + // We now know how many fine keys are associated with each coarse block. We iterate + // through and locally refine for each block that requires it. + + let mut remainder = sorted_fine_keys; + let mut new_coarse_keys = Vec::::new(); + + for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { + let current; + (current, remainder) = remainder.split_at(count); + if coarse_key.level() < max_level && current.len() > max_keys { + // We need to refine the current split. + new_coarse_keys.extend_from_slice( + create_local_tree( + current, + coarse_key.children().as_slice(), + max_level, + max_keys, + ) + .as_slice(), + ); } else { - 0 + new_coarse_keys.push(coarse_key) } } - /// Export the tree to vtk - pub fn export_to_vtk(&self, file_path: &str) { - use vtkio::model::{ - Attributes, ByteOrder, CellType, Cells, DataSet, IOBuffer, UnstructuredGridPiece, - Version, VertexNumbers, - }; + coarse_keys.to_vec() +} - // Each box has 8 corners with 3 coordinates each, hence 24 floats per key. - let mut points = Vec::::new(); - // 8 coords per box, hence 8 * nkeys values in connectivity. - let mut connectivity = Vec::::new(); - // Store the vtk offset for each box. - let mut offsets = Vec::::new(); +/// Linearize a set of weighted Morton keys. +pub fn linearize( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let size = comm.size(); + let rank = comm.rank(); - let bounding_box = self.bounding_box(); + // If we only have one process we use the standard serial linearization. 
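`sort_to_bins` above is the same helper `create_local_tree` uses to split sorted fine keys among their coarse ancestors. A usage sketch (it assumes, as `create_local_tree` does, that descendants of a key sort at or after that key and before its next sibling):

    use bempp_octree::morton::MortonKey;
    use bempp_octree::octree::sort_to_bins;

    fn main() {
        // Bins: the eight level-1 children of the root, closed off by an upper bound.
        let mut bins = MortonKey::root().children().to_vec();
        bins.push(MortonKey::upper_bound());

        // Sorted fine keys: all 64 level-2 descendants of the root.
        let mut fine_keys: Vec<MortonKey> = MortonKey::root()
            .children()
            .iter()
            .flat_map(|child| child.children())
            .collect();
        fine_keys.sort_unstable();

        // Eight level-2 keys fall under each level-1 bin.
        let counts = sort_to_bins(&fine_keys, &bins);
        assert_eq!(counts, vec![8usize; 8]);
        assert_eq!(counts.iter().sum::<usize>(), fine_keys.len());
    }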
- // Go through the keys and add coordinates and connectivity. - // Box coordinates are already in the right order, so connectivity - // just counts up. We don't mind doubly counted vertices from two boxes. - let mut point_count = 0; - let mut key_count = 0; + if size == 1 { + return MortonKey::linearize(keys); + } - for key in self.leaf_keys().iter() { - // We only want to export non-empty boxes. - if self.number_of_points_in_key(*key) == 0 { - continue; - } - let coords = key.physical_box(bounding_box).corners(); + // We are first sorting the keys. Then in a linear process across all processors we + // go through the arrays and delete ancestors of nodes. + + let sorted_keys = parsort(&keys, comm, rng); - key_count += 1; - offsets.push(8 * key_count); + // Each process needs to send its first element to the previous process. Each process + // then goes through its own list and retains elements that are not ancestors of the + // next element. - for coord in &coords { - points.push(coord[0]); - points.push(coord[1]); - points.push(coord[2]); + let mut result = Vec::::new(); - connectivity.push(point_count); - point_count += 1; + if rank == size - 1 { + comm.process_at_rank(rank - 1) + .send(sorted_keys.first().unwrap()); + + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); } } - let vtk_file = vtkio::Vtk { - version: Version::new((1, 0)), - title: String::new(), - byte_order: ByteOrder::LittleEndian, - file_path: None, - data: DataSet::inline(UnstructuredGridPiece { - points: IOBuffer::F64(points), - cells: Cells { - cell_verts: VertexNumbers::XML { - connectivity, - offsets, - }, - types: vec![CellType::Hexahedron; key_count as usize], - }, - data: Attributes { - point: vec![], - cell: vec![], - }, - }), + result.push(*sorted_keys.last().unwrap()); + } else { + let (other, _status) = if rank > 0 { + p2p::send_receive( + sorted_keys.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + comm.any_process().receive::() }; + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); + } + } + + let last = *sorted_keys.last().unwrap(); - vtk_file.export_ascii(file_path).unwrap(); + if !last.is_ancestor(other) { + result.push(last) + } } - // We can now create the vtk object. + result } -#[cfg(test)] -mod test { - use super::Octree; - use rand::prelude::*; +/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. +pub fn partition( + sorted_keys: &[MortonKey], + weights: &[usize], + comm: &C, +) -> Vec { + assert_eq!(sorted_keys.len(), weights.len()); - fn get_points_on_sphere(npoints: usize) -> Vec { - let mut rng = rand::rngs::StdRng::seed_from_u64(0); - let normal = rand_distr::Normal::new(0.0, 1.0).unwrap(); + let size = comm.size(); + let rank = comm.rank(); - let mut points = Vec::::with_capacity(3 * npoints); - for _ in 0..(npoints) { - let x: f64 = normal.sample(&mut rng); - let y: f64 = normal.sample(&mut rng); - let z: f64 = normal.sample(&mut rng); + // If we only have one process we simply return. - let norm = (x * x + y * y + z * z).sqrt(); + if size == 1 { + return sorted_keys.to_vec(); + } - points.push(x / norm); - points.push(y / norm); - points.push(z / norm); - } + // First scan the weight. 
+ // We scan the local arrays, then use a global scan operation on the last element + // of each array to get the global sums and then we update the array of each rank + // with the sum from the previous ranks. + + let mut scan: Vec = weights + .iter() + .scan(0, |state, x| { + *state += *x; + Some(*state) + }) + .collect_vec(); + let scan_last = *scan.last().unwrap(); + let mut scan_result: usize = 0; + comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); + for elem in &mut scan { + *elem += scan_result; + } - points + let mut total_weight = if rank == size - 1 { + *scan.last().unwrap() + } else { + 0 + }; + + // Scan the weight (form cumulative sums) and broadcast the total weight (last entry on last process) + // to all other processes. + + comm.process_at_rank(size - 1) + .broadcast_into(&mut total_weight); + + let w = total_weight / (size as usize); + let k = total_weight % (size as usize); + + let mut hash_map = HashMap::>::new(); + + // Sort the elements into bins according to which process they should be sent. + + for p in 1..=size as usize { + let q = if p <= k as usize { + izip!(sorted_keys, &scan) + .filter_map(|(&key, &s)| { + if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) + || (p == size as usize && (p - 1) * (1 + w) <= s) + { + Some(key) + } else { + None + } + }) + .collect_vec() + } else { + izip!(sorted_keys, &scan) + .filter_map(|(&key, &s)| { + if ((p - 1) * w + k <= s && s < p * w + k) + || (p == size as usize && (p - 1) * w + k <= s) + { + Some(key) + } else { + None + } + }) + .collect_vec() + }; + hash_map.insert(p - 1, q); } - #[test] - fn test_octree() { - use std::time::Instant; + // Now distribute the data with an all to all v. + // We create a vector of how many elements to send to each process and + // then send the actual data. + + let mut counts = vec![0 as i32; size as usize]; + let mut counts_from_processor = vec![0 as i32; size as usize]; - let npoints = 10000; - let points = get_points_on_sphere(npoints); - let max_level = 7; - let max_points_per_box = 100; + let mut all_elements = Vec::::new(); + for (index, c) in counts.iter_mut().enumerate() { + let elements = hash_map.get(&index).unwrap(); + *c = elements.len() as i32; + all_elements.extend(elements.iter()) + } + + // Send around the number of elements for each process + comm.all_to_all_into(&counts, &mut counts_from_processor); + + // We have the number of elements for each process now. Now send around + // the actual elements. + + // We can now send around the actual elements with an alltoallv. + let send_displs: Vec = counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp as i32) + }) + .collect(); + + let send_partition = Partition::new(&all_elements, &counts[..], &send_displs[..]); + + let mut recvbuffer = + vec![MortonKey::default(); counts_from_processor.iter().sum::() as usize]; + + let recv_displs: Vec = counts_from_processor + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = + PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); + comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + + recvbuffer.sort_unstable(); + recvbuffer +} - let start = Instant::now(); - let octree = Octree::from_points(&points, max_level, max_points_per_box); - let duration = start.elapsed(); +/// Given a distributed set of keys, generate a complete linear Octree. 
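A quick numeric illustration of the split rule in `partition` above: with total weight `W` over `size` ranks, `w = W / size` and `k = W % size`, so the first `k` ranks target `w + 1` units of weight and the rest target `w` (toy numbers, illustration only):

    fn main() {
        // Toy numbers: total weight 10 spread over 3 ranks.
        let (total_weight, size) = (10usize, 3usize);
        let (w, k) = (total_weight / size, total_weight % size);

        // Target weight per rank: the first k ranks absorb the remainder.
        let targets: Vec<usize> = (0..size).map(|r| if r < k { w + 1 } else { w }).collect();

        assert_eq!(targets, vec![4usize, 3, 3]);
        assert_eq!(targets.iter().sum::<usize>(), total_weight);
    }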
+pub fn complete_tree( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let mut linearized_keys = linearize(keys, rng, comm); - println!("Creation time: {}", duration.as_millis()); - println!("Number of leaf keys: {}", octree.leaf_keys().len()); - println!("Bounding box: {}", octree.bounding_box()); + let size = comm.size(); + let rank = comm.rank(); + + if size == 1 { + return MortonKey::complete_tree(linearized_keys.as_slice()); } - #[test] - fn test_export() { - let fname = "_test_sphere.vtk"; - let npoints = 10000; - let points = get_points_on_sphere(npoints); - let max_level = 7; - let max_points_per_box = 100; - - let octree = Octree::from_points(&points, max_level, max_points_per_box); - - octree.export_to_vtk(fname); - println!("Maximum leaf level: {}", octree.maximum_leaf_level()); - println!( - "Maximum number of points in leaf box: {}", - octree.max_points_in_leaf_box() - ); + // Now insert on the first and last process the first and last child of the + // finest ancestor of first/last box on deepest level + + // Send first element to previous rank and insert into local keys. + // On the first process we also need to insert the first child of the finest + // ancestor of the deepest first key and first element. Correspondingly on the last process + // we need to insert the last child of the finest ancester of the deepest last key and last element. + + if rank == size - 1 { + // On last process send first element to previous processes and insert last + // possible box from region into list. + comm.process_at_rank(rank - 1) + .send(linearized_keys.first().unwrap()); + let last_key = *linearized_keys.last().unwrap(); + let deepest_last = MortonKey::deepest_last(); + if !last_key.is_ancestor(deepest_last) { + let ancestor = deepest_last.finest_common_ancestor(last_key); + linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); + } + } else { + let (other, _status) = if rank > 0 { + // On intermediate process receive from the next process + // and send first element to previous process. + p2p::send_receive( + linearized_keys.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + // On first process insert at the beginning the first possible + // box in the region and receive the key from next process. + let first_key = *linearized_keys.first().unwrap(); + let deepest_first = MortonKey::deepest_first(); + if !first_key.is_ancestor(deepest_first) { + let ancestor = deepest_first.finest_common_ancestor(first_key); + linearized_keys.insert(0, ancestor.children()[0]); + } + + comm.process_at_rank(1).receive::() + }; + // If we are not at the last process we need to introduce the received key + // into our list. + linearized_keys.push(other); + }; + + // Now complete the regions defined by the keys on each process. + + let mut result = Vec::::new(); + + for (&key1, &key2) in linearized_keys.iter().tuple_windows() { + result.push(key1); + result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); + } + + if rank == size - 1 { + result.push(*linearized_keys.last().unwrap()); } + + result } diff --git a/src/parallel_octree.rs b/src/parallel_octree.rs deleted file mode 100644 index e77cdf1..0000000 --- a/src/parallel_octree.rs +++ /dev/null @@ -1,836 +0,0 @@ -//! 
Parallel Octree structure - -use std::collections::HashMap; - -use crate::{ - constants::{DEEPEST_LEVEL, NSIBLINGS}, - geometry::PhysicalBox, - morton::MortonKey, - parsort::{array_to_root, parsort}, -}; - -use mpi::{ - datatype::{Partition, PartitionMut}, - point_to_point as p2p, - traits::{Equivalence, Root, Source}, -}; - -use itertools::{izip, Itertools}; -use mpi::{ - collective::SystemOperation, - traits::{CommunicatorCollectives, Destination}, -}; -use rand::Rng; - -// /// A weighted Mortonkey contains weights to enable load balancing. -// #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Equivalence)] -// pub struct WeightedMortonKey { -// /// The actual MortonKey. -// pub key: MortonKey, -// /// The weight of the key, typically the number of points in the corresponding octant. -// pub weight: usize, -// } - -// impl WeightedMortonKey { -// /// Get a new weighted Morton key -// pub fn new(key: MortonKey, weight: usize) -> Self { -// Self { key, weight } -// } -// } - -// impl MinValue for WeightedMortonKey { -// fn min_value() -> Self { -// WeightedMortonKey { -// key: MortonKey::from_index_and_level([0, 0, 0], 0), -// weight: 0, -// } -// } -// } - -// impl MaxValue for WeightedMortonKey { -// fn max_value() -> Self { -// WeightedMortonKey { -// key: MortonKey::deepest_last(), -// weight: usize::MAX, -// } -// } -// } - -// impl Default for WeightedMortonKey { -// fn default() -> Self { -// WeightedMortonKey::new(Default::default(), 0) -// } -// } - -// impl Display for WeightedMortonKey { -// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { -// write!(f, "(Key: {}, Weight: {}", self.key, self.weight) -// } -// } - -/// Compute the global bounding box across all points on all processes. -pub fn compute_global_bounding_box( - points: &[f64], - comm: &C, -) -> PhysicalBox { - // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); - let points: &[[f64; 3]] = bytemuck::cast_slice(points); - - // Now compute the minimum and maximum across each dimension. - - let mut xmin = f64::MAX; - let mut xmax = f64::MIN; - - let mut ymin = f64::MAX; - let mut ymax = f64::MIN; - - let mut zmin = f64::MAX; - let mut zmax = f64::MIN; - - for point in points { - let x = point[0]; - let y = point[1]; - let z = point[2]; - - xmin = f64::min(xmin, x); - xmax = f64::max(xmax, x); - - ymin = f64::min(ymin, y); - ymax = f64::max(ymax, y); - - zmin = f64::min(zmin, z); - zmax = f64::max(zmax, z); - } - - let mut global_xmin = 0.0; - let mut global_xmax = 0.0; - - let mut global_ymin = 0.0; - let mut global_ymax = 0.0; - - let mut global_zmin = 0.0; - let mut global_zmax = 0.0; - - comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); - comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); - - comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); - comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); - - comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); - comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); - - let xdiam = global_xmax - global_xmin; - let ydiam = global_ymax - global_ymin; - let zdiam = global_zmax - global_zmin; - - let xmean = global_xmin + 0.5 * xdiam; - let ymean = global_ymin + 0.5 * ydiam; - let zmean = global_zmin + 0.5 * zdiam; - - // We increase diameters by box size on deepest level - // and use the maximum diameter to compute a - // cubic bounding box. 
- - let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; - - let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); - - let max_diam = max_diam * (1.0 + deepest_box_diam); - - PhysicalBox::new([ - xmean - 0.5 * max_diam, - ymean - 0.5 * max_diam, - zmean - 0.5 * max_diam, - xmean + 0.5 * max_diam, - ymean + 0.5 * max_diam, - zmean + 0.5 * max_diam, - ]) -} - -/// Convert points to Morton keys on specified level. -pub fn points_to_morton( - points: &[f64], - max_level: usize, - comm: &C, -) -> (Vec, PhysicalBox) { - // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); - - // Make sure that max level never exceeds DEEPEST_LEVEL - let max_level = if max_level > DEEPEST_LEVEL as usize { - DEEPEST_LEVEL as usize - } else { - max_level - }; - - // Compute the physical bounding box. - - let bounding_box = compute_global_bounding_box(points, comm); - - // Bunch the points in arrays of 3. - - let points: &[[f64; 3]] = bytemuck::cast_slice(points); - - let keys = points - .iter() - .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) - .collect_vec(); - - // Now want to get weighted Morton keys. We use a HashMap. - - let mut value_counts = HashMap::::new(); - - for key in &keys { - *value_counts.entry(*key).or_insert(0) += 1; - } - - // let weighted_keys = value_counts - // .iter() - // .map(|(&key, &weight)| WeightedMortonKey::new(key, weight)) - // .collect_vec(); - - (keys, bounding_box) -} - -/// Block partition of tree. -/// -/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned -/// keys and the associated coarse keys. -/// A necessary condition for the block partitioning is that -// all sorted keys are on the same level. -pub fn block_partition( - sorted_keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> (Vec, Vec) { - let rank = comm.rank(); - if comm.size() == 1 { - // On a single node block partitioning should not do anything. - return (sorted_keys.to_vec(), vec![MortonKey::root()]); - } - - let mut completed_region = sorted_keys - .first() - .unwrap() - .fill_between_keys(*sorted_keys.last().unwrap()); - - completed_region.insert(0, *sorted_keys.first().unwrap()); - completed_region.push(*sorted_keys.last().unwrap()); - - // Get the smallest level members of the completed region. - - let min_level = completed_region - .iter() - .map(|elem| elem.level()) - .min() - .unwrap(); - - // Each process selects its largest boxes. These are used to create - // a coarse tree. - - let largest_boxes = completed_region - .iter() - .filter(|elem| elem.level() == min_level) - .copied() - .collect_vec(); - - let coarse_tree = complete_tree(&largest_boxes, rng, comm); - - // We want to partition the coarse tree. But we need the correct weights. The idea - // is that we use the number of original leafs that intersect with the coarse tree - // as leafs. In order to compute this we send the coarse tree around to all processes - // so that each process computes for each coarse tree element how many of its keys - // intersect with each node of the coarse tree. We then sum up the local weight for each - // coarse tree node across all nodes to get the weight. - - let global_coarse_tree = gather_to_all(&coarse_tree, comm); - - // We also want to send around a corresponding array of ranks so that for each global coarse tree key - // we have the rank of where it originates from. - - let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); - - // We now compute the local weights. 
- let mut local_weights = vec![0 as usize; global_coarse_tree.len()]; - - // In the following loop we want to be a bit smart. We do not iterate through all the local elements. - // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region - // of our sorted keys that overlaps with the coarse tree region. - - // Let's find the start of our region. The start of our region is a coarse key that is an ancestor - // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. - - let first_key = *sorted_keys.first().unwrap(); - - let first_coarse_index = global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) - .count(); - - // Now we need to find the end index of our region. For this again we find the index of our coarse tree that - // is an ancestor of our last key. - let last_key = *sorted_keys.last().unwrap(); - - let last_coarse_index = global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) - .count(); - - // We now only need to iterate through between the first and last coarse index in the coarse tree. - // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). - - for (w, &global_coarse_key) in izip!( - local_weights[first_coarse_index..=last_coarse_index].iter_mut(), - global_coarse_tree[first_coarse_index..=last_coarse_index].iter() - ) { - *w += sorted_keys - .iter() - .filter(|&&key| global_coarse_key.is_ancestor(key)) - .count(); - } - - // We now need to sum up the weights across all processes. - - let mut weights = vec![0 as usize; global_coarse_tree.len()]; - - comm.all_reduce_into(&local_weights, &mut weights, SystemOperation::sum()); - - // Each process now has all weights. However, we only need the ones for the current process. - // So we just filter the rest out. - - let weights = izip!(coarse_tree_ranks, weights) - .filter_map(|(r, weight)| { - if r == rank as usize { - Some(weight) - } else { - None - } - }) - .collect_vec(); - - let coarse_tree = partition(&coarse_tree, &weights, comm); - - ( - redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm), - coarse_tree, - ) - - // We now need to redistribute the global tree according to the coarse tree. -} - -/// Redistribute sorted keys with respect to a linear coarse tree. -pub fn redistribute_with_respect_to_coarse_tree( - sorted_keys: &[MortonKey], - coarse_tree: &[MortonKey], - comm: &C, -) -> Vec { - let size = comm.size(); - - if size == 1 { - return sorted_keys.to_vec(); - } - - // We want to globally redistribute keys so that the keys on each process are descendents - // of the local coarse tree keys. - - // We are using here the fact that the coarse tree is complete and sorted. - // We are sending around to each process the first local index. This - // defines bins in which we sort our keys. The keys are then sent around to the correct - // processes via an alltoallv operation. - - let my_first = *coarse_tree.first().unwrap(); - - let mut global_bins = Vec::::with_capacity(size as usize); - let global_bins_buff: &mut [MortonKey] = - unsafe { std::mem::transmute(global_bins.spare_capacity_mut()) }; - - comm.all_gather_into(&my_first, global_bins_buff); - - unsafe { global_bins.set_len(size as usize) }; - - // We now have the first index from each process. We also want - // an upper bound for the last index of the tree to make the sorting into - // bins easier. 
- global_bins.push(MortonKey::upper_bound()); - - // We now have our bins. We go through our keys and store how - // many keys are assigned to each rank. We are using here that - // our keys and the coarse tree are both sorted. - - // This will store for each rank how many keys will be assigned to it. - - let rank_counts = sort_to_bins(sorted_keys, &global_bins) - .iter() - .map(|&elem| elem as i32) - .collect_vec(); - - // We now have the counts for each rank. Let's send it around via alltoallv. - - let mut counts_from_proc = vec![0 as i32; size as usize]; - - comm.all_to_all_into(&rank_counts, &mut counts_from_proc); - // Now compute the send and receive displacements. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = rank_counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp as i32) - }) - .collect(); - - let send_partition = Partition::new(&sorted_keys[..], &rank_counts[..], &send_displs[..]); - - let mut recvbuffer = vec![MortonKey::default(); counts_from_proc.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_proc - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_proc, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); - - recvbuffer -} - -/// Create bins from sorted keys. -pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { - let mut bin_counts = vec![0 as usize; bins.len() - 1]; - - // This iterates over each possible bin and returns also the associated rank. - let mut bin_iter = izip!( - bin_counts.iter_mut(), - bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), - ); - - // We take the first element of the bin iterator. There will always be at least one. - let mut r: &mut usize; - let mut bin_start: &MortonKey; - let mut bin_end: &MortonKey; - (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - - for &key in sorted_keys.iter() { - if *bin_start <= key && key < *bin_end { - *r += 1; - } else { - // Move the bin forward until it fits. There will always be a fitting bin. - while let Some((rn, (bsn, ben))) = bin_iter.next() { - if *bsn <= key && key < *ben { - *rn += 1; - r = rn; - bin_start = bsn; - bin_end = ben; - break; - } - } - } - } - - bin_counts -} - -/// Return a complete tree generated from local keys and associated coarse keys. -/// -/// The coarse keys are refined until the maximum level is reached or until each coarse key -/// is the ancestor of at most `max_keys` fine keys. -pub fn create_local_tree( - sorted_fine_keys: &[MortonKey], - coarse_keys: &[MortonKey], - mut max_level: usize, - max_keys: usize, -) -> Vec { - if max_level > DEEPEST_LEVEL as usize { - max_level = DEEPEST_LEVEL as usize; - } - - // We split the sorted fine keys into subslices so that each subslice - // is associated with a coarse slice. For this we need to add an upper bound - // coarse keys to ensure that we have suitable bins. - - let mut bins = coarse_keys.to_vec(); - bins.push(MortonKey::upper_bound()); - - let counts = sort_to_bins(&sorted_fine_keys, &bins); - - // We now know how many fine keys are associated with each coarse block. We iterate - // through and locally refine for each block that requires it. 
- - let mut remainder = sorted_fine_keys; - let mut new_coarse_keys = Vec::::new(); - - for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { - let current; - (current, remainder) = remainder.split_at(count); - if coarse_key.level() < max_level && current.len() > max_keys { - // We need to refine the current split. - new_coarse_keys.extend_from_slice( - create_local_tree( - current, - coarse_key.children().as_slice(), - max_level, - max_keys, - ) - .as_slice(), - ); - } else { - new_coarse_keys.push(coarse_key) - } - } - - coarse_keys.to_vec() -} - -/// Linearize a set of weighted Morton keys. -pub fn linearize( - keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> Vec { - let size = comm.size(); - let rank = comm.rank(); - - // If we only have one process we use the standard serial linearization. - - if size == 1 { - return MortonKey::linearize(keys); - } - - // We are first sorting the keys. Then in a linear process across all processors we - // go through the arrays and delete ancestors of nodes. - - let sorted_keys = parsort(&keys, comm, rng); - - // Each process needs to send its first element to the previous process. Each process - // then goes through its own list and retains elements that are not ancestors of the - // next element. - - let mut result = Vec::::new(); - - if rank == size - 1 { - comm.process_at_rank(rank - 1) - .send(sorted_keys.first().unwrap()); - - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } - } - - result.push(*sorted_keys.last().unwrap()); - } else { - let (other, _status) = if rank > 0 { - p2p::send_receive( - sorted_keys.first().unwrap(), - &comm.process_at_rank(rank - 1), - &comm.process_at_rank(rank + 1), - ) - } else { - comm.any_process().receive::() - }; - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } - } - - let last = *sorted_keys.last().unwrap(); - - if !last.is_ancestor(other) { - result.push(last) - } - } - - result -} - -/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. -pub fn partition( - sorted_keys: &[MortonKey], - weights: &[usize], - comm: &C, -) -> Vec { - assert_eq!(sorted_keys.len(), weights.len()); - - let size = comm.size(); - let rank = comm.rank(); - - // If we only have one process we simply return. - - if size == 1 { - return sorted_keys.to_vec(); - } - - // First scan the weight. - // We scan the local arrays, then use a global scan operation on the last element - // of each array to get the global sums and then we update the array of each rank - // with the sum from the previous ranks. - - let mut scan: Vec = weights - .iter() - .scan(0, |state, x| { - *state += *x; - Some(*state) - }) - .collect_vec(); - let scan_last = *scan.last().unwrap(); - let mut scan_result: usize = 0; - comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); - for elem in &mut scan { - *elem += scan_result; - } - - let mut total_weight = if rank == size - 1 { - *scan.last().unwrap() - } else { - 0 - }; - - // Scan the weight (form cumulative sums) and broadcast the total weight (last entry on last process) - // to all other processes. 
- - comm.process_at_rank(size - 1) - .broadcast_into(&mut total_weight); - - let w = total_weight / (size as usize); - let k = total_weight % (size as usize); - - let mut hash_map = HashMap::>::new(); - - // Sort the elements into bins according to which process they should be sent. - - for p in 1..=size as usize { - let q = if p <= k as usize { - izip!(sorted_keys, &scan) - .filter_map(|(&key, &s)| { - if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) - || (p == size as usize && (p - 1) * (1 + w) <= s) - { - Some(key) - } else { - None - } - }) - .collect_vec() - } else { - izip!(sorted_keys, &scan) - .filter_map(|(&key, &s)| { - if ((p - 1) * w + k <= s && s < p * w + k) - || (p == size as usize && (p - 1) * w + k <= s) - { - Some(key) - } else { - None - } - }) - .collect_vec() - }; - hash_map.insert(p - 1, q); - } - - // Now distribute the data with an all to all v. - // We create a vector of how many elements to send to each process and - // then send the actual data. - - let mut counts = vec![0 as i32; size as usize]; - let mut counts_from_processor = vec![0 as i32; size as usize]; - - let mut all_elements = Vec::::new(); - for (index, c) in counts.iter_mut().enumerate() { - let elements = hash_map.get(&index).unwrap(); - *c = elements.len() as i32; - all_elements.extend(elements.iter()) - } - - // Send around the number of elements for each process - comm.all_to_all_into(&counts, &mut counts_from_processor); - - // We have the number of elements for each process now. Now send around - // the actual elements. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp as i32) - }) - .collect(); - - let send_partition = Partition::new(&all_elements, &counts[..], &send_displs[..]); - - let mut recvbuffer = - vec![MortonKey::default(); counts_from_processor.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_processor - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); - - recvbuffer.sort_unstable(); - recvbuffer -} - -/// Given a distributed set of keys, generate a complete linear Octree. -pub fn complete_tree( - keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> Vec { - let mut linearized_keys = linearize(keys, rng, comm); - - let size = comm.size(); - let rank = comm.rank(); - - if size == 1 { - return MortonKey::complete_tree(linearized_keys.as_slice()); - } - - // Now insert on the first and last process the first and last child of the - // finest ancestor of first/last box on deepest level - - // Send first element to previous rank and insert into local keys. - // On the first process we also need to insert the first child of the finest - // ancestor of the deepest first key and first element. Correspondingly on the last process - // we need to insert the last child of the finest ancester of the deepest last key and last element. - - if rank == size - 1 { - // On last process send first element to previous processes and insert last - // possible box from region into list. 
- comm.process_at_rank(rank - 1) - .send(linearized_keys.first().unwrap()); - let last_key = *linearized_keys.last().unwrap(); - let deepest_last = MortonKey::deepest_last(); - if !last_key.is_ancestor(deepest_last) { - let ancestor = deepest_last.finest_common_ancestor(last_key); - linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); - } - } else { - let (other, _status) = if rank > 0 { - // On intermediate process receive from the next process - // and send first element to previous process. - p2p::send_receive( - linearized_keys.first().unwrap(), - &comm.process_at_rank(rank - 1), - &comm.process_at_rank(rank + 1), - ) - } else { - // On first process insert at the beginning the first possible - // box in the region and receive the key from next process. - let first_key = *linearized_keys.first().unwrap(); - let deepest_first = MortonKey::deepest_first(); - if !first_key.is_ancestor(deepest_first) { - let ancestor = deepest_first.finest_common_ancestor(first_key); - linearized_keys.insert(0, ancestor.children()[0]); - } - - comm.process_at_rank(1).receive::() - }; - // If we are not at the last process we need to introduce the received key - // into our list. - linearized_keys.push(other); - }; - - // Now complete the regions defined by the keys on each process. - - let mut result = Vec::::new(); - - for (&key1, &key2) in linearized_keys.iter().tuple_windows() { - result.push(key1); - result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); - } - - if rank == size - 1 { - result.push(*linearized_keys.last().unwrap()); - } - - result -} - -/// Check if an array is sorted. -pub fn is_sorted_array(arr: &[MortonKey], comm: &C) -> Option { - let arr = array_to_root(arr, comm); - if comm.rank() == 0 { - let arr = arr.unwrap(); - for (&elem1, &elem2) in arr.iter().tuple_windows() { - if elem1 > elem2 { - return Some(false); - } - } - Some(true) - } else { - None - } -} - -/// Get global size of a distributed array. -pub fn global_size(arr: &[T], comm: &C) -> usize { - let local_size = arr.len(); - let mut global_size = 0; - - comm.all_reduce_into(&local_size, &mut global_size, SystemOperation::sum()); - - global_size -} - -/// Gather array to all processes -pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { - // First we need to broadcast the individual sizes on each process. - - let size = comm.size(); - - let local_len = arr.len() as i32; - - let mut sizes = vec![0 as i32; size as usize]; - - comm.all_gather_into(&local_len, &mut sizes); - - let recv_len = sizes.iter().sum::() as usize; - - // Now we have the size of each local contribution. 
- // let mut recvbuffer = - // vec![T: Default; counts_from_processor.iter().sum::() as usize]; - let mut recvbuffer = Vec::::with_capacity(recv_len); - let buf: &mut [T] = unsafe { std::mem::transmute(recvbuffer.spare_capacity_mut()) }; - - let recv_displs: Vec = sizes - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = PartitionMut::new(buf, sizes, &recv_displs[..]); - - comm.all_gather_varcount_into(arr, &mut receiv_partition); - - unsafe { recvbuffer.set_len(recv_len) }; - - recvbuffer -} diff --git a/src/parsort.rs b/src/parsort.rs index 16518cc..0ffe189 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -397,51 +397,6 @@ impl<'a, T> Iterator for Split<'a, T> { } } -/// Array to root -pub fn array_to_root( - arr: &[T], - comm: &C, -) -> Option> { - let n = arr.len() as i32; - let rank = comm.rank(); - let size = comm.size(); - let root_process = comm.process_at_rank(0); - - // We first communicate the length of the array to root. - - if rank == 0 { - // We are at root. - - let mut ranks = vec![0_i32; size as usize]; - root_process.gather_into_root(&n, &mut ranks); - - // We now have all ranks at root. Can now a varcount gather to get - // the array elements. - - let nelements = ranks.iter().sum::(); - - let mut new_arr = vec![::default(); nelements as usize]; - - let displs: Vec = ranks - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut partition = PartitionMut::new(&mut new_arr[..], ranks, &displs[..]); - - root_process.gather_varcount_into_root(arr, &mut partition); - Some(new_arr) - } else { - root_process.gather_into(&n); - root_process.gather_varcount_into(arr); - None - } -} - macro_rules! impl_min_max_value { ($type:ty) => { impl MinValue for $type { diff --git a/src/serial.rs b/src/serial.rs new file mode 100644 index 0000000..64cdb08 --- /dev/null +++ b/src/serial.rs @@ -0,0 +1,325 @@ +//! Definition of a linear octree + +use crate::{ + constants::{DEEPEST_LEVEL, NLEVELS}, + geometry::PhysicalBox, + morton::MortonKey, +}; +use bytemuck; +use std::collections::HashMap; +use vtkio; + +/// A neighbour +pub struct Neighbour { + /// Direction + pub direction: [i64; 3], + /// Level + pub level: usize, + /// Morton key + pub key: MortonKey, +} + +/// An octree +pub struct Octree { + leaf_keys: Vec, + points: Vec<[f64; 3]>, + point_to_level_keys: [Vec; NLEVELS], + bounding_box: PhysicalBox, + key_counts: HashMap, + max_leaf_level: usize, + max_points_in_leaf: usize, +} + +impl Octree { + /// Create octress from points + pub fn from_points(points: &[f64], max_level: usize, max_points_per_box: usize) -> Self { + // Make sure that the points array is a multiple of 3. + assert_eq!(points.len() % 3, 0); + + // Make sure that max level never exceeds DEEPEST_LEVEL + let max_level = if max_level > DEEPEST_LEVEL as usize { + DEEPEST_LEVEL as usize + } else { + max_level + }; + + // Compute the physical bounding box. + + let bounding_box = PhysicalBox::from_points(points); + + // Bunch the points in arrays of 3. + + let points: &[[f64; 3]] = bytemuck::cast_slice(points); + let npoints = points.len(); + + // We create a vector of keys for each point on each level. We compute the + // keys on the deepest level and fill the other levels by going from + // parent to parent. 
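+        // Sketch of the layout produced below: after this step `point_to_level_keys[l][i]`
+        // holds the level-`l` key of the box containing point `i`. Only the deepest level
+        // is computed directly from the coordinates; every coarser level is obtained by
+        // repeatedly taking `parent()`.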
+ + let mut point_to_level_keys: [Vec; NLEVELS] = Default::default(); + point_to_level_keys[DEEPEST_LEVEL as usize] = points + .iter() + .map(|&point| { + MortonKey::from_physical_point(point, &bounding_box, DEEPEST_LEVEL as usize) + }) + .collect::>(); + + for index in (1..=DEEPEST_LEVEL as usize).rev() { + let mut new_vec = Vec::::with_capacity(npoints); + for &key in &point_to_level_keys[index] { + new_vec.push(key.parent()); + } + point_to_level_keys[index - 1] = new_vec; + } + + // We now have to create level keys. We are starting at the root and recursing + // down until each box has fewer than max_points_per_box keys. + + // First we compute the counts of each key on each level. For that we create + // for each level a Hashmap for the keys and then add up. + + let mut key_counts: HashMap = Default::default(); + + for keys in &point_to_level_keys { + for key in keys { + *key_counts.entry(*key).or_default() += 1; + } + } + + // We can now easily create an adaptive tree by subdividing. We do this by + // a recursive function. + + let mut leaf_keys = Vec::::new(); + + fn recurse_keys( + key: MortonKey, + key_counts: &HashMap, + leaf_keys: &mut Vec, + max_points_per_box: usize, + max_level: usize, + ) { + let level = key.level(); + // A key may have not be associated with points. This happens if one of the children on + // the previous level has no points in its physical box. However, we want to create a + // complete tree. So we still add this one empty child. + if let Some(&count) = key_counts.get(&key) { + if count > max_points_per_box && level < max_level { + for child in key.children() { + recurse_keys(child, key_counts, leaf_keys, max_points_per_box, max_level); + } + } else { + leaf_keys.push(key) + } + } else { + leaf_keys.push(key) + } + } + + // Now execute the recursion starting from root + + recurse_keys( + MortonKey::root(), + &key_counts, + &mut leaf_keys, + max_points_per_box, + max_level, + ); + + // The leaf keys are now a complete linear tree. But they are not yet balanced. + // In the final step we balance the leafs. 
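+        // Balancing is assumed here to enforce the usual 2:1 condition, i.e. that
+        // neighbouring leaf boxes differ by at most one level; the exact guarantee is
+        // whatever `MortonKey::balance` provides.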
+ + let leaf_keys = MortonKey::balance(&leaf_keys, MortonKey::root()); + + let mut max_leaf_level = 0; + let mut max_points_in_leaf = 0; + + for key in &leaf_keys { + max_leaf_level = max_leaf_level.max(key.level()); + max_points_in_leaf = + max_points_in_leaf.max(if let Some(&count) = key_counts.get(key) { + count + } else { + 0 + }); + } + + Self { + leaf_keys, + points: points.to_vec(), + point_to_level_keys, + bounding_box, + key_counts, + max_leaf_level, + max_points_in_leaf, + } + } + + /// Leaf keys + pub fn leaf_keys(&self) -> &Vec { + &self.leaf_keys + } + + /// Points + pub fn points(&self) -> &Vec<[f64; 3]> { + &self.points + } + + /// Get level keys for each point + pub fn point_to_level_keys(&self) -> &[Vec; NLEVELS] { + &self.point_to_level_keys + } + + /// Bounding box + pub fn bounding_box(&self) -> &PhysicalBox { + &self.bounding_box + } + + /// Maximum leaf level + pub fn maximum_leaf_level(&self) -> usize { + self.max_leaf_level + } + + /// Maximum number of points in a leaf box + pub fn max_points_in_leaf_box(&self) -> usize { + self.max_points_in_leaf + } + + /// Number of points in the box indexed by a key + pub fn number_of_points_in_key(&self, key: MortonKey) -> usize { + if let Some(&count) = self.key_counts.get(&key) { + count + } else { + 0 + } + } + + /// Export the tree to vtk + pub fn export_to_vtk(&self, file_path: &str) { + use vtkio::model::{ + Attributes, ByteOrder, CellType, Cells, DataSet, IOBuffer, UnstructuredGridPiece, + Version, VertexNumbers, + }; + + // Each box has 8 corners with 3 coordinates each, hence 24 floats per key. + let mut points = Vec::::new(); + // 8 coords per box, hence 8 * nkeys values in connectivity. + let mut connectivity = Vec::::new(); + // Store the vtk offset for each box. + let mut offsets = Vec::::new(); + + let bounding_box = self.bounding_box(); + + // Go through the keys and add coordinates and connectivity. + // Box coordinates are already in the right order, so connectivity + // just counts up. We don't mind doubly counted vertices from two boxes. + let mut point_count = 0; + let mut key_count = 0; + + for key in self.leaf_keys().iter() { + // We only want to export non-empty boxes. + if self.number_of_points_in_key(*key) == 0 { + continue; + } + let coords = key.physical_box(bounding_box).corners(); + + key_count += 1; + offsets.push(8 * key_count); + + for coord in &coords { + points.push(coord[0]); + points.push(coord[1]); + points.push(coord[2]); + + connectivity.push(point_count); + point_count += 1; + } + } + + let vtk_file = vtkio::Vtk { + version: Version::new((1, 0)), + title: String::new(), + byte_order: ByteOrder::LittleEndian, + file_path: None, + data: DataSet::inline(UnstructuredGridPiece { + points: IOBuffer::F64(points), + cells: Cells { + cell_verts: VertexNumbers::XML { + connectivity, + offsets, + }, + types: vec![CellType::Hexahedron; key_count as usize], + }, + data: Attributes { + point: vec![], + cell: vec![], + }, + }), + }; + + vtk_file.export_ascii(file_path).unwrap(); + } + + // We can now create the vtk object. 
+} + +#[cfg(test)] +mod test { + use super::Octree; + use rand::prelude::*; + + fn get_points_on_sphere(npoints: usize) -> Vec { + let mut rng = rand::rngs::StdRng::seed_from_u64(0); + let normal = rand_distr::Normal::new(0.0, 1.0).unwrap(); + + let mut points = Vec::::with_capacity(3 * npoints); + for _ in 0..(npoints) { + let x: f64 = normal.sample(&mut rng); + let y: f64 = normal.sample(&mut rng); + let z: f64 = normal.sample(&mut rng); + + let norm = (x * x + y * y + z * z).sqrt(); + + points.push(x / norm); + points.push(y / norm); + points.push(z / norm); + } + + points + } + + #[test] + fn test_octree() { + use std::time::Instant; + + let npoints = 10000; + let points = get_points_on_sphere(npoints); + let max_level = 7; + let max_points_per_box = 100; + + let start = Instant::now(); + let octree = Octree::from_points(&points, max_level, max_points_per_box); + let duration = start.elapsed(); + + println!("Creation time: {}", duration.as_millis()); + println!("Number of leaf keys: {}", octree.leaf_keys().len()); + println!("Bounding box: {}", octree.bounding_box()); + } + + #[test] + fn test_export() { + let fname = "_test_sphere.vtk"; + let npoints = 10000; + let points = get_points_on_sphere(npoints); + let max_level = 7; + let max_points_per_box = 100; + + let octree = Octree::from_points(&points, max_level, max_points_per_box); + + octree.export_to_vtk(fname); + println!("Maximum leaf level: {}", octree.maximum_leaf_level()); + println!( + "Maximum number of points in leaf box: {}", + octree.max_points_in_leaf_box() + ); + } +} diff --git a/src/tools.rs b/src/tools.rs new file mode 100644 index 0000000..394273e --- /dev/null +++ b/src/tools.rs @@ -0,0 +1,135 @@ +//! Utility routines. + +use mpi::{ + collective::SystemOperation, + datatype::PartitionMut, + traits::{CommunicatorCollectives, Equivalence, Root}, +}; + +/// Gather array to all processes +pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { + // First we need to broadcast the individual sizes on each process. + + let size = comm.size(); + + let local_len = arr.len() as i32; + + let mut sizes = vec![0 as i32; size as usize]; + + comm.all_gather_into(&local_len, &mut sizes); + + let recv_len = sizes.iter().sum::() as usize; + + // Now we have the size of each local contribution. + // let mut recvbuffer = + // vec![T: Default; counts_from_processor.iter().sum::() as usize]; + let mut recvbuffer = Vec::::with_capacity(recv_len); + let buf: &mut [T] = unsafe { std::mem::transmute(recvbuffer.spare_capacity_mut()) }; + + let recv_displs: Vec = sizes + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect(); + + let mut receiv_partition = PartitionMut::new(buf, sizes, &recv_displs[..]); + + comm.all_gather_varcount_into(arr, &mut receiv_partition); + + unsafe { recvbuffer.set_len(recv_len) }; + + recvbuffer +} +/// Array to root + +/// Gather distributed array to the root rank. +/// +/// The result is a `Vec` on root and `None` on all other ranks. +pub fn gather_to_root( + arr: &[T], + comm: &C, +) -> Option> { + let n = arr.len() as i32; + let rank = comm.rank(); + let size = comm.size(); + let root_process = comm.process_at_rank(0); + + // We first communicate the length of the array to root. + + if rank == 0 { + // We are at root. + + let mut counts = vec![0_i32; size as usize]; + root_process.gather_into_root(&n, &mut counts); + + // We now have all ranks at root. Can now a varcount gather to get + // the array elements. 
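+        // Note on the pattern below: the receive buffer is allocated uninitialised via
+        // `spare_capacity_mut` and its length is only set after the varcount gather has
+        // filled every slot (the received counts sum to `nelements`), so no element is
+        // read before it has been written.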
+ + let nelements = counts.iter().sum::(); + let mut new_arr = Vec::::with_capacity(nelements as usize); + let new_arr_buf: &mut [T] = unsafe { std::mem::transmute(new_arr.spare_capacity_mut()) }; + + let displs = displacements(counts.as_slice()); + + let mut partition = PartitionMut::new(new_arr_buf, counts, &displs[..]); + + root_process.gather_varcount_into_root(arr, &mut partition); + + unsafe { new_arr.set_len(nelements as usize) }; + Some(new_arr) + } else { + root_process.gather_into(&n); + root_process.gather_varcount_into(arr); + None + } +} + +/// Get global size of a distributed array. +/// +/// Computes the size and broadcoasts it to all ranks. +pub fn global_size(arr: &[T], comm: &C) -> usize { + let local_size = arr.len(); + let mut global_size = 0; + + comm.all_reduce_into(&local_size, &mut global_size, SystemOperation::sum()); + + global_size +} + +/// Check if an array is sorted. +pub fn is_sorted_array( + arr: &[MortonKey], + comm: &C, +) -> Option { + let arr = gather_to_root(arr, comm); + if comm.rank() == 0 { + let arr = arr.unwrap(); + for (&elem1, &elem2) in arr.iter().tuple_windows() { + if elem1 > elem2 { + return Some(false); + } + } + Some(true) + } else { + None + } +} + +/// Compute displacements from a vector of counts. +/// +/// This is useful for global MPI varcount operations. Let +/// count [ 3, 4, 5]. Then the corresponding displacements are +// [0, 3, 7]. Note that the last element `5` is ignored. +pub fn displacements(counts: &[i32]) -> Vec { + counts + .iter() + .scan(0, |acc, &x| { + let tmp = *acc; + *acc += x; + Some(tmp) + }) + .collect() +} From 301156c7481cb8256848782d95299e7c73ae39df Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Tue, 17 Sep 2024 23:19:53 +0100 Subject: [PATCH 16/42] WIP: Refactor --- src/parsort.rs | 32 ++++++------------------- src/tools.rs | 65 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 39 deletions(-) diff --git a/src/parsort.rs b/src/parsort.rs index 0ffe189..93c9868 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -5,13 +5,15 @@ use std::mem::offset_of; use itertools::Itertools; use mpi::datatype::{UncommittedDatatypeRef, UncommittedUserDatatype, UserDatatype}; -use mpi::traits::{Equivalence, Root}; +use mpi::traits::Equivalence; use mpi::{ datatype::{Partition, PartitionMut}, traits::CommunicatorCollectives, }; use rand::{seq::SliceRandom, Rng}; +use crate::tools::displacements; + const OVERSAMPLING: usize = 8; /// Sortable trait that each type fed into parsort needs to satisfy. @@ -177,14 +179,7 @@ where let mut all_splitters = vec![Default::default(); n_all_splitters]; let splitters_per_rank = splitters_per_rank.iter().map(|&x| x as i32).collect_vec(); - let displs: Vec = splitters_per_rank - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); + let displs = displacements(&splitters_per_rank); let mut partition = PartitionMut::new(&mut all_splitters[..], splitters_per_rank, &displs[..]); comm.all_gather_varcount_into(&splitters, &mut partition); @@ -331,28 +326,15 @@ pub fn parsort // Each processor now knows how much he gets from all the others. // We can now send around the actual elements with an alltoallv. 
- let send_displs: Vec = counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); + + let send_displs = displacements(&counts); let send_partition = Partition::new(&arr, counts, &send_displs[..]); let mut recvbuffer = vec![UniqueItem::default(); counts_from_processor.iter().sum::() as usize]; - let recv_displs: Vec = counts_from_processor - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); + let recv_displs = displacements(&counts_from_processor); let mut receiv_partition = PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); diff --git a/src/tools.rs b/src/tools.rs index 394273e..0811629 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -1,9 +1,13 @@ //! Utility routines. +use std::mem::MaybeUninit; + +use itertools::Itertools; use mpi::{ collective::SystemOperation, datatype::PartitionMut, - traits::{CommunicatorCollectives, Equivalence, Root}, + point_to_point as p2p, + traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source}, }; /// Gather array to all processes @@ -99,23 +103,56 @@ pub fn global_size(arr: &[T], comm: &C) -> usize global_size } +/// Communicate the first element of each local array back to the previous rank. +pub fn communicate_back( + arr: &[T], + comm: &C, +) -> Option { + let rank = comm.rank(); + let size = comm.size(); + + if rank == size - 1 { + comm.process_at_rank(rank - 1).send(arr.first().unwrap()); + return None; + } else { + let (new_last, _status) = if rank > 0 { + p2p::send_receive( + arr.first().unwrap(), + &comm.process_at_rank(rank - 1), + &comm.process_at_rank(rank + 1), + ) + } else { + comm.process_at_rank(1).receive::() + }; + Some(new_last) + } +} + /// Check if an array is sorted. -pub fn is_sorted_array( - arr: &[MortonKey], +pub fn is_sorted_array( + arr: &[T], comm: &C, -) -> Option { - let arr = gather_to_root(arr, comm); - if comm.rank() == 0 { - let arr = arr.unwrap(); - for (&elem1, &elem2) in arr.iter().tuple_windows() { - if elem1 > elem2 { - return Some(false); - } +) -> bool { + let mut sorted = true; + for (elem1, elem2) in arr.iter().tuple_windows() { + if elem1 > elem2 { + sorted = false; } - Some(true) - } else { - None } + + if let Some(next_first) = communicate_back(arr, comm) { + sorted = *arr.last().unwrap() <= next_first; + } + + let mut global_sorted: bool = false; + comm.all_reduce_into(&sorted, &mut global_sorted, SystemOperation::logical_and()); + + global_sorted +} + +/// Redistribute an array via an all_to_all_varcount operation. +pub fn redistribute(arr: &[T], counts: &[i32]) { + todo!(); } /// Compute displacements from a vector of counts. From 8f0a5e88d9212af1afcea0dda91d8f14f6e13ea9 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 20 Sep 2024 18:39:05 +0100 Subject: [PATCH 17/42] WIP: Refactor --- examples/parsort.rs | 19 ++++--------------- src/parsort.rs | 24 +++--------------------- src/tools.rs | 44 ++++++++++++++++++++++++++++++++++++++------ 3 files changed, 45 insertions(+), 42 deletions(-) diff --git a/examples/parsort.rs b/examples/parsort.rs index 6361f6c..8a20706 100644 --- a/examples/parsort.rs +++ b/examples/parsort.rs @@ -1,13 +1,11 @@ //! Testing the hyksort component. 
-use bempp_octree::parsort::{array_to_root, parsort}; -use itertools::Itertools; +use bempp_octree::{parsort::parsort, tools::is_sorted_array}; use mpi::traits::Communicator; use rand::prelude::*; pub fn main() { let universe = mpi::initialize().unwrap(); let world = universe.world(); - let rank = world.rank() as u64; let n_per_rank = 1000; let mut rng = rand::rngs::StdRng::seed_from_u64(0); @@ -18,20 +16,11 @@ pub fn main() { arr.push(rng.gen()); } - // let splitters = get_splitters(&arr, &world, &mut rng); - - // let bin_displs = get_bin_displacements(&arr, &splitters); - let arr = parsort(&arr, &world, &mut rng); - let arr = array_to_root(&arr, &world); - if rank == 0 { - let arr = arr.unwrap(); + assert!(is_sorted_array(&arr, &world)); - for (elem1, elem2) in arr.iter().tuple_windows() { - assert!(elem1 <= elem2); - } - println!("Sorted {} elements.", arr.len()); - println!("Finished."); + if world.rank() == 0 { + println!("Array is sorted."); } } diff --git a/src/parsort.rs b/src/parsort.rs index 93c9868..073d592 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -12,7 +12,7 @@ use mpi::{ }; use rand::{seq::SliceRandom, Rng}; -use crate::tools::displacements; +use crate::tools::{displacements, gather_to_all}; const OVERSAMPLING: usize = 8; @@ -162,27 +162,9 @@ where .copied() .collect::>(); - // We use an all_gatherv so that each process receives all splitters. - // For that we first communicate how many splitters each process has - // and then we send the splitters themselves. + // We gather the splitters into all ranks so that each rank has all splitters. - let nsplitters = splitters.len(); - let mut splitters_per_rank = vec![0_usize; size]; - - comm.all_gather_into(&nsplitters, &mut splitters_per_rank); - - // We now know how many splitters each process has. We now create space - // for the splitters and send them all around. - - let n_all_splitters = splitters_per_rank.iter().sum(); - - let mut all_splitters = vec![Default::default(); n_all_splitters]; - let splitters_per_rank = splitters_per_rank.iter().map(|&x| x as i32).collect_vec(); - - let displs = displacements(&splitters_per_rank); - - let mut partition = PartitionMut::new(&mut all_splitters[..], splitters_per_rank, &displs[..]); - comm.all_gather_varcount_into(&splitters, &mut partition); + let mut all_splitters = gather_to_all(&splitters, comm); // We now have all splitters available on each process. // We can now sort the splitters. Every process will then have the same list of sorted splitters. diff --git a/src/tools.rs b/src/tools.rs index 0811629..baa41a3 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -1,13 +1,14 @@ //! Utility routines. -use std::mem::MaybeUninit; - use itertools::Itertools; use mpi::{ collective::SystemOperation, - datatype::PartitionMut, + datatype::{Partition, PartitionMut}, point_to_point as p2p, - traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source}, + traits::{ + CommunicatorCollectives, Destination, Equivalence, PartitionedBuffer, PartitionedBufferMut, + Root, Source, + }, }; /// Gather array to all processes @@ -140,6 +141,10 @@ pub fn is_sorted_array( } } + if comm.size() == 1 { + return sorted; + } + if let Some(next_first) = communicate_back(arr, comm) { sorted = *arr.last().unwrap() <= next_first; } @@ -151,8 +156,35 @@ pub fn is_sorted_array( } /// Redistribute an array via an all_to_all_varcount operation. 
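+/// A sketch of the intended semantics, based on the implementation that follows:
+/// `counts` has one entry per rank, `counts[i]` is the number of local elements to be
+/// sent to rank `i` (so the counts are expected to sum to `arr.len()`), and the
+/// returned vector holds the elements received from all ranks, ordered by sending rank.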
-pub fn redistribute(arr: &[T], counts: &[i32]) { - todo!(); +pub fn redistribute( + arr: &[T], + counts: &[i32], + comm: &C, +) -> Vec { + assert_eq!(counts.len(), comm.size() as usize); + + // First send the counts around via an alltoall operation. + + let mut recv_counts = vec![0 as i32; counts.len()]; + + comm.all_to_all_into(&counts[..], &mut recv_counts); + + // We have the recv_counts. Allocate space and setup the partitions. + + let nelems = recv_counts.iter().sum::() as usize; + + let mut output = Vec::::with_capacity(nelems); + let out_buf: &mut [T] = unsafe { std::mem::transmute(output.spare_capacity_mut()) }; + + let send_partition = Partition::new(arr, counts, displacements(counts)); + let mut recv_partition = + PartitionMut::new(out_buf, &recv_counts[..], displacements(&recv_counts)); + + comm.all_to_all_varcount_into(&send_partition, &mut recv_partition); + + unsafe { output.set_len(nelems) }; + + output } /// Compute displacements from a vector of counts. From 44372e3ebdc23a215125daea90ed77b02507b9dc Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 21 Sep 2024 17:10:25 +0100 Subject: [PATCH 18/42] WIP: Parallel tests --- examples/mpi_global_bounding_box.rs | 18 ++-- src/octree.rs | 144 +++++----------------------- src/parsort.rs | 32 +------ 3 files changed, 38 insertions(+), 156 deletions(-) diff --git a/examples/mpi_global_bounding_box.rs b/examples/mpi_global_bounding_box.rs index 28748b2..330a168 100644 --- a/examples/mpi_global_bounding_box.rs +++ b/examples/mpi_global_bounding_box.rs @@ -1,7 +1,8 @@ //! Test the computation of a global bounding box across MPI ranks. -use bempp_octree::octree::compute_global_bounding_box; -use mpi::traits::*; +use bempp_octree::{ + geometry::PhysicalBox, octree::compute_global_bounding_box, tools::gather_to_root, +}; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -15,10 +16,6 @@ pub fn main() { // Initialise a seeded Rng. let mut rng = ChaCha8Rng::seed_from_u64(2); - // Get the rank and size - let rank = comm.rank(); - let size = comm.size(); - // Create `npoints` per rank. let npoints = 10; @@ -33,4 +30,13 @@ pub fn main() { // Compute the distributed bounding box. let bounding_box = compute_global_bounding_box(&points, &comm); + + // Copy all points to root and compare local bounding box there. + + if let Some(points_root) = gather_to_root(&points, &comm) { + // Compute the bounding box on root. + + let expected = PhysicalBox::from_points(&points_root); + assert_eq!(expected.coordinates(), bounding_box.coordinates()); + } } diff --git a/src/octree.rs b/src/octree.rs index a3caa64..fe21431 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -7,11 +7,10 @@ use crate::{ geometry::PhysicalBox, morton::MortonKey, parsort::parsort, - tools::gather_to_all, + tools::{communicate_back, gather_to_all, redistribute}, }; use mpi::{ - datatype::{Partition, PartitionMut}, point_to_point as p2p, traits::{Root, Source}, }; @@ -133,19 +132,6 @@ pub fn points_to_morton( .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) .collect_vec(); - // Now want to get weighted Morton keys. We use a HashMap. - - let mut value_counts = HashMap::::new(); - - for key in &keys { - *value_counts.entry(*key).or_insert(0) += 1; - } - - // let weighted_keys = value_counts - // .iter() - // .map(|(&key, &weight)| WeightedMortonKey::new(key, weight)) - // .collect_vec(); - (keys, bounding_box) } @@ -295,15 +281,9 @@ pub fn redistribute_with_respect_to_coarse_tree( // defines bins in which we sort our keys. 
The keys are then sent around to the correct // processes via an alltoallv operation. - let my_first = *coarse_tree.first().unwrap(); - - let mut global_bins = Vec::::with_capacity(size as usize); - let global_bins_buff: &mut [MortonKey] = - unsafe { std::mem::transmute(global_bins.spare_capacity_mut()) }; - - comm.all_gather_into(&my_first, global_bins_buff); + let my_first = coarse_tree.first().unwrap(); - unsafe { global_bins.set_len(size as usize) }; + let mut global_bins = gather_to_all(std::slice::from_ref(my_first), comm); // We now have the first index from each process. We also want // an upper bound for the last index of the tree to make the sorting into @@ -321,41 +301,9 @@ pub fn redistribute_with_respect_to_coarse_tree( .map(|&elem| elem as i32) .collect_vec(); - // We now have the counts for each rank. Let's send it around via alltoallv. - - let mut counts_from_proc = vec![0 as i32; size as usize]; - - comm.all_to_all_into(&rank_counts, &mut counts_from_proc); - // Now compute the send and receive displacements. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = rank_counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp as i32) - }) - .collect(); - - let send_partition = Partition::new(&sorted_keys[..], &rank_counts[..], &send_displs[..]); + // We now have the counts for each rank. Let's redistribute accordingly and return. - let mut recvbuffer = vec![MortonKey::default(); counts_from_proc.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_proc - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_proc, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); - - recvbuffer + redistribute(&sorted_keys, &rank_counts, comm) } /// Create bins from sorted keys. @@ -471,43 +419,28 @@ pub fn linearize( let mut result = Vec::::new(); - if rank == size - 1 { - comm.process_at_rank(rank - 1) - .send(sorted_keys.first().unwrap()); + let next_key = communicate_back(&sorted_keys, comm); - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } + // Treat the local keys + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); } + } + // If we are at the last process simply push the last key. + // Otherwise check whether it might be the ancestor of `next_key`, + // the first key on the next process. If yes, don't push it. Otherwise do. + + if rank == size - 1 { result.push(*sorted_keys.last().unwrap()); } else { - let (other, _status) = if rank > 0 { - p2p::send_receive( - sorted_keys.first().unwrap(), - &comm.process_at_rank(rank - 1), - &comm.process_at_rank(rank + 1), - ) - } else { - comm.any_process().receive::() - }; - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } - } - let last = *sorted_keys.last().unwrap(); - - if !last.is_ancestor(other) { - result.push(last) + if !last.is_ancestor(next_key.unwrap()) { + result.push(last); } } @@ -603,7 +536,6 @@ pub fn partition( // then send the actual data. 
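+    // Illustration of the binning computed above (hypothetical numbers): with
+    // total_weight = 10 on 3 ranks we get w = 3 and k = 1, so the scanned weights are
+    // assigned as [0, 4) to rank 0, [4, 7) to rank 1 and [7, 10] to rank 2; the hash
+    // map stores, for each destination rank, the keys whose scanned weight falls into
+    // that rank's range.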
let mut counts = vec![0 as i32; size as usize]; - let mut counts_from_processor = vec![0 as i32; size as usize]; let mut all_elements = Vec::::new(); for (index, c) in counts.iter_mut().enumerate() { @@ -612,39 +544,7 @@ pub fn partition( all_elements.extend(elements.iter()) } - // Send around the number of elements for each process - comm.all_to_all_into(&counts, &mut counts_from_processor); - - // We have the number of elements for each process now. Now send around - // the actual elements. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp as i32) - }) - .collect(); - - let send_partition = Partition::new(&all_elements, &counts[..], &send_displs[..]); - - let mut recvbuffer = - vec![MortonKey::default(); counts_from_processor.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_processor - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + let mut recvbuffer = redistribute(&all_elements, &counts, comm); recvbuffer.sort_unstable(); recvbuffer diff --git a/src/parsort.rs b/src/parsort.rs index 073d592..49d48ca 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -5,14 +5,11 @@ use std::mem::offset_of; use itertools::Itertools; use mpi::datatype::{UncommittedDatatypeRef, UncommittedUserDatatype, UserDatatype}; +use mpi::traits::CommunicatorCollectives; use mpi::traits::Equivalence; -use mpi::{ - datatype::{Partition, PartitionMut}, - traits::CommunicatorCollectives, -}; use rand::{seq::SliceRandom, Rng}; -use crate::tools::{displacements, gather_to_all}; +use crate::tools::{gather_to_all, redistribute}; const OVERSAMPLING: usize = 8; @@ -297,30 +294,9 @@ pub fn parsort .map(|&elem| elem as i32) .collect::>(); - // We now do an all_to_allv to communicate the array elements to the right processors. + // We can now redistribute the array across the processors. - // First we need to communicate how many elements everybody gets from each processor. - - let mut counts_from_processor = vec![0_i32; size]; - - comm.all_to_all_into(&counts, &mut counts_from_processor); - - // Each processor now knows how much he gets from all the others. - - // We can now send around the actual elements with an alltoallv. - - let send_displs = displacements(&counts); - - let send_partition = Partition::new(&arr, counts, &send_displs[..]); - - let mut recvbuffer = - vec![UniqueItem::default(); counts_from_processor.iter().sum::() as usize]; - - let recv_displs = displacements(&counts_from_processor); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + let mut recvbuffer = redistribute(&arr, &counts, comm); // We now have everything in the receive buffer. 
Now sort the local elements and return From fc8e63d343995ee856d966bd44a9167e7a218031 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 21 Sep 2024 19:23:30 +0100 Subject: [PATCH 19/42] WIP: distribute_complete_tree_test --- examples/mpi_complete_tree.rs | 38 +++++++++++++++++++++ examples/parallel_tests.rs | 9 ++--- src/morton.rs | 62 +++++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 examples/mpi_complete_tree.rs diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs new file mode 100644 index 0000000..e3fea61 --- /dev/null +++ b/examples/mpi_complete_tree.rs @@ -0,0 +1,38 @@ +//! Test the computation of a global bounding box across MPI ranks. + +use bempp_octree::{ + constants::DEEPEST_LEVEL, + geometry::PhysicalBox, + octree::{complete_tree, compute_global_bounding_box, points_to_morton}, + tools::gather_to_root, +}; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. + let mut rng = ChaCha8Rng::seed_from_u64(2); + + // Create `npoints` per rank. + let npoints = 3; + + // Generate random points. + + let mut points = Vec::::with_capacity(3 * npoints); + + for _ in 0..3 * npoints { + points.push(rng.gen()); + } + + // Compute the Morton keys on the deepest level + let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); + + // Generate a complete tree + let distributed_complete_tree = complete_tree(&keys, &mut rng, &comm); +} diff --git a/examples/parallel_tests.rs b/examples/parallel_tests.rs index e3555e9..9d841ea 100644 --- a/examples/parallel_tests.rs +++ b/examples/parallel_tests.rs @@ -1,15 +1,16 @@ //! Testing the hyksort component. use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; use bempp_octree::morton::MortonKey; -use bempp_octree::octree::{block_partition, is_sorted_array, linearize, partition}; -use bempp_octree::parsort::{array_to_root, parsort}; +use bempp_octree::octree::{block_partition, linearize, partition}; +use bempp_octree::parsort::parsort; +use bempp_octree::tools::gather_to_root; use itertools::{izip, Itertools}; use mpi::traits::*; use rand::prelude::*; pub fn assert_linearized(arr: &Vec, comm: &C) { // Check that the keys are still linearized. - let arr = array_to_root(&arr, comm); + let arr = gather_to_root(&arr, comm); if comm.rank() == 0 { let arr = arr.unwrap(); @@ -130,7 +131,7 @@ pub fn test_coarse_partition(rng: &mut R, co partitioned_tree.0.len() ); - let arr = array_to_root(&partitioned_tree.0, comm); + let arr = gather_to_root(&partitioned_tree.0, comm); if rank == 0 { let arr = arr.unwrap(); diff --git a/src/morton.rs b/src/morton.rs index 8e66bfb..eb9d49e 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -466,6 +466,33 @@ impl MortonKey { key } + /// Return the next possible Morton key on the deepest level that is not a descendent of the current key. + /// + /// If the key is already the last possible key then return None. + pub fn next_non_descendent_key(&self) -> Option { + // If we are an ancestor of deepest_last we return None as then there + // is next key. + + if self.is_ancestor(MortonKey::deepest_last()) { + return None; + } + + let level = self.level() as u64; + + let level_diff = DEEPEST_LEVEL - level; + let shift = LEVEL_DISPLACEMENT + 3 * level_diff; + + // Need to know which sibling we are. 
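+        // Each level of a key occupies three bits above LEVEL_DISPLACEMENT, so shifting
+        // by `shift` and reducing modulo 8 extracts the octant index of this key within
+        // its parent.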
+ let child_index = ((self.value >> shift) % 8) as usize; + // If we are between 0 and 6 take the next sibling and go to deepest level. + if child_index < 7 { + Some(MortonKey::new(self.value + (1 << shift) + level_diff)) + } else { + // If we are the last child go to the parent and take next key from there. + self.parent().next_non_descendent_key() + } + } + /// Linearize by sorting and removing overlaps. pub fn linearize(keys: &[MortonKey]) -> Vec { let mut new_keys = Vec::::new(); @@ -1315,4 +1342,39 @@ mod test { ) ); } + + #[test] + pub fn test_next_nondescendent_key() { + let key = MortonKey::from_index_and_level([25, 17, 6], 5); + + let children = key.children(); + + // Check the next nondescendent key for the first six children + + for (child, next_child) in children.iter().tuple_windows() { + let next_key = child.next_non_descendent_key().unwrap(); + assert_eq!(next_key.level(), DEEPEST_LEVEL as usize); + assert!(!child.is_ancestor(next_key)); + assert!(next_child.is_ancestor(next_key)); + } + + // Now check the next nondescendent key from the last child. + + let next_child = children.last().unwrap().next_non_descendent_key(); + + // Check that the next nondescendent key from the parent is the same as that of the last child. + + assert_eq!(key.next_non_descendent_key(), next_child); + + // Check that it is not a descendent of the parent and that its level is correct. + + assert_eq!(next_child.unwrap().level(), DEEPEST_LEVEL as usize); + assert!(!key.is_ancestor(next_child.unwrap())); + + // Finally make sure that an ancestor of deepest last returns None. + + assert!(MortonKey::deepest_last() + .next_non_descendent_key() + .is_none()); + } } From 25c0e84dc6ef1742b6b03ff0146344e0f56aacbe Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sat, 21 Sep 2024 20:49:45 +0100 Subject: [PATCH 20/42] WIP: Test complete tree --- examples/mpi_complete_tree.rs | 19 ++++++++-- src/octree.rs | 70 +++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index e3fea61..d3bd214 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -3,9 +3,12 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, geometry::PhysicalBox, - octree::{complete_tree, compute_global_bounding_box, points_to_morton}, + octree::{ + complete_tree, compute_global_bounding_box, is_complete_linear_tree, points_to_morton, + }, tools::gather_to_root, }; +use mpi::traits::*; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -17,10 +20,10 @@ pub fn main() { let comm = universe.world(); // Initialise a seeded Rng. - let mut rng = ChaCha8Rng::seed_from_u64(2); + let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); // Create `npoints` per rank. - let npoints = 3; + let npoints = 10; // Generate random points. 
@@ -33,6 +36,16 @@ pub fn main() { // Compute the Morton keys on the deepest level let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); + assert!(!is_complete_linear_tree(&keys, &comm)); + // Generate a complete tree let distributed_complete_tree = complete_tree(&keys, &mut rng, &comm); + + let is_complete_linear = is_complete_linear_tree(&distributed_complete_tree, &comm); + + assert!(is_complete_linear); + + if comm.rank() == 0 { + println!("Distributed tree is complete and linear."); + } } diff --git a/src/octree.rs b/src/octree.rs index fe21431..aaa01a0 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -625,3 +625,73 @@ pub fn complete_tree( result } + +/// Return true on all ranks if distributed tree is complete. Otherwise, return false. +pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { + // First check that the local tree on each node is complete. + + let mut complete_linear = true; + for (key1, key2) in arr.iter().tuple_windows() { + // Make sure that the keys are sorted and not duplicated. + if key1 >= key2 { + complete_linear = false; + break; + } + // The next key should be an ancestor of the next non-descendent key. + if let Some(expected_next) = key1.next_non_descendent_key() { + if !key2.is_ancestor(expected_next) { + complete_linear = false; + break; + } + } else { + // Only for the very last key there should not be a next non-descendent key. + complete_linear = false; + } + } + + // We now check the interfaces. + + if let Some(next_first) = communicate_back(arr, comm) { + // We are on any but the last rank + let last_key = arr.last().unwrap(); + + // Check that the keys are sorted and not duplicated. + if *last_key >= next_first { + complete_linear = false; + } + + // Check that the next key is an encestor of the next non-descendent. + if let Some(expected_next) = last_key.next_non_descendent_key() { + if !next_first.is_ancestor(expected_next) { + complete_linear = false; + } + } else { + complete_linear = false; + } + } else { + // We are on the last rank + // Check that the last key is ancestor of deepest last. + if !arr.last().unwrap().is_ancestor(MortonKey::deepest_last()) { + complete_linear = false; + } + } + + // Now check that at the first rank we include the deepest first. + + if comm.rank() == 0 { + if !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { + complete_linear = false; + } + } + + // Now communicate everything together. + + let mut result = false; + comm.all_reduce_into( + &complete_linear, + &mut result, + SystemOperation::logical_and(), + ); + + result +} From ccb8be5b73f3103938ea7e7a5ecf244cb6399a8b Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 11:30:22 +0100 Subject: [PATCH 21/42] WIP: Better binning --- Cargo.toml | 1 + examples/mpi_cumsum.rs | 66 ++++++++++++++++++++ src/octree.rs | 21 ++----- src/tools.rs | 135 ++++++++++++++++++++++++++++++++++++++++- 4 files changed, 205 insertions(+), 18 deletions(-) create mode 100644 examples/mpi_cumsum.rs diff --git a/Cargo.toml b/Cargo.toml index 3c3fc1a..bff73c7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ crate-type = ["cdylib", "lib"] itertools = "0.13.*" rand = { version = "0.8.5", features = ["alloc"] } rand_chacha = "0.3.*" +num = "0.4.*" bytemuck = "1.*" vtkio = "0.6.*" mpi = {version = "0.8.*", features = ["derive", "user-operations"] } diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs new file mode 100644 index 0000000..0c59245 --- /dev/null +++ b/examples/mpi_cumsum.rs @@ -0,0 +1,66 @@ +//! 
Test the computation of a global bounding box across MPI ranks. + +use bempp_octree::{ + geometry::PhysicalBox, + octree::compute_global_bounding_box, + tools::{gather_to_root, global_inclusive_cumsum}, +}; +use itertools::{izip, Itertools}; +use mpi::traits::*; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. + let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); + + // Create `npoints` per rank. + let nelems = 10; + + // Generate random numbers + + let mut elems = Vec::::with_capacity(3 * nelems); + + for _ in 0..nelems { + elems.push(rng.gen_range(0..100)); + } + + // Compute the cumulative sum. + + let global_cum_sum = global_inclusive_cumsum(&elems, &comm); + + // Copy array to root and compare with inclusive scan there. + + if let (Some(cum_sum_root), Some(original_array)) = ( + gather_to_root(&global_cum_sum, &comm), + gather_to_root(&elems, &comm), + ) { + // Scan on root + + let expected_cum_sum = original_array + .iter() + .scan(0, |state, x| { + *state = *x + *state; + Some(*state) + }) + .collect_vec(); + + // Check that the first element is not modified (inclusive cumsum) + assert_eq!( + original_array.first().unwrap(), + cum_sum_root.first().unwrap() + ); + + for (actual, expected) in izip!(cum_sum_root.iter(), expected_cum_sum.iter()) { + assert_eq!(*actual, *expected); + } + + println!("Cumulative sum computed."); + } +} diff --git a/src/octree.rs b/src/octree.rs index aaa01a0..0371f3f 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -7,7 +7,7 @@ use crate::{ geometry::PhysicalBox, morton::MortonKey, parsort::parsort, - tools::{communicate_back, gather_to_all, redistribute}, + tools::{communicate_back, gather_to_all, global_inclusive_cumsum, redistribute}, }; use mpi::{ @@ -469,19 +469,9 @@ pub fn partition( // of each array to get the global sums and then we update the array of each rank // with the sum from the previous ranks. - let mut scan: Vec = weights - .iter() - .scan(0, |state, x| { - *state += *x; - Some(*state) - }) - .collect_vec(); - let scan_last = *scan.last().unwrap(); - let mut scan_result: usize = 0; - comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); - for elem in &mut scan { - *elem += scan_result; - } + let scan = global_inclusive_cumsum(&weights, comm); + + // Now broadcast the total weight to all processes. let mut total_weight = if rank == size - 1 { *scan.last().unwrap() @@ -489,9 +479,6 @@ pub fn partition( 0 }; - // Scan the weight (form cumulative sums) and broadcast the total weight (last entry on last process) - // to all other processes. - comm.process_at_rank(size - 1) .broadcast_into(&mut total_weight); diff --git a/src/tools.rs b/src/tools.rs index baa41a3..46c9514 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -1,6 +1,6 @@ //! Utility routines. -use itertools::Itertools; +use itertools::{izip, Itertools}; use mpi::{ collective::SystemOperation, datatype::{Partition, PartitionMut}, @@ -10,6 +10,7 @@ use mpi::{ Root, Source, }, }; +use num::traits::Zero; /// Gather array to all processes pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { @@ -187,6 +188,119 @@ pub fn redistribute( output } +/// Perform a global inclusive cumulative sum operation. +/// +/// For the array `[1, 3, 5, 7]` the output will be `[1, 4, 9, 16]`. 
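+/// The scan is global: each rank's local cumulative sums are offset by the totals of
+/// all lower ranks, so concatenating the per-rank results in rank order yields the
+/// inclusive cumulative sum of the whole distributed array.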
+pub fn global_inclusive_cumsum( + arr: &[T], + comm: &C, +) -> Vec { + let mut scan: Vec = arr + .iter() + .scan(::zero(), |state, x| { + *state = *x + *state; + Some(*state) + }) + .collect_vec(); + let scan_last = *scan.last().unwrap(); + let mut scan_result = T::zero(); + comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum()); + for elem in &mut scan { + *elem = *elem + scan_result; + } + + scan +} + +/// Distribute a sorted sequence into bins. +/// +/// For an array with n elements to be distributed into p bins, +/// the array `bins` has p elements. The bins are defined by half-open intervals +/// of the form [b_j, b_{j+1})). The final bin is the half-open interval [b_{p-1}, \infty). +/// It is assumed that the bins and the elements are both sorted sequences and that +/// every element has an associated bin. +/// The function returns a p element array with the counts of how many elements go to each bin. +/// Since the sequence is sorted this fully defines what element goes into which bin. +pub fn sort_to_bins(sorted_keys: &[T], bins: &[T]) -> Vec { + let nbins = bins.len(); + + // Make sure that the smallest element of the sorted keys fits into the bins. + assert!(bins.first().unwrap() <= sorted_keys.first().unwrap()); + + // Deal with the special case that there is only one bin. + // This means that all elements are in the one bin. + if nbins == 1 { + return vec![sorted_keys.len(); 1]; + } + + let mut bin_counts = vec![0 as usize; nbins]; + + // This iterates over each possible bin and returns also the associated rank. + // The last bin position is not iterated over since for an array with p elements + // there are p-1 tuple windows. + let mut bin_iter = izip!( + bin_counts.iter_mut(), + bins.iter().tuple_windows::<(&T, &T)>(), + ); + + // We take the first element of the bin iterator. There will always be at least one since + // there are at least two bins (an actual one, and the last half infinite one) + let mut r: &mut usize; + let mut bin_start: &T; + let mut bin_end: &T; + (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); + + let mut count = 0; + 'outer: for key in sorted_keys.iter() { + if bin_start <= key && key < bin_end { + *r += 1; + count += 1; + } else { + // Move the bin forward until it fits. There will always be a fitting bin. + loop { + if let Some((rn, (bsn, ben))) = bin_iter.next() { + if bsn <= key && key < ben { + // We have found the next fitting bin for our current element. + // Can register it and go back to the outer for loop. + *rn += 1; + r = rn; + bin_start = bsn; + bin_end = ben; + count += 1; + break; + } + } else { + // We have no more fitting bin. So break the outer loop. + break 'outer; + } + } + } + } + + // We now have everything but the last bin. Just bunch the remaining elements to + // the last count. + *bin_counts.last_mut().unwrap() = sorted_keys.len() - count; + + bin_counts +} + +/// Redistribute locally sorted keys with respect to bins. +/// +/// - The array `sorted_keys` is assumed to be sorted within each process. It needs not be globally sorted. +/// - If there are `r` ranks in the communicator, the size of `bins` must be `r`. +/// - The bins are defined through half-open intervals (bin[0], bin[1]), .... This defines r-1 bins. The +/// last bin is the half-open interval [bin[r-1], \infty). +/// - All array elements must be larger or equal bin[0]. This means that each element can be sorted into a bin. 
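+/// As an illustration, with three ranks and bins `[b0, b1, b2]`, keys in `[b0, b1)`
+/// end up on rank 0, keys in `[b1, b2)` on rank 1, and all keys `>= b2` on rank 2.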
+pub fn redistribute_by_bins( + sorted_keys: &[T], + bins: &[T], + comm: &C, +) -> Vec { + let counts = sort_to_bins(sorted_keys, bins); + let counts = counts.iter().map(|elem| *elem as i32).collect_vec(); + redistribute(sorted_keys, &counts, comm) +} + /// Compute displacements from a vector of counts. /// /// This is useful for global MPI varcount operations. Let @@ -202,3 +316,22 @@ pub fn displacements(counts: &[i32]) -> Vec { }) .collect() } + +#[cfg(test)] +mod test { + use itertools::Itertools; + + use super::sort_to_bins; + + #[test] + fn test_sort_to_bins() { + let elems = (0..100).collect_vec(); + let bins = [0, 17, 55]; + + let counts = sort_to_bins(&elems, &bins); + + assert_eq!(counts[0], 17); + assert_eq!(counts[1], 38); + assert_eq!(counts[2], 45); + } +} From 8a3d6e5274986eb25cd28d2d46fa517d3b178006 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 12:11:42 +0100 Subject: [PATCH 22/42] Custom min max for MPI --- src/morton.rs | 13 ---- src/parsort.rs | 176 +++++-------------------------------------------- src/tools.rs | 30 +++++++++ 3 files changed, 48 insertions(+), 171 deletions(-) diff --git a/src/morton.rs b/src/morton.rs index eb9d49e..43e4aa9 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -6,7 +6,6 @@ use crate::constants::{ Y_LOOKUP_ENCODE, Z_LOOKUP_DECODE, Z_LOOKUP_ENCODE, }; use crate::geometry::PhysicalBox; -use crate::parsort::{MaxValue, MinValue}; use itertools::izip; use itertools::Itertools; use mpi::traits::Equivalence; @@ -27,18 +26,6 @@ impl Default for MortonKey { } } -impl MinValue for MortonKey { - fn min_value() -> Self { - MortonKey::root() - } -} - -impl MaxValue for MortonKey { - fn max_value() -> Self { - MortonKey::deepest_last() - } -} - impl MortonKey { /// Create a new Morton key. Users should use `[MortonKey::from_index_and_level].` fn new(value: u64) -> Self { diff --git a/src/parsort.rs b/src/parsort.rs index 49d48ca..f38e469 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -9,41 +9,18 @@ use mpi::traits::CommunicatorCollectives; use mpi::traits::Equivalence; use rand::{seq::SliceRandom, Rng}; -use crate::tools::{gather_to_all, redistribute}; +use crate::tools::{gather_to_all, global_max, global_min, redistribute_by_bins}; const OVERSAMPLING: usize = 8; /// Sortable trait that each type fed into parsort needs to satisfy. pub trait ParallelSortable: - MinValue - + MaxValue - + Equivalence - + Copy - + Clone - + Default - + PartialEq - + Eq - + PartialOrd - + Ord - + Display - + Sized + Equivalence + Copy + Clone + PartialEq + Eq + PartialOrd + Ord + Display + Sized { } -impl< - T: MinValue - + MaxValue - + Equivalence - + Copy - + Clone - + Default - + PartialEq - + Eq - + PartialOrd - + Ord - + Display - + Sized, - > ParallelSortable for T +impl + ParallelSortable for T { } @@ -88,18 +65,6 @@ unsafe impl Equivalence for UniqueItem { } } -/// Return the minimum possible value of a type. -pub trait MinValue { - /// Return the min value. - fn min_value() -> Self; -} - -/// Return the maximum possible value of a type. -pub trait MaxValue { - /// Return the max value. 
- fn max_value() -> Self; -} - impl Display for UniqueItem { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -110,18 +75,6 @@ impl Display for UniqueItem { } } -impl MinValue for UniqueItem { - fn min_value() -> Self { - UniqueItem::new(::min_value(), 0, 0) - } -} - -impl MaxValue for UniqueItem { - fn max_value() -> Self { - UniqueItem::new(::max_value(), 0, 0) - } -} - impl UniqueItem { pub fn new(value: T, rank: usize, index: usize) -> Self { Self { value, rank, index } @@ -151,8 +104,13 @@ where OVERSAMPLING }; - // We are choosing unique splitters that neither contain - // zero nor u64::max. + // We get the global smallest and global largest element. We do not want those + // in the splitter so filter out their occurence. + + let global_min_elem = global_min(arr, comm); + let global_max_elem = global_max(arr, comm); + + // We do not want the global smallest element in the splitter. let splitters = arr .choose_multiple(rng, oversampling) @@ -171,91 +129,25 @@ where // We now insert the smallest and largest possible element if they are not already // in the splitter collection. - if *all_splitters.first().unwrap() != UniqueItem::min_value() { - all_splitters.insert(0, UniqueItem::min_value()) + if *all_splitters.first().unwrap() != global_min_elem { + all_splitters.insert(0, global_min_elem) } - if *all_splitters.last().unwrap() != UniqueItem::max_value() { - all_splitters.push(UniqueItem::max_value()); + if *all_splitters.last().unwrap() != global_max_elem { + all_splitters.push(global_max_elem); } // We now define p buckets (p is number of processors) and we return - // a p + 1 element array containing the first element of each bucket - // concluded with the largest possible element. + // a p element array containing the first element of each bucket all_splitters = split(&all_splitters, size) .map(|slice| slice.first().unwrap()) .copied() .collect::>(); - all_splitters.push(UniqueItem::max_value()); all_splitters } -fn get_counts(arr: &[UniqueItem], buckets: &[UniqueItem]) -> Vec { - // The following array will store the counts for each bucket. - - let mut counts = vec![0_usize; buckets.len() - 1]; - - // We are iterating through the array. Whenever an element is larger or equal than - // the current splitter we store the current position in `bin_displs` and advance `splitter_iter` - // by 1. - - // In the following iterator we skip the first bin displacement position as this must be the default - // zero (start of the bins). - - // Note that bucket iterator has as many elements as counts as the tuple_windows has length - // 1 smaller than the original array length. - let mut bucket_iter = buckets.iter().tuple_windows::<(_, _)>(); - - // We skip the first element as this is always zero. - let mut count_iter = counts.iter_mut(); - - let mut count: usize = 0; - let mut current_count = count_iter.next().unwrap(); - - let (mut first, mut last) = bucket_iter.next().unwrap(); - - for elem in arr { - // The test after the or sorts out the case that our set includes the maximum possible - // item and we are in the last bucket. The biggest item should be counted as belonging - // to the bucket. - if (first <= elem && elem < last) - || (*last == UniqueItem::max_value() && *elem == UniqueItem::max_value()) - { - // Element is in the right bucket. - count += 1; - continue; - } else { - // Element is not in the right bucket. - // Store counts and find the correct bucket. 
- *current_count = count; - loop { - (first, last) = bucket_iter.next().unwrap(); - current_count = count_iter.next().unwrap(); - if (first <= elem && elem < last) - || (*last == UniqueItem::max_value() && *elem == UniqueItem::max_value()) - { - break; - } - } - // Now have the right bucket. Reset count and continue. - count = 1; - } - } - - // Need to store the count for the last bucket in the iterator. - // This is always necessary as last iterator is half open interval. - // So we don't go into the else part of the for loop. - - *current_count = count; - - // We don't need to fill the remaining counts entries with zero - // since the array is already initialized with zero. - - counts -} - /// Parallel sort pub fn parsort( arr: &[T], @@ -287,16 +179,8 @@ pub fn parsort let buckets = get_buckets(&arr, comm, rng); - // We now compute how many elements of our array go into each bucket. - - let counts = get_counts(&arr, &buckets) - .iter() - .map(|&elem| elem as i32) - .collect::>(); - - // We can now redistribute the array across the processors. - - let mut recvbuffer = redistribute(&arr, &counts, comm); + // We now redistribute with respect to these buckets. + let mut recvbuffer = redistribute_by_bins(&arr, &buckets, comm); // We now have everything in the receive buffer. Now sort the local elements and return @@ -336,27 +220,3 @@ impl<'a, T> Iterator for Split<'a, T> { Some(chunk) } } - -macro_rules! impl_min_max_value { - ($type:ty) => { - impl MinValue for $type { - fn min_value() -> Self { - <$type>::MIN - } - } - - impl MaxValue for $type { - fn max_value() -> Self { - <$type>::MAX - } - } - }; -} - -impl_min_max_value!(usize); -impl_min_max_value!(i8); -impl_min_max_value!(i32); -impl_min_max_value!(i64); -impl_min_max_value!(u8); -impl_min_max_value!(u32); -impl_min_max_value!(u64); diff --git a/src/tools.rs b/src/tools.rs index 46c9514..46415fd 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -105,6 +105,36 @@ pub fn global_size(arr: &[T], comm: &C) -> usize global_size } +/// Get the maximum value across all ranks +pub fn global_max( + arr: &[T], + comm: &C, +) -> T { + let local_max = arr.iter().max().unwrap(); + + // Just need to initialize global_max with something. + let mut global_max = *local_max; + + comm.all_reduce_into(local_max, &mut global_max, SystemOperation::max()); + + global_max +} + +/// Get the minimum value across all ranks +pub fn global_min( + arr: &[T], + comm: &C, +) -> T { + let local_min = arr.iter().min().unwrap(); + + // Just need to initialize global_min with something. + let mut global_min = *local_min; + + comm.all_reduce_into(local_min, &mut global_min, SystemOperation::min()); + + global_min +} + /// Communicate the first element of each local array back to the previous rank. 
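 ///
 /// Intended contract, sketched from how `complete_tree` in `src/octree.rs` uses it:
 /// on rank `r` the call returns `Some` with the first element of rank `r + 1`, while
 /// the last rank receives `None`.
 /// ```ignore
 /// if let Some(next_first) = communicate_back(&local_sorted, &comm) {
 ///     // Not the last rank: `next_first` is the smallest element of the next rank.
 ///     local_sorted.push(next_first);
 /// }
 /// ```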
pub fn communicate_back( arr: &[T], From 4eed598e6972d30ebe92afafcfbbfcf97952b1da Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 13:55:35 +0100 Subject: [PATCH 23/42] WIP: Fixing global min max --- Cargo.toml | 1 + examples/mpi_cumsum.rs | 2 +- src/tools.rs | 39 ++++++++++++++++++++++++++++++--------- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bff73c7..a75d666 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,6 +28,7 @@ num = "0.4.*" bytemuck = "1.*" vtkio = "0.6.*" mpi = {version = "0.8.*", features = ["derive", "user-operations"] } +once_cell = "*" [profile.release] debug = 1 diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs index 0c59245..08f8c97 100644 --- a/examples/mpi_cumsum.rs +++ b/examples/mpi_cumsum.rs @@ -25,7 +25,7 @@ pub fn main() { // Generate random numbers - let mut elems = Vec::::with_capacity(3 * nelems); + let mut elems = Vec::::with_capacity(nelems); for _ in 0..nelems { elems.push(rng.gen_range(0..100)); diff --git a/src/tools.rs b/src/tools.rs index 46415fd..a55ee5c 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -2,12 +2,13 @@ use itertools::{izip, Itertools}; use mpi::{ - collective::SystemOperation, - datatype::{Partition, PartitionMut}, + collective::{SystemOperation, UserOperation}, + datatype::{DynBuffer, Partition, PartitionMut}, point_to_point as p2p, + raw::AsRaw, traits::{ - CommunicatorCollectives, Destination, Equivalence, PartitionedBuffer, PartitionedBufferMut, - Root, Source, + AsDatatype, CommunicatorCollectives, Destination, Equivalence, PartitionedBuffer, + PartitionedBufferMut, Root, Source, }, }; use num::traits::Zero; @@ -115,7 +116,17 @@ pub fn global_max( // Just need to initialize global_max with something. let mut global_max = *local_max; - comm.all_reduce_into(local_max, &mut global_max, SystemOperation::max()); + comm.all_reduce_into( + local_max, + &mut global_max, + &UserOperation::commutative(|x, y| { + let x: &[T] = x.downcast().unwrap(); + let y: &mut [T] = y.downcast().unwrap(); + for (&x_i, y_i) in x.iter().zip(y) { + *y_i = x_i.max(*y_i); + } + }), + ); global_max } @@ -125,12 +136,22 @@ pub fn global_min( arr: &[T], comm: &C, ) -> T { - let local_min = arr.iter().min().unwrap(); + let local_min = *arr.iter().min().unwrap(); // Just need to initialize global_min with something. - let mut global_min = *local_min; - - comm.all_reduce_into(local_min, &mut global_min, SystemOperation::min()); + let mut global_min = local_min; + + comm.all_reduce_into( + &local_min, + &mut global_min, + &UserOperation::commutative(|x, y| { + let x: &[T] = x.downcast().unwrap(); + let y: &mut [T] = y.downcast().unwrap(); + for (&x_i, y_i) in x.iter().zip(y) { + *y_i = x_i.min(*y_i); + } + }), + ); global_min } From a6a2cb7731a5eff09834ccd0568f0bcd47c922bd Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 14:13:18 +0100 Subject: [PATCH 24/42] Specialised parsort for Morton keys --- examples/parsort.rs | 13 ++++----- src/parsort.rs | 68 ++++++++------------------------------------- src/tools.rs | 24 ++++++++++++++++ 3 files changed, 42 insertions(+), 63 deletions(-) diff --git a/examples/parsort.rs b/examples/parsort.rs index 8a20706..5de9c07 100644 --- a/examples/parsort.rs +++ b/examples/parsort.rs @@ -1,5 +1,8 @@ //! Testing the hyksort component. 
-use bempp_octree::{parsort::parsort, tools::is_sorted_array}; +use bempp_octree::{ + parsort::parsort, + tools::{generate_random_keys, is_sorted_array}, +}; use mpi::traits::Communicator; use rand::prelude::*; @@ -10,13 +13,9 @@ pub fn main() { let mut rng = rand::rngs::StdRng::seed_from_u64(0); - let mut arr = Vec::::new(); + let keys = generate_random_keys(n_per_rank, &mut rng); - for _ in 0..n_per_rank { - arr.push(rng.gen()); - } - - let arr = parsort(&arr, &world, &mut rng); + let arr = parsort(&keys, &world, &mut rng); assert!(is_sorted_array(&arr, &world)); diff --git a/src/parsort.rs b/src/parsort.rs index f38e469..7191783 100644 --- a/src/parsort.rs +++ b/src/parsort.rs @@ -1,71 +1,28 @@ //! Implementation of a parallel samplesort. use std::fmt::Display; -use std::mem::offset_of; use itertools::Itertools; -use mpi::datatype::{UncommittedDatatypeRef, UncommittedUserDatatype, UserDatatype}; use mpi::traits::CommunicatorCollectives; use mpi::traits::Equivalence; use rand::{seq::SliceRandom, Rng}; +use crate::morton::MortonKey; use crate::tools::{gather_to_all, global_max, global_min, redistribute_by_bins}; const OVERSAMPLING: usize = 8; -/// Sortable trait that each type fed into parsort needs to satisfy. -pub trait ParallelSortable: - Equivalence + Copy + Clone + PartialEq + Eq + PartialOrd + Ord + Display + Sized -{ -} - -impl - ParallelSortable for T -{ -} - /// An internal struct. We convert every array element /// into this struct. The idea is that this is guaranteed to be unique /// as it encodes not only the element but also its rank and index. -#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord)] -struct UniqueItem { - pub value: T, +#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Equivalence)] +struct UniqueItem { + pub value: MortonKey, pub rank: usize, pub index: usize, } -unsafe impl Equivalence for UniqueItem { - type Out = UserDatatype; - - // Depending on the MPI implementation the below offset needs - // to be an i64 or isize. If it is an i64 Clippy warns about - // a useless conversion. But this warning is MPI implementation - // dependent. So switch off here. 
- - #[allow(clippy::useless_conversion)] - fn equivalent_datatype() -> Self::Out { - UserDatatype::structured::( - &[1, 1, 1], - &[ - (offset_of!(UniqueItem, value) as i64) - .try_into() - .unwrap(), - (offset_of!(UniqueItem, rank) as i64).try_into().unwrap(), - (offset_of!(UniqueItem, index) as i64) - .try_into() - .unwrap(), - ], - &[ - UncommittedUserDatatype::contiguous(1, &::equivalent_datatype()) - .as_ref(), - usize::equivalent_datatype().into(), - usize::equivalent_datatype().into(), - ], - ) - } -} - -impl Display for UniqueItem { +impl Display for UniqueItem { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, @@ -75,22 +32,21 @@ impl Display for UniqueItem { } } -impl UniqueItem { - pub fn new(value: T, rank: usize, index: usize) -> Self { +impl UniqueItem { + pub fn new(value: MortonKey, rank: usize, index: usize) -> Self { Self { value, rank, index } } } -fn to_unique_item(arr: &[T], rank: usize) -> Vec> { +fn to_unique_item(arr: &[MortonKey], rank: usize) -> Vec { arr.iter() .enumerate() .map(|(index, &item)| UniqueItem::new(item, rank, index)) .collect() } -fn get_buckets(arr: &[UniqueItem], comm: &C, rng: &mut R) -> Vec> +fn get_buckets(arr: &[UniqueItem], comm: &C, rng: &mut R) -> Vec where - T: ParallelSortable, C: CommunicatorCollectives, R: Rng + ?Sized, { @@ -149,11 +105,11 @@ where } /// Parallel sort -pub fn parsort( - arr: &[T], +pub fn parsort( + arr: &[MortonKey], comm: &C, rng: &mut R, -) -> Vec { +) -> Vec { let size = comm.size() as usize; let rank = comm.rank() as usize; // If we only have a single rank simply sort the local array and return diff --git a/src/tools.rs b/src/tools.rs index a55ee5c..8f76b4c 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -12,6 +12,12 @@ use mpi::{ }, }; use num::traits::Zero; +use rand::Rng; + +use crate::{ + constants::{DEEPEST_LEVEL, LEVEL_SIZE}, + morton::MortonKey, +}; /// Gather array to all processes pub fn gather_to_all(arr: &[T], comm: &C) -> Vec { @@ -352,6 +358,24 @@ pub fn redistribute_by_bins( redistribute(sorted_keys, &counts, comm) } +/// Generate random keys for testing. +pub fn generate_random_keys(nkeys: usize, rng: &mut R) -> Vec { + let mut result = Vec::::with_capacity(nkeys); + + let xindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + let yindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + let zindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); + + for (xval, yval, zval) in izip!(xindices.iter(), yindices.iter(), zindices.iter()) { + result.push(MortonKey::from_index_and_level( + [xval, yval, zval], + DEEPEST_LEVEL as usize, + )); + } + + result +} + /// Compute displacements from a vector of counts. /// /// This is useful for global MPI varcount operations. 
Let From 2c2ed174762404a2925a1604e5360b710990eb42 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 14:14:40 +0100 Subject: [PATCH 25/42] Fixed warnings --- examples/mpi_complete_tree.rs | 6 +----- examples/mpi_cumsum.rs | 6 +----- examples/parallel_tests.rs | 3 +-- src/tools.rs | 8 ++------ 4 files changed, 5 insertions(+), 18 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index d3bd214..60a095a 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -2,11 +2,7 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, - geometry::PhysicalBox, - octree::{ - complete_tree, compute_global_bounding_box, is_complete_linear_tree, points_to_morton, - }, - tools::gather_to_root, + octree::{complete_tree, is_complete_linear_tree, points_to_morton}, }; use mpi::traits::*; use rand::prelude::*; diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs index 08f8c97..ab9e1b1 100644 --- a/examples/mpi_cumsum.rs +++ b/examples/mpi_cumsum.rs @@ -1,10 +1,6 @@ //! Test the computation of a global bounding box across MPI ranks. -use bempp_octree::{ - geometry::PhysicalBox, - octree::compute_global_bounding_box, - tools::{gather_to_root, global_inclusive_cumsum}, -}; +use bempp_octree::tools::{gather_to_root, global_inclusive_cumsum}; use itertools::{izip, Itertools}; use mpi::traits::*; use rand::prelude::*; diff --git a/examples/parallel_tests.rs b/examples/parallel_tests.rs index 9d841ea..38e35cb 100644 --- a/examples/parallel_tests.rs +++ b/examples/parallel_tests.rs @@ -1,8 +1,7 @@ //! Testing the hyksort component. use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; use bempp_octree::morton::MortonKey; -use bempp_octree::octree::{block_partition, linearize, partition}; -use bempp_octree::parsort::parsort; +use bempp_octree::octree::{block_partition, linearize}; use bempp_octree::tools::gather_to_root; use itertools::{izip, Itertools}; use mpi::traits::*; diff --git a/src/tools.rs b/src/tools.rs index 8f76b4c..075804a 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -3,13 +3,9 @@ use itertools::{izip, Itertools}; use mpi::{ collective::{SystemOperation, UserOperation}, - datatype::{DynBuffer, Partition, PartitionMut}, + datatype::{Partition, PartitionMut}, point_to_point as p2p, - raw::AsRaw, - traits::{ - AsDatatype, CommunicatorCollectives, Destination, Equivalence, PartitionedBuffer, - PartitionedBufferMut, Root, Source, - }, + traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source}, }; use num::traits::Zero; use rand::Rng; From f354653051a016e47063f4e8ab07a8621212d955 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 16:30:17 +0100 Subject: [PATCH 26/42] More cleanup --- src/octree.rs | 227 +++++++++++++++++++++++--------------------------- 1 file changed, 104 insertions(+), 123 deletions(-) diff --git a/src/octree.rs b/src/octree.rs index 0371f3f..574350d 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,25 +1,17 @@ //! 
Parallel Octree structure -use std::collections::HashMap; - use crate::{ constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, parsort::parsort, - tools::{communicate_back, gather_to_all, global_inclusive_cumsum, redistribute}, + tools::{communicate_back, gather_to_all, global_inclusive_cumsum, redistribute, sort_to_bins}, }; -use mpi::{ - point_to_point as p2p, - traits::{Root, Source}, -}; +use mpi::traits::Root; use itertools::{izip, Itertools}; -use mpi::{ - collective::SystemOperation, - traits::{CommunicatorCollectives, Destination}, -}; +use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; use rand::Rng; /// Compute the global bounding box across all points on all processes. @@ -283,12 +275,7 @@ pub fn redistribute_with_respect_to_coarse_tree( let my_first = coarse_tree.first().unwrap(); - let mut global_bins = gather_to_all(std::slice::from_ref(my_first), comm); - - // We now have the first index from each process. We also want - // an upper bound for the last index of the tree to make the sorting into - // bins easier. - global_bins.push(MortonKey::upper_bound()); + let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); // We now have our bins. We go through our keys and store how // many keys are assigned to each rank. We are using here that @@ -306,41 +293,41 @@ pub fn redistribute_with_respect_to_coarse_tree( redistribute(&sorted_keys, &rank_counts, comm) } -/// Create bins from sorted keys. -pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { - let mut bin_counts = vec![0 as usize; bins.len() - 1]; - - // This iterates over each possible bin and returns also the associated rank. - let mut bin_iter = izip!( - bin_counts.iter_mut(), - bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), - ); - - // We take the first element of the bin iterator. There will always be at least one. - let mut r: &mut usize; - let mut bin_start: &MortonKey; - let mut bin_end: &MortonKey; - (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - - for &key in sorted_keys.iter() { - if *bin_start <= key && key < *bin_end { - *r += 1; - } else { - // Move the bin forward until it fits. There will always be a fitting bin. - while let Some((rn, (bsn, ben))) = bin_iter.next() { - if *bsn <= key && key < *ben { - *rn += 1; - r = rn; - bin_start = bsn; - bin_end = ben; - break; - } - } - } - } - - bin_counts -} +// /// Create bins from sorted keys. +// pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { +// let mut bin_counts = vec![0 as usize; bins.len() - 1]; + +// // This iterates over each possible bin and returns also the associated rank. +// let mut bin_iter = izip!( +// bin_counts.iter_mut(), +// bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), +// ); + +// // We take the first element of the bin iterator. There will always be at least one. +// let mut r: &mut usize; +// let mut bin_start: &MortonKey; +// let mut bin_end: &MortonKey; +// (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); + +// for &key in sorted_keys.iter() { +// if *bin_start <= key && key < *bin_end { +// *r += 1; +// } else { +// // Move the bin forward until it fits. There will always be a fitting bin. +// while let Some((rn, (bsn, ben))) = bin_iter.next() { +// if *bsn <= key && key < *ben { +// *rn += 1; +// r = rn; +// bin_start = bsn; +// bin_end = ben; +// break; +// } +// } +// } +// } + +// bin_counts +// } /// Return a complete tree generated from local keys and associated coarse keys. 
/// @@ -360,8 +347,7 @@ pub fn create_local_tree( // is associated with a coarse slice. For this we need to add an upper bound // coarse keys to ensure that we have suitable bins. - let mut bins = coarse_keys.to_vec(); - bins.push(MortonKey::upper_bound()); + let bins = coarse_keys.to_vec(); let counts = sort_to_bins(&sorted_fine_keys, &bins); @@ -390,7 +376,7 @@ pub fn create_local_tree( } } - coarse_keys.to_vec() + new_coarse_keys.to_vec() } /// Linearize a set of weighted Morton keys. @@ -485,53 +471,59 @@ pub fn partition( let w = total_weight / (size as usize); let k = total_weight % (size as usize); - let mut hash_map = HashMap::>::new(); - // Sort the elements into bins according to which process they should be sent. + // We do not need to sort the Morton keys themselves into bins but the scanned weights. + // The corresponding counts are the right counts for the Morton keys. + + let mut bins = Vec::::with_capacity(size as usize); for p in 1..=size as usize { - let q = if p <= k as usize { - izip!(sorted_keys, &scan) - .filter_map(|(&key, &s)| { - if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) - || (p == size as usize && (p - 1) * (1 + w) <= s) - { - Some(key) - } else { - None - } - }) - .collect_vec() + if p <= k { + bins.push((p - 1) * (1 + w)); } else { - izip!(sorted_keys, &scan) - .filter_map(|(&key, &s)| { - if ((p - 1) * w + k <= s && s < p * w + k) - || (p == size as usize && (p - 1) * w + k <= s) - { - Some(key) - } else { - None - } - }) - .collect_vec() - }; - hash_map.insert(p - 1, q); + bins.push((p - 1) * w + k); + } } + let counts = sort_to_bins(&scan, &bins) + .iter() + .map(|elem| *elem as i32) + .collect_vec(); + + // for p in 1..=size as usize { + // let q = if p <= k as usize { + // izip!(sorted_keys, &scan) + // .filter_map(|(&key, &s)| { + // if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) + // || (p == size as usize && (p - 1) * (1 + w) <= s) + // { + // Some(key) + // } else { + // None + // } + // }) + // .collect_vec() + // } else { + // izip!(sorted_keys, &scan) + // .filter_map(|(&key, &s)| { + // if ((p - 1) * w + k <= s && s < p * w + k) + // || (p == size as usize && (p - 1) * w + k <= s) + // { + // Some(key) + // } else { + // None + // } + // }) + // .collect_vec() + // }; + // hash_map.insert(p - 1, q); + // } + // Now distribute the data with an all to all v. // We create a vector of how many elements to send to each process and // then send the actual data. - let mut counts = vec![0 as i32; size as usize]; - - let mut all_elements = Vec::::new(); - for (index, c) in counts.iter_mut().enumerate() { - let elements = hash_map.get(&index).unwrap(); - *c = elements.len() as i32; - all_elements.extend(elements.iter()) - } - - let mut recvbuffer = redistribute(&all_elements, &counts, comm); + let mut recvbuffer = redistribute(&sorted_keys, &counts, comm); recvbuffer.sort_unstable(); recvbuffer @@ -560,42 +552,31 @@ pub fn complete_tree( // ancestor of the deepest first key and first element. Correspondingly on the last process // we need to insert the last child of the finest ancester of the deepest last key and last element. + let next_key = communicate_back(&linearized_keys, comm); + + if rank < size - 1 { + linearized_keys.push(next_key.unwrap()); + } + + // Now fix the first key on the first rank. 
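+    // (On rank 0 the completed region must reach the deepest first box of the domain:
+    // if the current first key is not an ancestor of that box, the first child of the
+    // finest common ancestor of the two keys is prepended below.)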
+ + if rank == 0 { + let first_key = linearized_keys.first().unwrap(); + let deepest_first = MortonKey::deepest_first(); + if !first_key.is_ancestor(deepest_first) { + let ancestor = deepest_first.finest_common_ancestor(*first_key); + linearized_keys.insert(0, ancestor.children()[0]); + } + } + if rank == size - 1 { - // On last process send first element to previous processes and insert last - // possible box from region into list. - comm.process_at_rank(rank - 1) - .send(linearized_keys.first().unwrap()); - let last_key = *linearized_keys.last().unwrap(); + let last_key = linearized_keys.last().unwrap(); let deepest_last = MortonKey::deepest_last(); if !last_key.is_ancestor(deepest_last) { - let ancestor = deepest_last.finest_common_ancestor(last_key); + let ancestor = deepest_last.finest_common_ancestor(*last_key); linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); } - } else { - let (other, _status) = if rank > 0 { - // On intermediate process receive from the next process - // and send first element to previous process. - p2p::send_receive( - linearized_keys.first().unwrap(), - &comm.process_at_rank(rank - 1), - &comm.process_at_rank(rank + 1), - ) - } else { - // On first process insert at the beginning the first possible - // box in the region and receive the key from next process. - let first_key = *linearized_keys.first().unwrap(); - let deepest_first = MortonKey::deepest_first(); - if !first_key.is_ancestor(deepest_first) { - let ancestor = deepest_first.finest_common_ancestor(first_key); - linearized_keys.insert(0, ancestor.children()[0]); - } - - comm.process_at_rank(1).receive::() - }; - // If we are not at the last process we need to introduce the received key - // into our list. - linearized_keys.push(other); - }; + } // Now complete the regions defined by the keys on each process. 
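For orientation, the completion step that this boundary exchange feeds into can be
sketched as follows (assuming `linearized_keys`, `comm` and the `itertools::Itertools`
trait in scope, as in `src/octree.rs`):

    // Every rank except the last has appended its right neighbour's first key, so
    // filling the gaps between consecutive local keys also closes the gap to the
    // neighbouring rank's region.
    let mut result = Vec::<MortonKey>::new();
    for (&key1, &key2) in linearized_keys.iter().tuple_windows() {
        result.push(key1);
        result.extend_from_slice(key1.fill_between_keys(key2).as_slice());
    }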
From d4ebe77849b8078b3bc2afc1a650a0f771702c99 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 19:55:04 +0100 Subject: [PATCH 27/42] WIP: Better testing --- examples/mpi_complete_tree.rs | 10 ++- examples/parallel_tests.rs | 2 +- src/octree.rs | 130 ++++++++++++++++++++++++---------- 3 files changed, 98 insertions(+), 44 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index 60a095a..052b214 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -2,7 +2,7 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, - octree::{complete_tree, is_complete_linear_tree, points_to_morton}, + octree::{complete_tree, is_complete_linear_tree, linearize, points_to_morton}, }; use mpi::traits::*; use rand::prelude::*; @@ -32,14 +32,12 @@ pub fn main() { // Compute the Morton keys on the deepest level let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); - assert!(!is_complete_linear_tree(&keys, &comm)); + let linear_keys = linearize(&keys, &mut rng, &comm); // Generate a complete tree - let distributed_complete_tree = complete_tree(&keys, &mut rng, &comm); + let distributed_complete_tree = complete_tree(&linear_keys, &comm); - let is_complete_linear = is_complete_linear_tree(&distributed_complete_tree, &comm); - - assert!(is_complete_linear); + assert!(is_complete_linear_tree(&distributed_complete_tree, &comm)); if comm.rank() == 0 { println!("Distributed tree is complete and linear."); diff --git a/examples/parallel_tests.rs b/examples/parallel_tests.rs index 38e35cb..8285d7c 100644 --- a/examples/parallel_tests.rs +++ b/examples/parallel_tests.rs @@ -122,7 +122,7 @@ pub fn test_coarse_partition(rng: &mut R, co println!("Rank {} has {} keys. ", rank, keys.len()); - let partitioned_tree = block_partition(&keys, rng, comm); + let partitioned_tree = block_partition(&keys, comm); println!( "Partitioned tree on rank {} has {} keys.", diff --git a/src/octree.rs b/src/octree.rs index 574350d..3185103 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -127,30 +127,27 @@ pub fn points_to_morton( (keys, bounding_box) } -/// Block partition of tree. -/// -/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned -/// keys and the associated coarse keys. -/// A necessary condition for the block partitioning is that -// all sorted keys are on the same level. -pub fn block_partition( - sorted_keys: &[MortonKey], - rng: &mut R, +/// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree. +pub fn compute_coarse_tree( + linear_keys: &[MortonKey], comm: &C, -) -> (Vec, Vec) { - let rank = comm.rank(); - if comm.size() == 1 { - // On a single node block partitioning should not do anything. - return (sorted_keys.to_vec(), vec![MortonKey::root()]); +) -> Vec { + let size = comm.size(); + + debug_assert!(is_linear_tree(linear_keys, comm)); + + // On a single node a complete coarse tree is simply the root. + if size == 1 { + return vec![MortonKey::root()]; } - let mut completed_region = sorted_keys + let mut completed_region = linear_keys .first() .unwrap() - .fill_between_keys(*sorted_keys.last().unwrap()); + .fill_between_keys(*linear_keys.last().unwrap()); - completed_region.insert(0, *sorted_keys.first().unwrap()); - completed_region.push(*sorted_keys.last().unwrap()); + completed_region.insert(0, *linear_keys.first().unwrap()); + completed_region.push(*linear_keys.last().unwrap()); // Get the smallest level members of the completed region. 
@@ -169,7 +166,28 @@ pub fn block_partition( .copied() .collect_vec(); - let coarse_tree = complete_tree(&largest_boxes, rng, comm); + debug_assert!(is_linear_tree(&largest_boxes, comm)); + + complete_tree(&largest_boxes, comm) +} + +/// Block partition of tree. +/// +/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned +/// keys and the associated coarse keys. +/// A necessary condition for the block partitioning is that +// all sorted keys are on the same level. +pub fn block_partition( + linear_keys: &[MortonKey], + comm: &C, +) -> (Vec, Vec) { + let rank = comm.rank(); + if comm.size() == 1 { + // On a single node block partitioning should not do anything. + return (linear_keys.to_vec(), vec![MortonKey::root()]); + } + + let coarse_tree = compute_coarse_tree(&linear_keys, comm); // We want to partition the coarse tree. But we need the correct weights. The idea // is that we use the number of original leafs that intersect with the coarse tree @@ -195,7 +213,7 @@ pub fn block_partition( // Let's find the start of our region. The start of our region is a coarse key that is an ancestor // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. - let first_key = *sorted_keys.first().unwrap(); + let first_key = *linear_keys.first().unwrap(); let first_coarse_index = global_coarse_tree .iter() @@ -204,7 +222,7 @@ pub fn block_partition( // Now we need to find the end index of our region. For this again we find the index of our coarse tree that // is an ancestor of our last key. - let last_key = *sorted_keys.last().unwrap(); + let last_key = *linear_keys.last().unwrap(); let last_coarse_index = global_coarse_tree .iter() @@ -218,7 +236,7 @@ pub fn block_partition( local_weights[first_coarse_index..=last_coarse_index].iter_mut(), global_coarse_tree[first_coarse_index..=last_coarse_index].iter() ) { - *w += sorted_keys + *w += linear_keys .iter() .filter(|&&key| global_coarse_key.is_ancestor(key)) .count(); @@ -246,7 +264,7 @@ pub fn block_partition( let coarse_tree = partition(&coarse_tree, &weights, comm); ( - redistribute_with_respect_to_coarse_tree(&sorted_keys, &coarse_tree, comm), + redistribute_with_respect_to_coarse_tree(&linear_keys, &coarse_tree, comm), coarse_tree, ) @@ -529,19 +547,20 @@ pub fn partition( recvbuffer } -/// Given a distributed set of keys, generate a complete linear Octree. -pub fn complete_tree( - keys: &[MortonKey], - rng: &mut R, +/// Given a distributed set of linear keys, generate a complete tree. +pub fn complete_tree( + linear_keys: &[MortonKey], comm: &C, ) -> Vec { - let mut linearized_keys = linearize(keys, rng, comm); + let mut linear_keys = linear_keys.to_vec(); + + debug_assert!(is_linear_tree(&linear_keys, comm)); let size = comm.size(); let rank = comm.rank(); if size == 1 { - return MortonKey::complete_tree(linearized_keys.as_slice()); + return MortonKey::complete_tree(linear_keys.as_slice()); } // Now insert on the first and last process the first and last child of the @@ -552,29 +571,29 @@ pub fn complete_tree( // ancestor of the deepest first key and first element. Correspondingly on the last process // we need to insert the last child of the finest ancester of the deepest last key and last element. - let next_key = communicate_back(&linearized_keys, comm); + let next_key = communicate_back(&linear_keys, comm); if rank < size - 1 { - linearized_keys.push(next_key.unwrap()); + linear_keys.push(next_key.unwrap()); } // Now fix the first key on the first rank. 
if rank == 0 { - let first_key = linearized_keys.first().unwrap(); + let first_key = linear_keys.first().unwrap(); let deepest_first = MortonKey::deepest_first(); if !first_key.is_ancestor(deepest_first) { let ancestor = deepest_first.finest_common_ancestor(*first_key); - linearized_keys.insert(0, ancestor.children()[0]); + linear_keys.insert(0, ancestor.children()[0]); } } if rank == size - 1 { - let last_key = linearized_keys.last().unwrap(); + let last_key = linear_keys.last().unwrap(); let deepest_last = MortonKey::deepest_last(); if !last_key.is_ancestor(deepest_last) { let ancestor = deepest_last.finest_common_ancestor(*last_key); - linearized_keys.push(ancestor.children()[NSIBLINGS - 1]); + linear_keys.push(ancestor.children()[NSIBLINGS - 1]); } } @@ -582,18 +601,55 @@ pub fn complete_tree( let mut result = Vec::::new(); - for (&key1, &key2) in linearized_keys.iter().tuple_windows() { + for (&key1, &key2) in linear_keys.iter().tuple_windows() { result.push(key1); result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); } if rank == size - 1 { - result.push(*linearized_keys.last().unwrap()); + result.push(*linear_keys.last().unwrap()); } + debug_assert!(is_complete_linear_tree(&result, comm)); + result } +/// Return true if the keys are linear. +pub fn is_linear_tree(arr: &[MortonKey], comm: &C) -> bool { + let mut is_linear = true; + + for (&key1, &key2) in arr.iter().tuple_windows() { + if key1 >= key2 || key1.is_ancestor(key2) { + is_linear = false; + break; + } + } + + if comm.size() == 1 { + return is_linear; + } + + // Now check the interfaces + + if let Some(next_key) = communicate_back(arr, comm) { + let last = *arr.last().unwrap(); + if last >= next_key || last.is_ancestor(next_key) { + is_linear = false; + } + } + + let mut global_is_linear = false; + + comm.all_reduce_into( + &is_linear, + &mut global_is_linear, + SystemOperation::logical_and(), + ); + + global_is_linear +} + /// Return true on all ranks if distributed tree is complete. Otherwise, return false. pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { // First check that the local tree on each node is complete. From c2db50b0edcb105c1a4a8cba083b7ab83d13b4af Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 21:38:48 +0100 Subject: [PATCH 28/42] WIP: Tests involving coarse tree --- examples/mpi_coarse_tree.rs | 95 ++++++++++++++++++++++ examples/parallel_tests.rs | 152 ------------------------------------ src/octree.rs | 138 +++++++++----------------------- 3 files changed, 132 insertions(+), 253 deletions(-) create mode 100644 examples/mpi_coarse_tree.rs delete mode 100644 examples/parallel_tests.rs diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs new file mode 100644 index 0000000..ef0095c --- /dev/null +++ b/examples/mpi_coarse_tree.rs @@ -0,0 +1,95 @@ +//! Test the computation of a global bounding box across MPI ranks. + +use bempp_octree::{ + constants::DEEPEST_LEVEL, + octree::{ + complete_tree, compute_coarse_tree, compute_coarse_tree_weights, is_complete_linear_tree, + linearize, load_balance, points_to_morton, redistribute_with_respect_to_coarse_tree, + }, + tools::global_size, +}; +use mpi::{collective::SystemOperation, traits::*}; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. 
+ let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); + + // Create `npoints` per rank. + let npoints = 10000; + + // Generate random points. + + let mut points = Vec::::with_capacity(3 * npoints); + + for _ in 0..3 * npoints { + points.push(rng.gen()); + } + + // Compute the Morton keys on the deepest level + let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); + + // linearize the keys + let linear_keys = linearize(&keys, &mut rng, &comm); + + // Generate the coarse tree + let coarse_tree = compute_coarse_tree(&linear_keys, &comm); + assert!(is_complete_linear_tree(&coarse_tree, &comm)); + + // We now compute the weights for the coarse tree. + + let weights = compute_coarse_tree_weights(&linear_keys, &coarse_tree, &comm); + + // Assert that the global sum of the weights is identical to the number of linearized keys. + + let mut global_weight: usize = 0; + + comm.all_reduce_into( + &(weights.iter().sum::()), + &mut global_weight, + SystemOperation::sum(), + ); + + assert_eq!(global_weight, global_size(&linear_keys, &comm)); + + // Now load balance the coarse tree + + let balanced_keys = load_balance(&coarse_tree, &weights, &comm); + + // Compute the weights of the balanced keys + + let balanced_weights = compute_coarse_tree_weights(&linear_keys, &balanced_keys, &comm); + + let mut global_balanced_weight: usize = 0; + comm.all_reduce_into( + &(balanced_weights.iter().sum::()), + &mut global_balanced_weight, + SystemOperation::sum(), + ); + + // The global weight of the non-balanced keys should be identical + // to the global weigth of the balanced keys. + + assert_eq!(global_weight, global_balanced_weight); + + // Now compute the new fine keys. + + let redistributed_fine_keys = + redistribute_with_respect_to_coarse_tree(&linear_keys, &balanced_keys, &comm); + + assert_eq!( + global_size(&redistributed_fine_keys, &comm), + global_size(&linear_keys, &comm) + ); + + if comm.rank() == 0 { + println!("Coarse tree successfully created and weights computed."); + } +} diff --git a/examples/parallel_tests.rs b/examples/parallel_tests.rs deleted file mode 100644 index 8285d7c..0000000 --- a/examples/parallel_tests.rs +++ /dev/null @@ -1,152 +0,0 @@ -//! Testing the hyksort component. -use bempp_octree::constants::{DEEPEST_LEVEL, LEVEL_SIZE}; -use bempp_octree::morton::MortonKey; -use bempp_octree::octree::{block_partition, linearize}; -use bempp_octree::tools::gather_to_root; -use itertools::{izip, Itertools}; -use mpi::traits::*; -use rand::prelude::*; - -pub fn assert_linearized(arr: &Vec, comm: &C) { - // Check that the keys are still linearized. 
- let arr = gather_to_root(&arr, comm); - - if comm.rank() == 0 { - let arr = arr.unwrap(); - for (&elem1, &elem2) in arr.iter().tuple_windows() { - assert!(!elem1.is_ancestor(elem2)); - } - println!("{} keys are linearized.", &arr.len()); - } -} - -pub fn generate_random_keys(nkeys: usize, rng: &mut R) -> Vec { - let mut result = Vec::::with_capacity(nkeys); - - let xindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); - let yindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); - let zindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys); - - for (xval, yval, zval) in izip!(xindices.iter(), yindices.iter(), zindices.iter()) { - result.push(MortonKey::from_index_and_level( - [xval, yval, zval], - DEEPEST_LEVEL as usize, - )); - } - - result -} - -pub fn generate_random_tree(max_level: usize, rng: &mut R) -> Vec { - pub fn add_level( - keys: &mut Vec, - current: MortonKey, - rng: &mut R, - max_level: usize, - ) { - keys.push(current); - - if current.level() >= max_level { - return; - } - - let mut children = current.children(); - - // This makes sure that the tree is not sorted. - children.shuffle(rng); - - for child in children { - if rng.gen_bool(0.9) { - add_level(keys, child, rng, max_level); - } - } - } - - let mut keys = Vec::::new(); - add_level(&mut keys, MortonKey::root(), rng, max_level); - - keys -} - -pub fn test_linearize(rng: &mut R, comm: &C) { - let max_level = 6; - let keys = generate_random_tree(max_level, rng); - let rank = comm.rank(); - - // We now linearize the keys. - - if rank == 0 { - println!("Linearizing keys."); - } - let sorted_keys = linearize(&keys, rng, comm); - - // Now check that the tree is properly linearized. - - assert_linearized(&sorted_keys, comm); - if rank == 0 { - println!("Linearization successful."); - } - - // Now form the coarse tree -} - -pub fn test_coarse_partition(rng: &mut R, comm: &C) { - let rank = comm.rank(); - let keys = if rank == 0 { - generate_random_keys(50, rng) - } else { - generate_random_keys(1000, rng) - }; - - // We now linearize the keys. - - let mut keys = linearize(&keys, rng, comm); - - // We move most keys over from rank 0 to rank 2 to check how the partitioning works. - - let nsend = 400; - // Send the last 200 keys from rank 0 to rank 1. - - if rank == 0 { - let send_keys = &keys[keys.len() - nsend..keys.len()]; - comm.process_at_rank(1).send(send_keys); - keys = keys[0..keys.len() - nsend].to_vec(); - } - - if rank == 1 { - let mut recv_keys = vec![MortonKey::default(); nsend]; - comm.process_at_rank(0).receive_into(&mut recv_keys); - recv_keys.extend(keys.iter()); - keys = recv_keys; - } - - println!("Rank {} has {} keys. 
", rank, keys.len()); - - let partitioned_tree = block_partition(&keys, comm); - - println!( - "Partitioned tree on rank {} has {} keys.", - rank, - partitioned_tree.0.len() - ); - - let arr = gather_to_root(&partitioned_tree.0, comm); - - if rank == 0 { - let arr = arr.unwrap(); - for (elem1, elem2) in arr.iter().tuple_windows() { - assert!(*elem1 <= *elem2); - } - println!("Keys are sorted."); - } -} - -pub fn main() { - let universe = mpi::initialize().unwrap(); - let comm = universe.world(); - let rank = comm.rank() as u64; - // Each process gets its own rng - let mut rng = rand::rngs::StdRng::seed_from_u64(rank as u64); - test_linearize(&mut rng, &comm); - test_coarse_partition(&mut rng, &comm); -} diff --git a/src/octree.rs b/src/octree.rs index 3185103..ca5261d 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -128,6 +128,7 @@ pub fn points_to_morton( } /// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree. +/// The returned coarse tree is load balanced according to the number of linear keys in each coarse block. pub fn compute_coarse_tree( linear_keys: &[MortonKey], comm: &C, @@ -171,24 +172,13 @@ pub fn compute_coarse_tree( complete_tree(&largest_boxes, comm) } -/// Block partition of tree. -/// -/// Returns a tuple `(partitioned_keys, coarse_keys)` of the partitioned -/// keys and the associated coarse keys. -/// A necessary condition for the block partitioning is that -// all sorted keys are on the same level. -pub fn block_partition( +/// Compute the weights of each coarse tree block as the number of linear keys associated with each coarse block. +pub fn compute_coarse_tree_weights( linear_keys: &[MortonKey], + coarse_tree: &[MortonKey], comm: &C, -) -> (Vec, Vec) { +) -> Vec { let rank = comm.rank(); - if comm.size() == 1 { - // On a single node block partitioning should not do anything. - return (linear_keys.to_vec(), vec![MortonKey::root()]); - } - - let coarse_tree = compute_coarse_tree(&linear_keys, comm); - // We want to partition the coarse tree. But we need the correct weights. The idea // is that we use the number of original leafs that intersect with the coarse tree // as leafs. In order to compute this we send the coarse tree around to all processes @@ -204,7 +194,7 @@ pub fn block_partition( let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); // We now compute the local weights. - let mut local_weights = vec![0 as usize; global_coarse_tree.len()]; + let mut local_weight_contribution = vec![0 as usize; global_coarse_tree.len()]; // In the following loop we want to be a bit smart. We do not iterate through all the local elements. // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region @@ -233,7 +223,7 @@ pub fn block_partition( // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). for (w, &global_coarse_key) in izip!( - local_weights[first_coarse_index..=last_coarse_index].iter_mut(), + local_weight_contribution[first_coarse_index..=last_coarse_index].iter_mut(), global_coarse_tree[first_coarse_index..=last_coarse_index].iter() ) { *w += linear_keys @@ -244,14 +234,18 @@ pub fn block_partition( // We now need to sum up the weights across all processes. 
- let mut weights = vec![0 as usize; global_coarse_tree.len()]; + let mut global_weights = vec![0 as usize; global_coarse_tree.len()]; - comm.all_reduce_into(&local_weights, &mut weights, SystemOperation::sum()); + comm.all_reduce_into( + &local_weight_contribution, + &mut global_weights, + SystemOperation::sum(), + ); // Each process now has all weights. However, we only need the ones for the current process. // So we just filter the rest out. - let weights = izip!(coarse_tree_ranks, weights) + izip!(coarse_tree_ranks, global_weights) .filter_map(|(r, weight)| { if r == rank as usize { Some(weight) @@ -259,28 +253,19 @@ pub fn block_partition( None } }) - .collect_vec(); - - let coarse_tree = partition(&coarse_tree, &weights, comm); - - ( - redistribute_with_respect_to_coarse_tree(&linear_keys, &coarse_tree, comm), - coarse_tree, - ) - - // We now need to redistribute the global tree according to the coarse tree. + .collect_vec() } /// Redistribute sorted keys with respect to a linear coarse tree. pub fn redistribute_with_respect_to_coarse_tree( - sorted_keys: &[MortonKey], + linear_keys: &[MortonKey], coarse_tree: &[MortonKey], comm: &C, ) -> Vec { let size = comm.size(); if size == 1 { - return sorted_keys.to_vec(); + return linear_keys.to_vec(); } // We want to globally redistribute keys so that the keys on each process are descendents @@ -301,51 +286,31 @@ pub fn redistribute_with_respect_to_coarse_tree( // This will store for each rank how many keys will be assigned to it. - let rank_counts = sort_to_bins(sorted_keys, &global_bins) + let rank_counts = sort_to_bins(linear_keys, &global_bins) .iter() .map(|&elem| elem as i32) .collect_vec(); // We now have the counts for each rank. Let's redistribute accordingly and return. - redistribute(&sorted_keys, &rank_counts, comm) -} + let result = redistribute(&linear_keys, &rank_counts, comm); + + #[cfg(debug_assertions)] + { + // Check through that the first and last key of result are descendents + // of the first and last coarse bloack. + debug_assert!(coarse_tree + .first() + .unwrap() + .is_ancestor(*result.first().unwrap())); + debug_assert!(coarse_tree + .last() + .unwrap() + .is_ancestor(*result.last().unwrap())); + } -// /// Create bins from sorted keys. -// pub fn sort_to_bins(sorted_keys: &[MortonKey], bins: &[MortonKey]) -> Vec { -// let mut bin_counts = vec![0 as usize; bins.len() - 1]; - -// // This iterates over each possible bin and returns also the associated rank. -// let mut bin_iter = izip!( -// bin_counts.iter_mut(), -// bins.iter().tuple_windows::<(&MortonKey, &MortonKey)>(), -// ); - -// // We take the first element of the bin iterator. There will always be at least one. -// let mut r: &mut usize; -// let mut bin_start: &MortonKey; -// let mut bin_end: &MortonKey; -// (r, (bin_start, bin_end)) = bin_iter.next().unwrap(); - -// for &key in sorted_keys.iter() { -// if *bin_start <= key && key < *bin_end { -// *r += 1; -// } else { -// // Move the bin forward until it fits. There will always be a fitting bin. -// while let Some((rn, (bsn, ben))) = bin_iter.next() { -// if *bsn <= key && key < *ben { -// *rn += 1; -// r = rn; -// bin_start = bsn; -// bin_end = ben; -// break; -// } -// } -// } -// } - -// bin_counts -// } + result +} /// Return a complete tree generated from local keys and associated coarse keys. /// @@ -394,7 +359,7 @@ pub fn create_local_tree( } } - new_coarse_keys.to_vec() + new_coarse_keys } /// Linearize a set of weighted Morton keys. 
@@ -452,7 +417,7 @@ pub fn linearize( } /// Balance a sorted list of Morton keys across processors given an array of corresponding weights. -pub fn partition( +pub fn load_balance( sorted_keys: &[MortonKey], weights: &[usize], comm: &C, @@ -508,35 +473,6 @@ pub fn partition( .map(|elem| *elem as i32) .collect_vec(); - // for p in 1..=size as usize { - // let q = if p <= k as usize { - // izip!(sorted_keys, &scan) - // .filter_map(|(&key, &s)| { - // if ((p - 1) * (1 + w) <= s && s < p * (w + 1)) - // || (p == size as usize && (p - 1) * (1 + w) <= s) - // { - // Some(key) - // } else { - // None - // } - // }) - // .collect_vec() - // } else { - // izip!(sorted_keys, &scan) - // .filter_map(|(&key, &s)| { - // if ((p - 1) * w + k <= s && s < p * w + k) - // || (p == size as usize && (p - 1) * w + k <= s) - // { - // Some(key) - // } else { - // None - // } - // }) - // .collect_vec() - // }; - // hash_map.insert(p - 1, q); - // } - // Now distribute the data with an all to all v. // We create a vector of how many elements to send to each process and // then send the actual data. From 887dfcf2bc85df8356717ca6c7e500e8c74adf1d Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Sun, 22 Sep 2024 23:41:32 +0100 Subject: [PATCH 29/42] WIP: distributed balanced tree --- examples/mpi_coarse_tree.rs | 28 ++++++++++++++++++++-------- src/octree.rs | 25 +++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs index ef0095c..5874fe0 100644 --- a/examples/mpi_coarse_tree.rs +++ b/examples/mpi_coarse_tree.rs @@ -3,8 +3,9 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, octree::{ - complete_tree, compute_coarse_tree, compute_coarse_tree_weights, is_complete_linear_tree, - linearize, load_balance, points_to_morton, redistribute_with_respect_to_coarse_tree, + complete_tree, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, + is_complete_linear_tree, linearize, load_balance, points_to_morton, + redistribute_with_respect_to_coarse_tree, }, tools::global_size, }; @@ -61,15 +62,16 @@ pub fn main() { // Now load balance the coarse tree - let balanced_keys = load_balance(&coarse_tree, &weights, &comm); + let load_balanced_coarse_keys = load_balance(&coarse_tree, &weights, &comm); // Compute the weights of the balanced keys - let balanced_weights = compute_coarse_tree_weights(&linear_keys, &balanced_keys, &comm); + let load_balanced_weights = + compute_coarse_tree_weights(&linear_keys, &load_balanced_coarse_keys, &comm); let mut global_balanced_weight: usize = 0; comm.all_reduce_into( - &(balanced_weights.iter().sum::()), + &(load_balanced_weights.iter().sum::()), &mut global_balanced_weight, SystemOperation::sum(), ); @@ -81,15 +83,25 @@ pub fn main() { // Now compute the new fine keys. 
- let redistributed_fine_keys = - redistribute_with_respect_to_coarse_tree(&linear_keys, &balanced_keys, &comm); + let load_balanced_fine_keys = + redistribute_with_respect_to_coarse_tree(&linear_keys, &load_balanced_coarse_keys, &comm); assert_eq!( - global_size(&redistributed_fine_keys, &comm), + global_size(&load_balanced_fine_keys, &comm), global_size(&linear_keys, &comm) ); + let refined_tree = + create_local_tree(&load_balanced_fine_keys, &load_balanced_coarse_keys, 6, 100); + if comm.rank() == 0 { + println!("Coarse tree has {} keys.", load_balanced_coarse_keys.len()); + println!("Refined tree has {} keys.", refined_tree.len()); + } + + assert!(is_complete_linear_tree(&refined_tree, &comm)); + + if comm.rank() == 1 { println!("Coarse tree successfully created and weights computed."); } } diff --git a/src/octree.rs b/src/octree.rs index ca5261d..54d8168 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -316,6 +316,7 @@ pub fn redistribute_with_respect_to_coarse_tree( /// /// The coarse keys are refined until the maximum level is reached or until each coarse key /// is the ancestor of at most `max_keys` fine keys. +/// It is assumed that the level of the fine keys is at least as large as `max_level`. pub fn create_local_tree( sorted_fine_keys: &[MortonKey], coarse_keys: &[MortonKey], @@ -327,8 +328,7 @@ pub fn create_local_tree( } // We split the sorted fine keys into subslices so that each subslice - // is associated with a coarse slice. For this we need to add an upper bound - // coarse keys to ensure that we have suitable bins. + // is associated with a coarse slice. let bins = coarse_keys.to_vec(); @@ -413,6 +413,8 @@ pub fn linearize( } } + debug_assert!(is_linear_tree(&result, comm)); + result } @@ -655,3 +657,22 @@ pub fn is_complete_linear_tree(arr: &[MortonKey], co result } + +/// Return the deepest level of a distributed list of Morton keys. +pub fn deepest_level(keys: &[MortonKey], comm: &C) -> usize { + let local_deepest_level = keys.iter().map(|elem| elem.level()).max().unwrap(); + + if comm.size() == 1 { + return local_deepest_level; + } + + let mut global_deepest_level: usize = 0; + + comm.all_reduce_into( + &local_deepest_level, + &mut global_deepest_level, + SystemOperation::max(), + ); + + global_deepest_level +} From a39ef8e49ba44d4724bcd3ab078214989ee9ff74 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Mon, 23 Sep 2024 00:43:29 +0100 Subject: [PATCH 30/42] Balancing implemented --- examples/mpi_coarse_tree.rs | 18 +++++--- src/morton.rs | 4 ++ src/octree.rs | 83 ++++++++++++++++++++++++++++++++++++- 3 files changed, 98 insertions(+), 7 deletions(-) diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs index 5874fe0..db3c672 100644 --- a/examples/mpi_coarse_tree.rs +++ b/examples/mpi_coarse_tree.rs @@ -3,7 +3,7 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, octree::{ - complete_tree, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, + balance, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, is_complete_linear_tree, linearize, load_balance, points_to_morton, redistribute_with_respect_to_coarse_tree, }, @@ -94,13 +94,19 @@ pub fn main() { let refined_tree = create_local_tree(&load_balanced_fine_keys, &load_balanced_coarse_keys, 6, 100); - if comm.rank() == 0 { - println!("Coarse tree has {} keys.", load_balanced_coarse_keys.len()); - println!("Refined tree has {} keys.", refined_tree.len()); - } - assert!(is_complete_linear_tree(&refined_tree, &comm)); + // Now balance the tree. 
+ + let balanced_tree = balance(&refined_tree, &mut rng, &comm); + + // redistribute the balanced tree according to coarse tree + + let balanced_tree = + redistribute_with_respect_to_coarse_tree(&balanced_tree, &load_balanced_coarse_keys, &comm); + + assert!(is_complete_linear_tree(&balanced_tree, &comm)); + if comm.rank() == 1 { println!("Coarse tree successfully created and weights computed."); } diff --git a/src/morton.rs b/src/morton.rs index 43e4aa9..95ee563 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -389,6 +389,10 @@ impl MortonKey { let mut result = [MortonKey::default(); 26]; let (level, [x, y, z]) = self.decode(); + + if level == 0 { + return result; + } let level_size = 1 << level; for (direction, res) in izip!(DIRECTIONS, result.iter_mut()) { diff --git a/src/octree.rs b/src/octree.rs index 54d8168..ebc8ea1 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,11 +1,16 @@ //! Parallel Octree structure +use std::collections::HashSet; + use crate::{ constants::{DEEPEST_LEVEL, NSIBLINGS}, geometry::PhysicalBox, morton::MortonKey, parsort::parsort, - tools::{communicate_back, gather_to_all, global_inclusive_cumsum, redistribute, sort_to_bins}, + tools::{ + communicate_back, gather_to_all, gather_to_root, global_inclusive_cumsum, redistribute, + sort_to_bins, + }, }; use mpi::traits::Root; @@ -553,6 +558,64 @@ pub fn complete_tree( result } +/// Balance a distributed tree. +pub fn balance( + linear_keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let deepest_level = deepest_level(linear_keys, comm); + + // Start with keys at deepest level + let mut work_list = linear_keys + .iter() + .copied() + .filter(|&key| key.level() == deepest_level) + .collect_vec(); + + let mut result = Vec::::new(); + + // Now go through and make sure that for each key siblings and neighbours of parents are added + + for level in (1..=deepest_level).rev() { + let mut parents = HashSet::::new(); + let mut new_work_list = Vec::::new(); + // We filter the work list by level and also make sure that + // only one sibling of each of the parents children is added to + // our current level list. + for key in work_list.iter() { + let parent = key.parent(); + if !parents.contains(&parent) { + parents.insert(parent); + result.extend_from_slice(key.siblings().as_slice()); + new_work_list.extend_from_slice( + parent + .neighbours() + .iter() + .copied() + .filter(|&key| key.is_valid()) + .collect_vec() + .as_slice(), + ); + } + } + new_work_list.extend( + linear_keys + .iter() + .copied() + .filter(|&key| key.level() == level - 1), + ); + + work_list = new_work_list; + // Now extend the work list with the + } + + let result = linearize(&result, rng, comm); + + debug_assert!(is_complete_linear_and_balanced(&result, comm)); + result +} + /// Return true if the keys are linear. pub fn is_linear_tree(arr: &[MortonKey], comm: &C) -> bool { let mut is_linear = true; @@ -676,3 +739,21 @@ pub fn deepest_level(keys: &[MortonKey], comm: &C) - global_deepest_level } + +/// Check if tree is balanced. +pub fn is_complete_linear_and_balanced( + arr: &[MortonKey], + comm: &C, +) -> bool { + // Send the tree to the root node and check there that it is balanced. 
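+    // `gather_to_root` returns `Some` only on the root rank; the root runs the serial
+    // check and the result is then broadcast from rank 0 so that all ranks agree.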
+ + let mut balanced = false; + + if let Some(arr) = gather_to_root(arr, comm) { + balanced = MortonKey::is_complete_linear_and_balanced(&arr); + } + + comm.process_at_rank(0).broadcast_into(&mut balanced); + + balanced +} From 5a8a8b75ab76733bb4d5f54e13d35a42839c14ce Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Mon, 23 Sep 2024 22:44:12 +0100 Subject: [PATCH 31/42] Added distribution of points to correct ranks. --- examples/mpi_coarse_tree.rs | 40 +++++++++++--- examples/mpi_complete_tree.rs | 7 +-- examples/mpi_global_bounding_box.rs | 10 ++-- src/geometry.rs | 36 ++++++++++--- src/morton.rs | 14 ++--- src/octree.rs | 83 ++++++++++++++++++++++++----- src/serial.rs | 20 ++++--- src/tools.rs | 20 +++++++ 8 files changed, 173 insertions(+), 57 deletions(-) diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs index db3c672..a79626d 100644 --- a/examples/mpi_coarse_tree.rs +++ b/examples/mpi_coarse_tree.rs @@ -2,12 +2,13 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, + morton::MortonKey, octree::{ balance, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, is_complete_linear_tree, linearize, load_balance, points_to_morton, - redistribute_with_respect_to_coarse_tree, + redistribute_points_with_respect_to_coarse_tree, redistribute_with_respect_to_coarse_tree, }, - tools::global_size, + tools::{communicate_back, generate_random_points, global_size, is_sorted_array}, }; use mpi::{collective::SystemOperation, traits::*}; use rand::prelude::*; @@ -28,11 +29,7 @@ pub fn main() { // Generate random points. - let mut points = Vec::::with_capacity(3 * npoints); - - for _ in 0..3 * npoints { - points.push(rng.gen()); - } + let points = generate_random_points(npoints, &mut rng, &comm); // Compute the Morton keys on the deepest level let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); @@ -107,7 +104,34 @@ pub fn main() { assert!(is_complete_linear_tree(&balanced_tree, &comm)); - if comm.rank() == 1 { + // Redistribute original keys and points with respect to balanced coarse tree. + + let (balanced_points, balanced_keys) = redistribute_points_with_respect_to_coarse_tree( + &points, + &keys, + &load_balanced_coarse_keys, + &comm, + ); + + let upper_bound; + + if let Some(next_key) = communicate_back(&load_balanced_coarse_keys, &comm) { + upper_bound = next_key; + } else { + upper_bound = MortonKey::upper_bound(); + } + + assert!(load_balanced_coarse_keys.first().unwrap() <= balanced_keys.first().unwrap()); + assert!(*balanced_keys.last().unwrap() < upper_bound); + assert!(is_sorted_array(&balanced_keys, &comm)); + + println!( + "Rank {} has {} balanced points.", + comm.rank(), + balanced_points.len(), + ); + + if comm.rank() == 0 { println!("Coarse tree successfully created and weights computed."); } } diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index 052b214..d47fc6a 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -3,6 +3,7 @@ use bempp_octree::{ constants::DEEPEST_LEVEL, octree::{complete_tree, is_complete_linear_tree, linearize, points_to_morton}, + tools::generate_random_points, }; use mpi::traits::*; use rand::prelude::*; @@ -23,11 +24,7 @@ pub fn main() { // Generate random points. 
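    // `generate_random_points` returns `Point` values that carry both the coordinates and a
    // global id (`npoints * rank + index`).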
- let mut points = Vec::::with_capacity(3 * npoints); - - for _ in 0..3 * npoints { - points.push(rng.gen()); - } + let points = generate_random_points(npoints, &mut rng, &comm); // Compute the Morton keys on the deepest level let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); diff --git a/examples/mpi_global_bounding_box.rs b/examples/mpi_global_bounding_box.rs index 330a168..144d0e5 100644 --- a/examples/mpi_global_bounding_box.rs +++ b/examples/mpi_global_bounding_box.rs @@ -1,7 +1,9 @@ //! Test the computation of a global bounding box across MPI ranks. use bempp_octree::{ - geometry::PhysicalBox, octree::compute_global_bounding_box, tools::gather_to_root, + geometry::PhysicalBox, + octree::compute_global_bounding_box, + tools::{gather_to_root, generate_random_points}, }; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -21,11 +23,7 @@ pub fn main() { // Generate random points. - let mut points = Vec::::with_capacity(3 * npoints); - - for _ in 0..3 * npoints { - points.push(rng.gen()); - } + let points = generate_random_points(npoints, &mut rng, &comm); // Compute the distributed bounding box. diff --git a/src/geometry.rs b/src/geometry.rs index 303fdf3..544e4d5 100644 --- a/src/geometry.rs +++ b/src/geometry.rs @@ -1,9 +1,33 @@ //! Geometry information -use bytemuck; +use mpi::traits::Equivalence; use crate::constants::DEEPEST_LEVEL; +/// Definition of a point. +#[derive(Clone, Copy, Equivalence)] +pub struct Point { + coords: [f64; 3], + global_id: usize, +} + +impl Point { + /// Create a new point from coordinates and global id. + pub fn new(coords: [f64; 3], global_id: usize) -> Self { + Self { coords, global_id } + } + + /// Return the coordintes of a point. + pub fn coords(&self) -> [f64; 3] { + self.coords + } + + /// Return the global id of the point. + pub fn global_id(&self) -> usize { + self.global_id + } +} + /// A bounding box describes geometry in which an Octree lives. pub struct PhysicalBox { coords: [f64; 6], @@ -18,11 +42,9 @@ impl PhysicalBox { } /// Give a slice of points. Compute an associated bounding box. - pub fn from_points(points: &[f64]) -> PhysicalBox { + pub fn from_points(points: &[Point]) -> PhysicalBox { assert_eq!(points.len() % 3, 0); - let points: &[[f64; 3]] = bytemuck::cast_slice(points); - let mut xmin = f64::MAX; let mut xmax = f64::MIN; @@ -33,9 +55,9 @@ impl PhysicalBox { let mut zmax = f64::MIN; for point in points { - let x = point[0]; - let y = point[1]; - let z = point[2]; + let x = point.coords()[0]; + let y = point.coords()[1]; + let z = point.coords()[2]; xmin = f64::min(xmin, x); xmax = f64::max(xmax, x); diff --git a/src/morton.rs b/src/morton.rs index 95ee563..5fee9a3 100644 --- a/src/morton.rs +++ b/src/morton.rs @@ -5,7 +5,7 @@ use crate::constants::{ LEVEL_SIZE, NINE_BIT_MASK, NSIBLINGS, X_LOOKUP_DECODE, X_LOOKUP_ENCODE, Y_LOOKUP_DECODE, Y_LOOKUP_ENCODE, Z_LOOKUP_DECODE, Z_LOOKUP_ENCODE, }; -use crate::geometry::PhysicalBox; +use crate::geometry::{PhysicalBox, Point}; use itertools::izip; use itertools::Itertools; use mpi::traits::Equivalence; @@ -155,9 +155,9 @@ impl MortonKey { /// Map a physical point within a bounding box to a Morton key on a given level. /// It is assumed that points are strictly contained within the bounding box. 
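    /// The point is first mapped to reference coordinates in the unit cube and then scaled
    /// by 2^level to obtain the integer octant indices from which the key is built.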
- pub fn from_physical_point(point: [f64; 3], bounding_box: &PhysicalBox, level: usize) -> Self { + pub fn from_physical_point(point: Point, bounding_box: &PhysicalBox, level: usize) -> Self { let level_size = 1 << level; - let reference = bounding_box.physical_to_reference(point); + let reference = bounding_box.physical_to_reference(point.coords()); let x = (reference[0] * level_size as f64) as usize; let y = (reference[1] * level_size as f64) as usize; let z = (reference[2] * level_size as f64) as usize; @@ -1284,7 +1284,7 @@ mod test { pub fn test_from_physical_point() { let bounding_box = PhysicalBox::new([-2.0, -3.0, -1.0, 4.0, 5.0, 6.0]); - let point = [1.5, -2.5, 5.0]; + let point = Point::new([1.5, -2.5, 5.0], 0); let level = 10; let key = MortonKey::from_physical_point(point, &bounding_box, level); @@ -1293,9 +1293,9 @@ mod test { let coords = physical_box.coordinates(); - assert!(coords[0] <= point[0] && point[0] < coords[3]); - assert!(coords[1] <= point[1] && point[1] < coords[4]); - assert!(coords[2] <= point[2] && point[2] < coords[5]); + assert!(coords[0] <= point.coords()[0] && point.coords()[0] < coords[3]); + assert!(coords[1] <= point.coords()[1] && point.coords()[1] < coords[4]); + assert!(coords[2] <= point.coords()[2] && point.coords()[2] < coords[5]); // Now compute the box. } diff --git a/src/octree.rs b/src/octree.rs index ebc8ea1..ccae541 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -4,7 +4,7 @@ use std::collections::HashSet; use crate::{ constants::{DEEPEST_LEVEL, NSIBLINGS}, - geometry::PhysicalBox, + geometry::{PhysicalBox, Point}, morton::MortonKey, parsort::parsort, tools::{ @@ -21,12 +21,10 @@ use rand::Rng; /// Compute the global bounding box across all points on all processes. pub fn compute_global_bounding_box( - points: &[f64], + points: &[Point], comm: &C, ) -> PhysicalBox { // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); - let points: &[[f64; 3]] = bytemuck::cast_slice(points); // Now compute the minimum and maximum across each dimension. @@ -40,9 +38,9 @@ pub fn compute_global_bounding_box( let mut zmax = f64::MIN; for point in points { - let x = point[0]; - let y = point[1]; - let z = point[2]; + let x = point.coords()[0]; + let y = point.coords()[1]; + let z = point.coords()[2]; xmin = f64::min(xmin, x); xmax = f64::max(xmax, x); @@ -102,13 +100,10 @@ pub fn compute_global_bounding_box( /// Convert points to Morton keys on specified level. pub fn points_to_morton( - points: &[f64], + points: &[Point], max_level: usize, comm: &C, ) -> (Vec, PhysicalBox) { - // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); - // Make sure that max level never exceeds DEEPEST_LEVEL let max_level = if max_level > DEEPEST_LEVEL as usize { DEEPEST_LEVEL as usize @@ -122,8 +117,6 @@ pub fn points_to_morton( // Bunch the points in arrays of 3. 
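    // Each point is mapped independently to the Morton key of the box on `max_level`
    // that contains it.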
- let points: &[[f64; 3]] = bytemuck::cast_slice(points); - let keys = points .iter() .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) @@ -651,6 +644,70 @@ pub fn is_linear_tree(arr: &[MortonKey], comm: &C) - global_is_linear } +/// Redistribute points with respect to a given coarse tree +pub fn redistribute_points_with_respect_to_coarse_tree( + points: &[Point], + morton_keys_for_points: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> (Vec, Vec) { + pub fn argsort(arr: &[T]) -> Vec { + let mut sort_indices = (0..arr.len()).collect_vec(); + sort_indices.sort_unstable_by_key(|&index| arr[index]); + sort_indices + } + + pub fn reorder(arr: &[T], permutation: &[usize]) -> Vec { + let mut reordered = Vec::::with_capacity(arr.len()); + for &index in permutation.iter() { + reordered.push(arr[index]) + } + reordered + } + + assert_eq!(points.len(), morton_keys_for_points.len()); + + let size = comm.size(); + + if size == 1 { + return (points.to_vec(), morton_keys_for_points.to_vec()); + } + + let sort_indices = argsort(&morton_keys_for_points); + let sorted_keys = reorder(&morton_keys_for_points, &sort_indices); + let sorted_points = reorder(&points, &sort_indices); + + // Now get the bins + + let my_first = coarse_tree.first().unwrap(); + + let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); + + // We now sort the morton indices into the bins. + + // This will store for each rank how many keys will be assigned to it. + + let counts = sort_to_bins(&sorted_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); + + // We now redistribute the points and the corresponding keys. + + let (distributed_points, distributed_keys) = ( + redistribute(&sorted_points, &counts, comm), + redistribute(&sorted_keys, &counts, comm), + ); + + // Now sort the distributed points and keys internally again. + + let sort_indices = argsort(&distributed_keys); + let sorted_keys = reorder(&distributed_keys, &sort_indices); + let sorted_points = reorder(&distributed_points, &sort_indices); + + (sorted_points, sorted_keys) +} + /// Return true on all ranks if distributed tree is complete. Otherwise, return false. pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { // First check that the local tree on each node is complete. diff --git a/src/serial.rs b/src/serial.rs index 64cdb08..d32b3ac 100644 --- a/src/serial.rs +++ b/src/serial.rs @@ -2,10 +2,9 @@ use crate::{ constants::{DEEPEST_LEVEL, NLEVELS}, - geometry::PhysicalBox, + geometry::{PhysicalBox, Point}, morton::MortonKey, }; -use bytemuck; use std::collections::HashMap; use vtkio; @@ -22,7 +21,7 @@ pub struct Neighbour { /// An octree pub struct Octree { leaf_keys: Vec, - points: Vec<[f64; 3]>, + points: Vec, point_to_level_keys: [Vec; NLEVELS], bounding_box: PhysicalBox, key_counts: HashMap, @@ -32,7 +31,7 @@ pub struct Octree { impl Octree { /// Create octress from points - pub fn from_points(points: &[f64], max_level: usize, max_points_per_box: usize) -> Self { + pub fn from_points(points: &[Point], max_level: usize, max_points_per_box: usize) -> Self { // Make sure that the points array is a multiple of 3. assert_eq!(points.len() % 3, 0); @@ -49,7 +48,6 @@ impl Octree { // Bunch the points in arrays of 3. - let points: &[[f64; 3]] = bytemuck::cast_slice(points); let npoints = points.len(); // We create a vector of keys for each point on each level. 
We compute the @@ -160,7 +158,7 @@ impl Octree { } /// Points - pub fn points(&self) -> &Vec<[f64; 3]> { + pub fn points(&self) -> &Vec { &self.points } @@ -264,14 +262,16 @@ impl Octree { #[cfg(test)] mod test { + use crate::geometry::Point; + use super::Octree; use rand::prelude::*; - fn get_points_on_sphere(npoints: usize) -> Vec { + fn get_points_on_sphere(npoints: usize) -> Vec { let mut rng = rand::rngs::StdRng::seed_from_u64(0); let normal = rand_distr::Normal::new(0.0, 1.0).unwrap(); - let mut points = Vec::::with_capacity(3 * npoints); + let mut points = Vec::::with_capacity(npoints); for _ in 0..(npoints) { let x: f64 = normal.sample(&mut rng); let y: f64 = normal.sample(&mut rng); @@ -279,9 +279,7 @@ mod test { let norm = (x * x + y * y + z * z).sqrt(); - points.push(x / norm); - points.push(y / norm); - points.push(z / norm); + points.push(Point::new([x / norm, y / norm, z / norm], 0)); } points diff --git a/src/tools.rs b/src/tools.rs index 075804a..56e0905 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -12,6 +12,7 @@ use rand::Rng; use crate::{ constants::{DEEPEST_LEVEL, LEVEL_SIZE}, + geometry::Point, morton::MortonKey, }; @@ -372,6 +373,25 @@ pub fn generate_random_keys(nkeys: usize, rng: &mut R) -> Vec result } +/// Generate random points for testing. +pub fn generate_random_points( + npoints: usize, + rng: &mut R, + comm: &C, +) -> Vec { + let mut points = Vec::::with_capacity(npoints); + let rank = comm.rank() as usize; + + for index in 0..npoints { + points.push(Point::new( + [rng.gen(), rng.gen(), rng.gen()], + npoints * rank + index, + )); + } + + points +} + /// Compute displacements from a vector of counts. /// /// This is useful for global MPI varcount operations. Let From a9caa553e2a742ed8265f3a72753b929be6b10fd Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Tue, 24 Sep 2024 23:11:24 +0100 Subject: [PATCH 32/42] Interior node topology --- src/geometry.rs | 2 - src/octree.rs | 863 +++++--------------------------------- src/octree/parallel.rs | 912 +++++++++++++++++++++++++++++++++++++++++ src/serial.rs | 1 - src/tools.rs | 8 +- 5 files changed, 1016 insertions(+), 770 deletions(-) create mode 100644 src/octree/parallel.rs diff --git a/src/geometry.rs b/src/geometry.rs index 544e4d5..62b9753 100644 --- a/src/geometry.rs +++ b/src/geometry.rs @@ -43,8 +43,6 @@ impl PhysicalBox { /// Give a slice of points. Compute an associated bounding box. pub fn from_points(points: &[Point]) -> PhysicalBox { - assert_eq!(points.len() % 3, 0); - let mut xmin = f64::MAX; let mut xmax = f64::MIN; diff --git a/src/octree.rs b/src/octree.rs index ccae541..6f175cc 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,816 +1,147 @@ -//! Parallel Octree structure - +pub mod parallel; use std::collections::HashSet; +use mpi::traits::CommunicatorCollectives; +pub use parallel::*; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; + use crate::{ - constants::{DEEPEST_LEVEL, NSIBLINGS}, + constants::DEEPEST_LEVEL, geometry::{PhysicalBox, Point}, morton::MortonKey, - parsort::parsort, - tools::{ - communicate_back, gather_to_all, gather_to_root, global_inclusive_cumsum, redistribute, - sort_to_bins, - }, }; -use mpi::traits::Root; - -use itertools::{izip, Itertools}; -use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; -use rand::Rng; - -/// Compute the global bounding box across all points on all processes. -pub fn compute_global_bounding_box( - points: &[Point], - comm: &C, -) -> PhysicalBox { - // Make sure that the points array is a multiple of 3. 
- - // Now compute the minimum and maximum across each dimension. - - let mut xmin = f64::MAX; - let mut xmax = f64::MIN; - - let mut ymin = f64::MAX; - let mut ymax = f64::MIN; - - let mut zmin = f64::MAX; - let mut zmax = f64::MIN; - - for point in points { - let x = point.coords()[0]; - let y = point.coords()[1]; - let z = point.coords()[2]; - - xmin = f64::min(xmin, x); - xmax = f64::max(xmax, x); - - ymin = f64::min(ymin, y); - ymax = f64::max(ymax, y); - - zmin = f64::min(zmin, z); - zmax = f64::max(zmax, z); - } - - let mut global_xmin = 0.0; - let mut global_xmax = 0.0; - - let mut global_ymin = 0.0; - let mut global_ymax = 0.0; - - let mut global_zmin = 0.0; - let mut global_zmax = 0.0; - - comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); - comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); - - comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); - comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); - - comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); - comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); - - let xdiam = global_xmax - global_xmin; - let ydiam = global_ymax - global_ymin; - let zdiam = global_zmax - global_zmin; - - let xmean = global_xmin + 0.5 * xdiam; - let ymean = global_ymin + 0.5 * ydiam; - let zmean = global_zmin + 0.5 * zdiam; - - // We increase diameters by box size on deepest level - // and use the maximum diameter to compute a - // cubic bounding box. - - let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; - - let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); - - let max_diam = max_diam * (1.0 + deepest_box_diam); - - PhysicalBox::new([ - xmean - 0.5 * max_diam, - ymean - 0.5 * max_diam, - zmean - 0.5 * max_diam, - xmean + 0.5 * max_diam, - ymean + 0.5 * max_diam, - zmean + 0.5 * max_diam, - ]) -} - -/// Convert points to Morton keys on specified level. -pub fn points_to_morton( - points: &[Point], - max_level: usize, - comm: &C, -) -> (Vec, PhysicalBox) { - // Make sure that max level never exceeds DEEPEST_LEVEL - let max_level = if max_level > DEEPEST_LEVEL as usize { - DEEPEST_LEVEL as usize - } else { - max_level - }; - - // Compute the physical bounding box. - - let bounding_box = compute_global_bounding_box(points, comm); - - // Bunch the points in arrays of 3. - - let keys = points - .iter() - .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) - .collect_vec(); - - (keys, bounding_box) +/// A general structure for octrees. +pub struct Octree<'o, C> { + points: Vec, + point_keys: Vec, + coarse_tree: Vec, + leaf_tree: Vec, + coarse_tree_bounds: Vec, + bounding_box: PhysicalBox, + comm: &'o C, } -/// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree. -/// The returned coarse tree is load balanced according to the number of linear keys in each coarse block. -pub fn compute_coarse_tree( - linear_keys: &[MortonKey], - comm: &C, -) -> Vec { - let size = comm.size(); +impl<'o, C: CommunicatorCollectives> Octree<'o, C> { + /// Create a new distributed Octree. + pub fn new(points: &[Point], max_level: usize, max_leaf_points: usize, comm: &'o C) -> Self { + // We need a random number generator for sorting. For simplicity we use a ChaCha8 random number generator + // seeded with the rank of the process. 
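+        // Seeding with the rank keeps runs reproducible while giving each rank its own
+        // random stream for use in `parsort`.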
+ let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); - debug_assert!(is_linear_tree(linear_keys, comm)); + // First compute the Morton keys of the points. + let (point_keys, bounding_box) = points_to_morton(points, DEEPEST_LEVEL as usize, comm); - // On a single node a complete coarse tree is simply the root. - if size == 1 { - return vec![MortonKey::root()]; - } - - let mut completed_region = linear_keys - .first() - .unwrap() - .fill_between_keys(*linear_keys.last().unwrap()); + // Generate the coarse tree - completed_region.insert(0, *linear_keys.first().unwrap()); - completed_region.push(*linear_keys.last().unwrap()); + let (coarse_tree, leaf_tree) = { + // Linearize the keys. + let linear_keys = linearize(&point_keys, &mut rng, comm); - // Get the smallest level members of the completed region. + // Compute the first version of the coarse tree without load balancing. + let coarse_tree = compute_coarse_tree(&linear_keys, comm); + debug_assert!(is_complete_linear_tree(&coarse_tree, comm)); - let min_level = completed_region - .iter() - .map(|elem| elem.level()) - .min() - .unwrap(); + // We now compute the weights for the initial coarse tree. - // Each process selects its largest boxes. These are used to create - // a coarse tree. + let weights = compute_coarse_tree_weights(&linear_keys, &coarse_tree, comm); - let largest_boxes = completed_region - .iter() - .filter(|elem| elem.level() == min_level) - .copied() - .collect_vec(); + // We now load balance the initial coarse tree. This forms our final coarse tree + // that is used from now on. - debug_assert!(is_linear_tree(&largest_boxes, comm)); + let coarse_tree = load_balance(&coarse_tree, &weights, comm); - complete_tree(&largest_boxes, comm) -} - -/// Compute the weights of each coarse tree block as the number of linear keys associated with each coarse block. -pub fn compute_coarse_tree_weights( - linear_keys: &[MortonKey], - coarse_tree: &[MortonKey], - comm: &C, -) -> Vec { - let rank = comm.rank(); - // We want to partition the coarse tree. But we need the correct weights. The idea - // is that we use the number of original leafs that intersect with the coarse tree - // as leafs. In order to compute this we send the coarse tree around to all processes - // so that each process computes for each coarse tree element how many of its keys - // intersect with each node of the coarse tree. We then sum up the local weight for each - // coarse tree node across all nodes to get the weight. - - let global_coarse_tree = gather_to_all(&coarse_tree, comm); - - // We also want to send around a corresponding array of ranks so that for each global coarse tree key - // we have the rank of where it originates from. - - let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); - - // We now compute the local weights. - let mut local_weight_contribution = vec![0 as usize; global_coarse_tree.len()]; - - // In the following loop we want to be a bit smart. We do not iterate through all the local elements. - // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region - // of our sorted keys that overlaps with the coarse tree region. - - // Let's find the start of our region. The start of our region is a coarse key that is an ancestor - // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. 
- - let first_key = *linear_keys.first().unwrap(); - - let first_coarse_index = global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) - .count(); - - // Now we need to find the end index of our region. For this again we find the index of our coarse tree that - // is an ancestor of our last key. - let last_key = *linear_keys.last().unwrap(); - - let last_coarse_index = global_coarse_tree - .iter() - .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) - .count(); - - // We now only need to iterate through between the first and last coarse index in the coarse tree. - // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). - - for (w, &global_coarse_key) in izip!( - local_weight_contribution[first_coarse_index..=last_coarse_index].iter_mut(), - global_coarse_tree[first_coarse_index..=last_coarse_index].iter() - ) { - *w += linear_keys - .iter() - .filter(|&&key| global_coarse_key.is_ancestor(key)) - .count(); - } - - // We now need to sum up the weights across all processes. - - let mut global_weights = vec![0 as usize; global_coarse_tree.len()]; - - comm.all_reduce_into( - &local_weight_contribution, - &mut global_weights, - SystemOperation::sum(), - ); - - // Each process now has all weights. However, we only need the ones for the current process. - // So we just filter the rest out. - - izip!(coarse_tree_ranks, global_weights) - .filter_map(|(r, weight)| { - if r == rank as usize { - Some(weight) - } else { - None - } - }) - .collect_vec() -} - -/// Redistribute sorted keys with respect to a linear coarse tree. -pub fn redistribute_with_respect_to_coarse_tree( - linear_keys: &[MortonKey], - coarse_tree: &[MortonKey], - comm: &C, -) -> Vec { - let size = comm.size(); - - if size == 1 { - return linear_keys.to_vec(); - } + // We also want to redistribute the fine keys with respect to the load balanced coarse trees. - // We want to globally redistribute keys so that the keys on each process are descendents - // of the local coarse tree keys. + let fine_keys = + redistribute_with_respect_to_coarse_tree(&linear_keys, &coarse_tree, comm); - // We are using here the fact that the coarse tree is complete and sorted. - // We are sending around to each process the first local index. This - // defines bins in which we sort our keys. The keys are then sent around to the correct - // processes via an alltoallv operation. + // We now create the refined tree by recursing the coarse tree until we are at max level + // or the fine tree keys per coarse tree box is small enough. + let refined_tree = + create_local_tree(&fine_keys, &coarse_tree, max_level, max_leaf_points); - let my_first = coarse_tree.first().unwrap(); + // We now need to 2:1 balance the refined tree and then redistribute again with respect to the coarse tree. - let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); - - // We now have our bins. We go through our keys and store how - // many keys are assigned to each rank. We are using here that - // our keys and the coarse tree are both sorted. - - // This will store for each rank how many keys will be assigned to it. - - let rank_counts = sort_to_bins(linear_keys, &global_bins) - .iter() - .map(|&elem| elem as i32) - .collect_vec(); - - // We now have the counts for each rank. Let's redistribute accordingly and return. 
- - let result = redistribute(&linear_keys, &rank_counts, comm); - - #[cfg(debug_assertions)] - { - // Check through that the first and last key of result are descendents - // of the first and last coarse bloack. - debug_assert!(coarse_tree - .first() - .unwrap() - .is_ancestor(*result.first().unwrap())); - debug_assert!(coarse_tree - .last() - .unwrap() - .is_ancestor(*result.last().unwrap())); - } - - result -} - -/// Return a complete tree generated from local keys and associated coarse keys. -/// -/// The coarse keys are refined until the maximum level is reached or until each coarse key -/// is the ancestor of at most `max_keys` fine keys. -/// It is assumed that the level of the fine keys is at least as large as `max_level`. -pub fn create_local_tree( - sorted_fine_keys: &[MortonKey], - coarse_keys: &[MortonKey], - mut max_level: usize, - max_keys: usize, -) -> Vec { - if max_level > DEEPEST_LEVEL as usize { - max_level = DEEPEST_LEVEL as usize; - } - - // We split the sorted fine keys into subslices so that each subslice - // is associated with a coarse slice. - - let bins = coarse_keys.to_vec(); - - let counts = sort_to_bins(&sorted_fine_keys, &bins); - - // We now know how many fine keys are associated with each coarse block. We iterate - // through and locally refine for each block that requires it. - - let mut remainder = sorted_fine_keys; - let mut new_coarse_keys = Vec::::new(); - - for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { - let current; - (current, remainder) = remainder.split_at(count); - if coarse_key.level() < max_level && current.len() > max_keys { - // We need to refine the current split. - new_coarse_keys.extend_from_slice( - create_local_tree( - current, - coarse_key.children().as_slice(), - max_level, - max_keys, - ) - .as_slice(), + let refined_tree = redistribute_with_respect_to_coarse_tree( + &balance(&refined_tree, &mut rng, comm), + &coarse_tree, + comm, ); - } else { - new_coarse_keys.push(coarse_key) - } - } - - new_coarse_keys -} - -/// Linearize a set of weighted Morton keys. -pub fn linearize( - keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> Vec { - let size = comm.size(); - let rank = comm.rank(); - - // If we only have one process we use the standard serial linearization. - - if size == 1 { - return MortonKey::linearize(keys); - } - - // We are first sorting the keys. Then in a linear process across all processors we - // go through the arrays and delete ancestors of nodes. - - let sorted_keys = parsort(&keys, comm, rng); - - // Each process needs to send its first element to the previous process. Each process - // then goes through its own list and retains elements that are not ancestors of the - // next element. - - let mut result = Vec::::new(); - - let next_key = communicate_back(&sorted_keys, comm); - - // Treat the local keys - for (&m1, &m2) in sorted_keys.iter().tuple_windows() { - // m1 is also ancestor of m2 if they are identical. - if m1.is_ancestor(m2) { - continue; - } else { - result.push(m1); - } - } - - // If we are at the last process simply push the last key. - // Otherwise check whether it might be the ancestor of `next_key`, - // the first key on the next process. If yes, don't push it. Otherwise do. 
- - if rank == size - 1 { - result.push(*sorted_keys.last().unwrap()); - } else { - let last = *sorted_keys.last().unwrap(); - if !last.is_ancestor(next_key.unwrap()) { - result.push(last); - } - } - - debug_assert!(is_linear_tree(&result, comm)); - - result -} - -/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. -pub fn load_balance( - sorted_keys: &[MortonKey], - weights: &[usize], - comm: &C, -) -> Vec { - assert_eq!(sorted_keys.len(), weights.len()); - - let size = comm.size(); - let rank = comm.rank(); - - // If we only have one process we simply return. - - if size == 1 { - return sorted_keys.to_vec(); - } - - // First scan the weight. - // We scan the local arrays, then use a global scan operation on the last element - // of each array to get the global sums and then we update the array of each rank - // with the sum from the previous ranks. - let scan = global_inclusive_cumsum(&weights, comm); + (coarse_tree, refined_tree) - // Now broadcast the total weight to all processes. + // redistribute the balanced tree according to coarse tree + }; - let mut total_weight = if rank == size - 1 { - *scan.last().unwrap() - } else { - 0 - }; - - comm.process_at_rank(size - 1) - .broadcast_into(&mut total_weight); - - let w = total_weight / (size as usize); - let k = total_weight % (size as usize); - - // Sort the elements into bins according to which process they should be sent. - // We do not need to sort the Morton keys themselves into bins but the scanned weights. - // The corresponding counts are the right counts for the Morton keys. - - let mut bins = Vec::::with_capacity(size as usize); - - for p in 1..=size as usize { - if p <= k { - bins.push((p - 1) * (1 + w)); - } else { - bins.push((p - 1) * w + k); - } - } - - let counts = sort_to_bins(&scan, &bins) - .iter() - .map(|elem| *elem as i32) - .collect_vec(); - - // Now distribute the data with an all to all v. - // We create a vector of how many elements to send to each process and - // then send the actual data. - - let mut recvbuffer = redistribute(&sorted_keys, &counts, comm); - - recvbuffer.sort_unstable(); - recvbuffer -} - -/// Given a distributed set of linear keys, generate a complete tree. -pub fn complete_tree( - linear_keys: &[MortonKey], - comm: &C, -) -> Vec { - let mut linear_keys = linear_keys.to_vec(); - - debug_assert!(is_linear_tree(&linear_keys, comm)); - - let size = comm.size(); - let rank = comm.rank(); - - if size == 1 { - return MortonKey::complete_tree(linear_keys.as_slice()); - } - - // Now insert on the first and last process the first and last child of the - // finest ancestor of first/last box on deepest level - - // Send first element to previous rank and insert into local keys. - // On the first process we also need to insert the first child of the finest - // ancestor of the deepest first key and first element. Correspondingly on the last process - // we need to insert the last child of the finest ancester of the deepest last key and last element. - - let next_key = communicate_back(&linear_keys, comm); - - if rank < size - 1 { - linear_keys.push(next_key.unwrap()); - } - - // Now fix the first key on the first rank. 
- - if rank == 0 { - let first_key = linear_keys.first().unwrap(); - let deepest_first = MortonKey::deepest_first(); - if !first_key.is_ancestor(deepest_first) { - let ancestor = deepest_first.finest_common_ancestor(*first_key); - linear_keys.insert(0, ancestor.children()[0]); - } - } - - if rank == size - 1 { - let last_key = linear_keys.last().unwrap(); - let deepest_last = MortonKey::deepest_last(); - if !last_key.is_ancestor(deepest_last) { - let ancestor = deepest_last.finest_common_ancestor(*last_key); - linear_keys.push(ancestor.children()[NSIBLINGS - 1]); - } - } - - // Now complete the regions defined by the keys on each process. - - let mut result = Vec::::new(); - - for (&key1, &key2) in linear_keys.iter().tuple_windows() { - result.push(key1); - result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); - } - - if rank == size - 1 { - result.push(*linear_keys.last().unwrap()); - } - - debug_assert!(is_complete_linear_tree(&result, comm)); - - result -} - -/// Balance a distributed tree. -pub fn balance( - linear_keys: &[MortonKey], - rng: &mut R, - comm: &C, -) -> Vec { - let deepest_level = deepest_level(linear_keys, comm); - - // Start with keys at deepest level - let mut work_list = linear_keys - .iter() - .copied() - .filter(|&key| key.level() == deepest_level) - .collect_vec(); - - let mut result = Vec::::new(); - - // Now go through and make sure that for each key siblings and neighbours of parents are added - - for level in (1..=deepest_level).rev() { - let mut parents = HashSet::::new(); - let mut new_work_list = Vec::::new(); - // We filter the work list by level and also make sure that - // only one sibling of each of the parents children is added to - // our current level list. - for key in work_list.iter() { - let parent = key.parent(); - if !parents.contains(&parent) { - parents.insert(parent); - result.extend_from_slice(key.siblings().as_slice()); - new_work_list.extend_from_slice( - parent - .neighbours() - .iter() - .copied() - .filter(|&key| key.is_valid()) - .collect_vec() - .as_slice(), - ); - } - } - new_work_list.extend( - linear_keys - .iter() - .copied() - .filter(|&key| key.level() == level - 1), + let (points, point_keys) = redistribute_points_with_respect_to_coarse_tree( + points, + &point_keys, + &coarse_tree, + comm, ); - work_list = new_work_list; - // Now extend the work list with the - } - - let result = linearize(&result, rng, comm); + let coarse_tree_bounds = get_tree_bins(&coarse_tree, comm); - debug_assert!(is_complete_linear_and_balanced(&result, comm)); - result -} - -/// Return true if the keys are linear. -pub fn is_linear_tree(arr: &[MortonKey], comm: &C) -> bool { - let mut is_linear = true; - - for (&key1, &key2) in arr.iter().tuple_windows() { - if key1 >= key2 || key1.is_ancestor(key2) { - is_linear = false; - break; + Self { + points: points.to_vec(), + point_keys, + coarse_tree, + leaf_tree, + coarse_tree_bounds, + bounding_box, + comm, } } - if comm.size() == 1 { - return is_linear; + /// Return the keys associated with the redistributed points. + pub fn point_keys(&self) -> &Vec { + &self.point_keys } - // Now check the interfaces - - if let Some(next_key) = communicate_back(arr, comm) { - let last = *arr.last().unwrap(); - if last >= next_key || last.is_ancestor(next_key) { - is_linear = false; - } + /// Return the bounding box. 
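+    /// This is the global cubic box, computed across all ranks, that encloses every point.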
+ pub fn bounding_box(&self) -> &PhysicalBox { + &self.bounding_box } - let mut global_is_linear = false; - - comm.all_reduce_into( - &is_linear, - &mut global_is_linear, - SystemOperation::logical_and(), - ); - - global_is_linear -} - -/// Redistribute points with respect to a given coarse tree -pub fn redistribute_points_with_respect_to_coarse_tree( - points: &[Point], - morton_keys_for_points: &[MortonKey], - coarse_tree: &[MortonKey], - comm: &C, -) -> (Vec, Vec) { - pub fn argsort(arr: &[T]) -> Vec { - let mut sort_indices = (0..arr.len()).collect_vec(); - sort_indices.sort_unstable_by_key(|&index| arr[index]); - sort_indices - } - - pub fn reorder(arr: &[T], permutation: &[usize]) -> Vec { - let mut reordered = Vec::::with_capacity(arr.len()); - for &index in permutation.iter() { - reordered.push(arr[index]) - } - reordered - } - - assert_eq!(points.len(), morton_keys_for_points.len()); - - let size = comm.size(); - - if size == 1 { - return (points.to_vec(), morton_keys_for_points.to_vec()); + /// Return the associated coarse tree. + pub fn coarse_tree(&self) -> &Vec { + &self.coarse_tree } - let sort_indices = argsort(&morton_keys_for_points); - let sorted_keys = reorder(&morton_keys_for_points, &sort_indices); - let sorted_points = reorder(&points, &sort_indices); - - // Now get the bins - - let my_first = coarse_tree.first().unwrap(); - - let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); - - // We now sort the morton indices into the bins. - - // This will store for each rank how many keys will be assigned to it. - - let counts = sort_to_bins(&sorted_keys, &global_bins) - .iter() - .map(|&elem| elem as i32) - .collect_vec(); - - // We now redistribute the points and the corresponding keys. - - let (distributed_points, distributed_keys) = ( - redistribute(&sorted_points, &counts, comm), - redistribute(&sorted_keys, &counts, comm), - ); - - // Now sort the distributed points and keys internally again. - - let sort_indices = argsort(&distributed_keys); - let sorted_keys = reorder(&distributed_keys, &sort_indices); - let sorted_points = reorder(&distributed_points, &sort_indices); - - (sorted_points, sorted_keys) -} - -/// Return true on all ranks if distributed tree is complete. Otherwise, return false. -pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { - // First check that the local tree on each node is complete. - - let mut complete_linear = true; - for (key1, key2) in arr.iter().tuple_windows() { - // Make sure that the keys are sorted and not duplicated. - if key1 >= key2 { - complete_linear = false; - break; - } - // The next key should be an ancestor of the next non-descendent key. - if let Some(expected_next) = key1.next_non_descendent_key() { - if !key2.is_ancestor(expected_next) { - complete_linear = false; - break; - } - } else { - // Only for the very last key there should not be a next non-descendent key. - complete_linear = false; - } + /// Return the points. + /// + /// Points are distributed across the nodes as part of the tree generation. + /// This function returns the redistributed points. + pub fn points(&self) -> &Vec { + &self.points } - // We now check the interfaces. - - if let Some(next_first) = communicate_back(arr, comm) { - // We are on any but the last rank - let last_key = arr.last().unwrap(); - - // Check that the keys are sorted and not duplicated. - if *last_key >= next_first { - complete_linear = false; - } - - // Check that the next key is an encestor of the next non-descendent. 
- if let Some(expected_next) = last_key.next_non_descendent_key() { - if !next_first.is_ancestor(expected_next) { - complete_linear = false; - } - } else { - complete_linear = false; - } - } else { - // We are on the last rank - // Check that the last key is ancestor of deepest last. - if !arr.last().unwrap().is_ancestor(MortonKey::deepest_last()) { - complete_linear = false; - } + /// Return the leaf tree. + pub fn leaf_tree(&self) -> &Vec { + &self.leaf_tree } - // Now check that at the first rank we include the deepest first. - - if comm.rank() == 0 { - if !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { - complete_linear = false; - } + /// Get the coarse tree bounds. + /// + /// This returns an array of size the number of ranks, + /// where each element consists of the smallest Morton key in + /// the corresponding rank. + pub fn coarse_tree_bounds(&self) -> &Vec { + &self.coarse_tree_bounds } - // Now communicate everything together. - - let mut result = false; - comm.all_reduce_into( - &complete_linear, - &mut result, - SystemOperation::logical_and(), - ); - - result -} - -/// Return the deepest level of a distributed list of Morton keys. -pub fn deepest_level(keys: &[MortonKey], comm: &C) -> usize { - let local_deepest_level = keys.iter().map(|elem| elem.level()).max().unwrap(); - - if comm.size() == 1 { - return local_deepest_level; + /// Return the communicator. + pub fn comm(&self) -> &C { + self.comm } - let mut global_deepest_level: usize = 0; + pub fn generate_status(&self) { + let mut keys_with_status = HashSet::::new(); - comm.all_reduce_into( - &local_deepest_level, - &mut global_deepest_level, - SystemOperation::max(), - ); + // Start from the leafs and work up the tree. - global_deepest_level -} - -/// Check if tree is balanced. -pub fn is_complete_linear_and_balanced( - arr: &[MortonKey], - comm: &C, -) -> bool { - // Send the tree to the root node and check there that it is balanced. - - let mut balanced = false; - - if let Some(arr) = gather_to_root(arr, comm) { - balanced = MortonKey::is_complete_linear_and_balanced(&arr); + for leaf in self.leaf_tree() {} } - - comm.process_at_rank(0).broadcast_into(&mut balanced); - - balanced } diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs new file mode 100644 index 0000000..2981a08 --- /dev/null +++ b/src/octree/parallel.rs @@ -0,0 +1,912 @@ +//! Parallel Octree structure + +use std::collections::{HashMap, HashSet}; + +use crate::{ + constants::{DEEPEST_LEVEL, NSIBLINGS}, + geometry::{PhysicalBox, Point}, + morton::MortonKey, + parsort::parsort, + tools::{ + communicate_back, gather_to_all, gather_to_root, global_inclusive_cumsum, redistribute, + sort_to_bins, + }, +}; + +use mpi::traits::{Equivalence, Root}; + +use itertools::{izip, Itertools}; +use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; +use rand::Rng; + +/// Structure to store ghost keys and their original ranks. +/// +/// The status is +/// - 0 for a local interior node. +/// - 1 for a local leaf node. +/// - 2 for a global node. +/// - 3 for a ghost node. +#[derive(Copy, Clone, Equivalence)] +pub struct KeyWithStatus { + key: MortonKey, + // Ideally we would use a typed enum that + // combines rank and status. But this is not + // supported by the rsmpi Equivalence Macro. + status: usize, + rank: usize, +} + +impl KeyWithStatus { + /// Create a new ghost. 
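+    /// `status` encodes the node type as listed above and `rank` stores the originating rank.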
+ pub fn new(key: MortonKey, status: usize, rank: usize) -> Self { + Self { key, status, rank } + } +} + +/// Compute the global bounding box across all points on all processes. +pub fn compute_global_bounding_box( + points: &[Point], + comm: &C, +) -> PhysicalBox { + // Make sure that the points array is a multiple of 3. + + // Now compute the minimum and maximum across each dimension. + + let mut xmin = f64::MAX; + let mut xmax = f64::MIN; + + let mut ymin = f64::MAX; + let mut ymax = f64::MIN; + + let mut zmin = f64::MAX; + let mut zmax = f64::MIN; + + for point in points { + let x = point.coords()[0]; + let y = point.coords()[1]; + let z = point.coords()[2]; + + xmin = f64::min(xmin, x); + xmax = f64::max(xmax, x); + + ymin = f64::min(ymin, y); + ymax = f64::max(ymax, y); + + zmin = f64::min(zmin, z); + zmax = f64::max(zmax, z); + } + + let mut global_xmin = 0.0; + let mut global_xmax = 0.0; + + let mut global_ymin = 0.0; + let mut global_ymax = 0.0; + + let mut global_zmin = 0.0; + let mut global_zmax = 0.0; + + comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min()); + comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max()); + + comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min()); + comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max()); + + comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min()); + comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max()); + + let xdiam = global_xmax - global_xmin; + let ydiam = global_ymax - global_ymin; + let zdiam = global_zmax - global_zmin; + + let xmean = global_xmin + 0.5 * xdiam; + let ymean = global_ymin + 0.5 * ydiam; + let zmean = global_zmin + 0.5 * zdiam; + + // We increase diameters by box size on deepest level + // and use the maximum diameter to compute a + // cubic bounding box. + + let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64; + + let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap(); + + let max_diam = max_diam * (1.0 + deepest_box_diam); + + PhysicalBox::new([ + xmean - 0.5 * max_diam, + ymean - 0.5 * max_diam, + zmean - 0.5 * max_diam, + xmean + 0.5 * max_diam, + ymean + 0.5 * max_diam, + zmean + 0.5 * max_diam, + ]) +} + +/// Convert points to Morton keys on specified level. +pub fn points_to_morton( + points: &[Point], + max_level: usize, + comm: &C, +) -> (Vec, PhysicalBox) { + // Make sure that max level never exceeds DEEPEST_LEVEL + let max_level = if max_level > DEEPEST_LEVEL as usize { + DEEPEST_LEVEL as usize + } else { + max_level + }; + + // Compute the physical bounding box. + + let bounding_box = compute_global_bounding_box(points, comm); + + // Bunch the points in arrays of 3. + + let keys = points + .iter() + .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level)) + .collect_vec(); + + (keys, bounding_box) +} + +/// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree. +/// The returned coarse tree is load balanced according to the number of linear keys in each coarse block. +pub fn compute_coarse_tree( + linear_keys: &[MortonKey], + comm: &C, +) -> Vec { + let size = comm.size(); + + debug_assert!(is_linear_tree(linear_keys, comm)); + + // On a single node a complete coarse tree is simply the root. 
+ if size == 1 { + return vec![MortonKey::root()]; + } + + let mut completed_region = linear_keys + .first() + .unwrap() + .fill_between_keys(*linear_keys.last().unwrap()); + + completed_region.insert(0, *linear_keys.first().unwrap()); + completed_region.push(*linear_keys.last().unwrap()); + + // Get the smallest level members of the completed region. + + let min_level = completed_region + .iter() + .map(|elem| elem.level()) + .min() + .unwrap(); + + // Each process selects its largest boxes. These are used to create + // a coarse tree. + + let largest_boxes = completed_region + .iter() + .filter(|elem| elem.level() == min_level) + .copied() + .collect_vec(); + + debug_assert!(is_linear_tree(&largest_boxes, comm)); + + complete_tree(&largest_boxes, comm) +} + +/// Compute the weights of each coarse tree block as the number of linear keys associated with each coarse block. +pub fn compute_coarse_tree_weights( + linear_keys: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> Vec { + let rank = comm.rank(); + // We want to partition the coarse tree. But we need the correct weights. The idea + // is that we use the number of original leafs that intersect with the coarse tree + // as leafs. In order to compute this we send the coarse tree around to all processes + // so that each process computes for each coarse tree element how many of its keys + // intersect with each node of the coarse tree. We then sum up the local weight for each + // coarse tree node across all nodes to get the weight. + + let global_coarse_tree = gather_to_all(&coarse_tree, comm); + + // We also want to send around a corresponding array of ranks so that for each global coarse tree key + // we have the rank of where it originates from. + + let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); + + // We now compute the local weights. + let mut local_weight_contribution = vec![0 as usize; global_coarse_tree.len()]; + + // In the following loop we want to be a bit smart. We do not iterate through all the local elements. + // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region + // of our sorted keys that overlaps with the coarse tree region. + + // Let's find the start of our region. The start of our region is a coarse key that is an ancestor + // of our current key. This works because the coarse tree has levels at most as high as the sorted keys. + + let first_key = *linear_keys.first().unwrap(); + + let first_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(first_key)) + .count(); + + // Now we need to find the end index of our region. For this again we find the index of our coarse tree that + // is an ancestor of our last key. + let last_key = *linear_keys.last().unwrap(); + + let last_coarse_index = global_coarse_tree + .iter() + .take_while(|coarse_key| !coarse_key.is_ancestor(last_key)) + .count(); + + // We now only need to iterate through between the first and last coarse index in the coarse tree. + // In the way we have computed the indices. The last coarse index is inclusive (it is the ancestor of our last key). + + for (w, &global_coarse_key) in izip!( + local_weight_contribution[first_coarse_index..=last_coarse_index].iter_mut(), + global_coarse_tree[first_coarse_index..=last_coarse_index].iter() + ) { + *w += linear_keys + .iter() + .filter(|&&key| global_coarse_key.is_ancestor(key)) + .count(); + } + + // We now need to sum up the weights across all processes. 
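+    // The element-wise all-reduce gives every rank, for each global coarse key, the total
+    // number of fine keys across all processes that are descendants of that key.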
+ + let mut global_weights = vec![0 as usize; global_coarse_tree.len()]; + + comm.all_reduce_into( + &local_weight_contribution, + &mut global_weights, + SystemOperation::sum(), + ); + + // Each process now has all weights. However, we only need the ones for the current process. + // So we just filter the rest out. + + izip!(coarse_tree_ranks, global_weights) + .filter_map(|(r, weight)| { + if r == rank as usize { + Some(weight) + } else { + None + } + }) + .collect_vec() +} + +/// Redistribute sorted keys with respect to a linear coarse tree. +pub fn redistribute_with_respect_to_coarse_tree( + linear_keys: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> Vec { + let size = comm.size(); + + if size == 1 { + return linear_keys.to_vec(); + } + + // We want to globally redistribute keys so that the keys on each process are descendents + // of the local coarse tree keys. + + // We are using here the fact that the coarse tree is complete and sorted. + // We are sending around to each process the first local index. This + // defines bins in which we sort our keys. The keys are then sent around to the correct + // processes via an alltoallv operation. + + let my_first = coarse_tree.first().unwrap(); + + let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); + + // We now have our bins. We go through our keys and store how + // many keys are assigned to each rank. We are using here that + // our keys and the coarse tree are both sorted. + + // This will store for each rank how many keys will be assigned to it. + + let rank_counts = sort_to_bins(linear_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); + + // We now have the counts for each rank. Let's redistribute accordingly and return. + + let result = redistribute(&linear_keys, &rank_counts, comm); + + #[cfg(debug_assertions)] + { + // Check through that the first and last key of result are descendents + // of the first and last coarse bloack. + debug_assert!(coarse_tree + .first() + .unwrap() + .is_ancestor(*result.first().unwrap())); + debug_assert!(coarse_tree + .last() + .unwrap() + .is_ancestor(*result.last().unwrap())); + } + + result +} + +/// Return a complete tree generated from local keys and associated coarse keys. +/// +/// The coarse keys are refined until the maximum level is reached or until each coarse key +/// is the ancestor of at most `max_keys` fine keys. +/// It is assumed that the level of the fine keys is at least as large as `max_level`. +pub fn create_local_tree( + sorted_fine_keys: &[MortonKey], + coarse_keys: &[MortonKey], + mut max_level: usize, + max_keys: usize, +) -> Vec { + if max_level > DEEPEST_LEVEL as usize { + max_level = DEEPEST_LEVEL as usize; + } + + // We split the sorted fine keys into subslices so that each subslice + // is associated with a coarse slice. + + let bins = coarse_keys.to_vec(); + + let counts = sort_to_bins(&sorted_fine_keys, &bins); + + // We now know how many fine keys are associated with each coarse block. We iterate + // through and locally refine for each block that requires it. + + let mut remainder = sorted_fine_keys; + let mut new_coarse_keys = Vec::::new(); + + for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { + let current; + (current, remainder) = remainder.split_at(count); + if coarse_key.level() < max_level && current.len() > max_keys { + // We need to refine the current split. 
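+            // We recurse with the eight children of the coarse key as the new bins, so the
+            // fine keys in `current` are split one level deeper.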
+ new_coarse_keys.extend_from_slice( + create_local_tree( + current, + coarse_key.children().as_slice(), + max_level, + max_keys, + ) + .as_slice(), + ); + } else { + new_coarse_keys.push(coarse_key) + } + } + + new_coarse_keys +} + +/// Linearize a set of weighted Morton keys. +pub fn linearize( + keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let size = comm.size(); + let rank = comm.rank(); + + // If we only have one process we use the standard serial linearization. + + if size == 1 { + return MortonKey::linearize(keys); + } + + // We are first sorting the keys. Then in a linear process across all processors we + // go through the arrays and delete ancestors of nodes. + + let sorted_keys = parsort(&keys, comm, rng); + + // Each process needs to send its first element to the previous process. Each process + // then goes through its own list and retains elements that are not ancestors of the + // next element. + + let mut result = Vec::::new(); + + let next_key = communicate_back(&sorted_keys, comm); + + // Treat the local keys + for (&m1, &m2) in sorted_keys.iter().tuple_windows() { + // m1 is also ancestor of m2 if they are identical. + if m1.is_ancestor(m2) { + continue; + } else { + result.push(m1); + } + } + + // If we are at the last process simply push the last key. + // Otherwise check whether it might be the ancestor of `next_key`, + // the first key on the next process. If yes, don't push it. Otherwise do. + + if rank == size - 1 { + result.push(*sorted_keys.last().unwrap()); + } else { + let last = *sorted_keys.last().unwrap(); + if !last.is_ancestor(next_key.unwrap()) { + result.push(last); + } + } + + debug_assert!(is_linear_tree(&result, comm)); + + result +} + +/// Balance a sorted list of Morton keys across processors given an array of corresponding weights. +pub fn load_balance( + sorted_keys: &[MortonKey], + weights: &[usize], + comm: &C, +) -> Vec { + assert_eq!(sorted_keys.len(), weights.len()); + + let size = comm.size(); + let rank = comm.rank(); + + // If we only have one process we simply return. + + if size == 1 { + return sorted_keys.to_vec(); + } + + // First scan the weight. + // We scan the local arrays, then use a global scan operation on the last element + // of each array to get the global sums and then we update the array of each rank + // with the sum from the previous ranks. + + let scan = global_inclusive_cumsum(&weights, comm); + + // Now broadcast the total weight to all processes. + + let mut total_weight = if rank == size - 1 { + *scan.last().unwrap() + } else { + 0 + }; + + comm.process_at_rank(size - 1) + .broadcast_into(&mut total_weight); + + let w = total_weight / (size as usize); + let k = total_weight % (size as usize); + + // Sort the elements into bins according to which process they should be sent. + // We do not need to sort the Morton keys themselves into bins but the scanned weights. + // The corresponding counts are the right counts for the Morton keys. + + let mut bins = Vec::::with_capacity(size as usize); + + for p in 1..=size as usize { + if p <= k { + bins.push((p - 1) * (1 + w)); + } else { + bins.push((p - 1) * w + k); + } + } + + let counts = sort_to_bins(&scan, &bins) + .iter() + .map(|elem| *elem as i32) + .collect_vec(); + + // Now distribute the data with an all to all v. + // We create a vector of how many elements to send to each process and + // then send the actual data. 
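+    // `counts[p]` is the number of local keys whose scanned weight falls into the bin of
+    // rank p; `redistribute` then performs the corresponding all-to-allv exchange.
+    // Hypothetical example: with total_weight = 10 and size = 3 we get w = 3, k = 1 and
+    // bins = [0, 4, 7], so each rank receives a weight of roughly total_weight / size.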
+ + let mut recvbuffer = redistribute(&sorted_keys, &counts, comm); + + recvbuffer.sort_unstable(); + recvbuffer +} + +/// Given a distributed set of linear keys, generate a complete tree. +pub fn complete_tree( + linear_keys: &[MortonKey], + comm: &C, +) -> Vec { + let mut linear_keys = linear_keys.to_vec(); + + debug_assert!(is_linear_tree(&linear_keys, comm)); + + let size = comm.size(); + let rank = comm.rank(); + + if size == 1 { + return MortonKey::complete_tree(linear_keys.as_slice()); + } + + // Now insert on the first and last process the first and last child of the + // finest ancestor of first/last box on deepest level + + // Send first element to previous rank and insert into local keys. + // On the first process we also need to insert the first child of the finest + // ancestor of the deepest first key and first element. Correspondingly on the last process + // we need to insert the last child of the finest ancester of the deepest last key and last element. + + let next_key = communicate_back(&linear_keys, comm); + + if rank < size - 1 { + linear_keys.push(next_key.unwrap()); + } + + // Now fix the first key on the first rank. + + if rank == 0 { + let first_key = linear_keys.first().unwrap(); + let deepest_first = MortonKey::deepest_first(); + if !first_key.is_ancestor(deepest_first) { + let ancestor = deepest_first.finest_common_ancestor(*first_key); + linear_keys.insert(0, ancestor.children()[0]); + } + } + + if rank == size - 1 { + let last_key = linear_keys.last().unwrap(); + let deepest_last = MortonKey::deepest_last(); + if !last_key.is_ancestor(deepest_last) { + let ancestor = deepest_last.finest_common_ancestor(*last_key); + linear_keys.push(ancestor.children()[NSIBLINGS - 1]); + } + } + + // Now complete the regions defined by the keys on each process. + + let mut result = Vec::::new(); + + for (&key1, &key2) in linear_keys.iter().tuple_windows() { + result.push(key1); + result.extend_from_slice(key1.fill_between_keys(key2).as_slice()); + } + + if rank == size - 1 { + result.push(*linear_keys.last().unwrap()); + } + + debug_assert!(is_complete_linear_tree(&result, comm)); + + result +} + +/// Balance a distributed tree. +pub fn balance( + linear_keys: &[MortonKey], + rng: &mut R, + comm: &C, +) -> Vec { + let deepest_level = deepest_level(linear_keys, comm); + + // Start with keys at deepest level + let mut work_list = linear_keys + .iter() + .copied() + .filter(|&key| key.level() == deepest_level) + .collect_vec(); + + let mut result = Vec::::new(); + + // Now go through and make sure that for each key siblings and neighbours of parents are added + + for level in (1..=deepest_level).rev() { + let mut parents = HashSet::::new(); + let mut new_work_list = Vec::::new(); + // We filter the work list by level and also make sure that + // only one sibling of each of the parents children is added to + // our current level list. 
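        // Pushing each key's siblings into the result and queueing the neighbours of
        // its parent is what propagates the balance condition from the current level
        // to the next coarser one.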
+ for key in work_list.iter() { + let parent = key.parent(); + if !parents.contains(&parent) { + parents.insert(parent); + result.extend_from_slice(key.siblings().as_slice()); + new_work_list.extend_from_slice( + parent + .neighbours() + .iter() + .copied() + .filter(|&key| key.is_valid()) + .collect_vec() + .as_slice(), + ); + } + } + new_work_list.extend( + linear_keys + .iter() + .copied() + .filter(|&key| key.level() == level - 1), + ); + + work_list = new_work_list; + // Now extend the work list with the + } + + let result = linearize(&result, rng, comm); + + debug_assert!(is_complete_linear_and_balanced(&result, comm)); + result +} + +/// Return true if the keys are linear. +pub fn is_linear_tree(arr: &[MortonKey], comm: &C) -> bool { + let mut is_linear = true; + + for (&key1, &key2) in arr.iter().tuple_windows() { + if key1 >= key2 || key1.is_ancestor(key2) { + is_linear = false; + break; + } + } + + if comm.size() == 1 { + return is_linear; + } + + // Now check the interfaces + + if let Some(next_key) = communicate_back(arr, comm) { + let last = *arr.last().unwrap(); + if last >= next_key || last.is_ancestor(next_key) { + is_linear = false; + } + } + + let mut global_is_linear = false; + + comm.all_reduce_into( + &is_linear, + &mut global_is_linear, + SystemOperation::logical_and(), + ); + + global_is_linear +} + +/// Redistribute points with respect to a given coarse tree +pub fn redistribute_points_with_respect_to_coarse_tree( + points: &[Point], + morton_keys_for_points: &[MortonKey], + coarse_tree: &[MortonKey], + comm: &C, +) -> (Vec, Vec) { + pub fn argsort(arr: &[T]) -> Vec { + let mut sort_indices = (0..arr.len()).collect_vec(); + sort_indices.sort_unstable_by_key(|&index| arr[index]); + sort_indices + } + + pub fn reorder(arr: &[T], permutation: &[usize]) -> Vec { + let mut reordered = Vec::::with_capacity(arr.len()); + for &index in permutation.iter() { + reordered.push(arr[index]) + } + reordered + } + + assert_eq!(points.len(), morton_keys_for_points.len()); + + let size = comm.size(); + + if size == 1 { + return (points.to_vec(), morton_keys_for_points.to_vec()); + } + + let sort_indices = argsort(&morton_keys_for_points); + let sorted_keys = reorder(&morton_keys_for_points, &sort_indices); + let sorted_points = reorder(&points, &sort_indices); + + // Now get the bins + + let my_first = coarse_tree.first().unwrap(); + + let global_bins = gather_to_all(std::slice::from_ref(my_first), comm); + + // We now sort the morton indices into the bins. + + // This will store for each rank how many keys will be assigned to it. + + let counts = sort_to_bins(&sorted_keys, &global_bins) + .iter() + .map(|&elem| elem as i32) + .collect_vec(); + + // We now redistribute the points and the corresponding keys. + + let (distributed_points, distributed_keys) = ( + redistribute(&sorted_points, &counts, comm), + redistribute(&sorted_keys, &counts, comm), + ); + + // Now sort the distributed points and keys internally again. + + let sort_indices = argsort(&distributed_keys); + let sorted_keys = reorder(&distributed_keys, &sort_indices); + let sorted_points = reorder(&distributed_points, &sort_indices); + + (sorted_points, sorted_keys) +} + +/// Return true on all ranks if distributed tree is complete. Otherwise, return false. +pub fn is_complete_linear_tree(arr: &[MortonKey], comm: &C) -> bool { + // First check that the local tree on each node is complete. 
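    // Completeness means that there are no gaps: for consecutive keys the second one
    // must be an ancestor of (or equal to) the first key's next non-descendent key,
    // so that the region between them is fully covered.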
+ + let mut complete_linear = true; + for (key1, key2) in arr.iter().tuple_windows() { + // Make sure that the keys are sorted and not duplicated. + if key1 >= key2 { + complete_linear = false; + break; + } + // The next key should be an ancestor of the next non-descendent key. + if let Some(expected_next) = key1.next_non_descendent_key() { + if !key2.is_ancestor(expected_next) { + complete_linear = false; + break; + } + } else { + // Only for the very last key there should not be a next non-descendent key. + complete_linear = false; + } + } + + // We now check the interfaces. + + if let Some(next_first) = communicate_back(arr, comm) { + // We are on any but the last rank + let last_key = arr.last().unwrap(); + + // Check that the keys are sorted and not duplicated. + if *last_key >= next_first { + complete_linear = false; + } + + // Check that the next key is an encestor of the next non-descendent. + if let Some(expected_next) = last_key.next_non_descendent_key() { + if !next_first.is_ancestor(expected_next) { + complete_linear = false; + } + } else { + complete_linear = false; + } + } else { + // We are on the last rank + // Check that the last key is ancestor of deepest last. + if !arr.last().unwrap().is_ancestor(MortonKey::deepest_last()) { + complete_linear = false; + } + } + + // Now check that at the first rank we include the deepest first. + + if comm.rank() == 0 { + if !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { + complete_linear = false; + } + } + + // Now communicate everything together. + + let mut result = false; + comm.all_reduce_into( + &complete_linear, + &mut result, + SystemOperation::logical_and(), + ); + + result +} + +/// Return the deepest level of a distributed list of Morton keys. +pub fn deepest_level(keys: &[MortonKey], comm: &C) -> usize { + let local_deepest_level = keys.iter().map(|elem| elem.level()).max().unwrap(); + + if comm.size() == 1 { + return local_deepest_level; + } + + let mut global_deepest_level: usize = 0; + + comm.all_reduce_into( + &local_deepest_level, + &mut global_deepest_level, + SystemOperation::max(), + ); + + global_deepest_level +} + +/// Check if tree is balanced. +pub fn is_complete_linear_and_balanced( + arr: &[MortonKey], + comm: &C, +) -> bool { + // Send the tree to the root node and check there that it is balanced. + + let mut balanced = false; + + if let Some(arr) = gather_to_root(arr, comm) { + balanced = MortonKey::is_complete_linear_and_balanced(&arr); + } + + comm.process_at_rank(0).broadcast_into(&mut balanced); + + balanced +} + +/// For a complete linear bin get on each process the first key of all processes. +/// +/// This information can be used to query on which process a key is living. +pub fn get_tree_bins( + complete_linear_tree: &[MortonKey], + comm: &C, +) -> Vec { + gather_to_all( + std::slice::from_ref(complete_linear_tree.first().unwrap()), + comm, + ) +} + +/// For a sorted array return either position of the key or positioin directly before search key. +pub fn get_key_index(arr: &[MortonKey], key: MortonKey) -> usize { + // Does a binary search of the key. If the key is found with Ok(..) + // the exact index is returned of the found key. If the key is not found + // the closest larger index is returned. So we subtract one to get the closest + // smaller index. + + match arr.binary_search(&key) { + Ok(index) => index, + Err(index) => index - 1, + } +} + +/// Check if a key is associated with the current rank. +/// +/// Note that the key does not need to exist as leaf. 
It just needs +/// to be descendent of a coarse key on the current rank. +pub fn key_on_current_rank( + key: MortonKey, + coarse_tree_bounds: &[MortonKey], + rank: usize, + size: usize, +) -> bool { + if rank == size - 1 { + key >= *coarse_tree_bounds.last().unwrap() + } else { + coarse_tree_bounds[rank] <= key && key < coarse_tree_bounds[rank + 1] + } +} + +#[cfg(test)] +mod test { + use crate::{ + octree::get_key_index, + tools::{generate_random_keys, seeded_rng}, + }; + + #[test] + fn test_get_key_rank() { + let mut rng = seeded_rng(0); + + let mut keys = generate_random_keys(50, &mut rng); + + keys.sort_unstable(); + + let mid = keys[25]; + + assert_eq!(25, get_key_index(&keys, mid)); + + // Now remove the mid index and do the same again. + + keys.remove(25); + + // The result should be 24. + + assert_eq!(24, get_key_index(&keys, mid)); + } +} diff --git a/src/serial.rs b/src/serial.rs index d32b3ac..0249bb8 100644 --- a/src/serial.rs +++ b/src/serial.rs @@ -33,7 +33,6 @@ impl Octree { /// Create octress from points pub fn from_points(points: &[Point], max_level: usize, max_points_per_box: usize) -> Self { // Make sure that the points array is a multiple of 3. - assert_eq!(points.len() % 3, 0); // Make sure that max level never exceeds DEEPEST_LEVEL let max_level = if max_level > DEEPEST_LEVEL as usize { diff --git a/src/tools.rs b/src/tools.rs index 56e0905..dec8779 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -8,7 +8,8 @@ use mpi::{ traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source}, }; use num::traits::Zero; -use rand::Rng; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; use crate::{ constants::{DEEPEST_LEVEL, LEVEL_SIZE}, @@ -392,6 +393,11 @@ pub fn generate_random_points( points } +/// Get a seeded rng +pub fn seeded_rng(seed: usize) -> ChaCha8Rng { + ChaCha8Rng::seed_from_u64(seed as u64) +} + /// Compute displacements from a vector of counts. /// /// This is useful for global MPI varcount operations. Let From 31765dbed9eb207faaf7bb6c61a840acc9fb14e3 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Wed, 25 Sep 2024 12:05:23 +0100 Subject: [PATCH 33/42] WIP: Ghosts --- src/octree.rs | 95 +++++++++++++++++++++++++++++++++++++++--- src/octree/parallel.rs | 24 ----------- 2 files changed, 90 insertions(+), 29 deletions(-) diff --git a/src/octree.rs b/src/octree.rs index 6f175cc..b0ac95c 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,7 +1,7 @@ pub mod parallel; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; -use mpi::traits::CommunicatorCollectives; +use mpi::traits::{CommunicatorCollectives, Equivalence}; pub use parallel::*; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; @@ -10,8 +10,17 @@ use crate::{ constants::DEEPEST_LEVEL, geometry::{PhysicalBox, Point}, morton::MortonKey, + tools::gather_to_all, }; +#[derive(PartialEq, Eq, Hash, Copy, Clone)] +pub enum KeyStatus { + LocalLeaf, + LocalInterior, + Global, + Ghost(usize), +} + /// A general structure for octrees. pub struct Octree<'o, C> { points: Vec, @@ -84,6 +93,8 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { let coarse_tree_bounds = get_tree_bins(&coarse_tree, comm); + // Duplicate the coarse tree across all nodes + Self { points: points.to_vec(), point_keys, @@ -137,11 +148,85 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { self.comm } - pub fn generate_status(&self) { - let mut keys_with_status = HashSet::::new(); + /// Generate all leaf and interior keys. 
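    ///
    /// Each key is classified as a local leaf, a local interior key, a global key
    /// (an ancestor of the coarse tree that may exist on several ranks), or a ghost
    /// key originating from another rank.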
+ pub fn generate_all_keys(&self) -> HashMap { + let rank = self.comm().rank() as usize; + let size = self.comm().size() as usize; + + let mut all_keys = HashMap::::new(); // Start from the leafs and work up the tree. - for leaf in self.leaf_tree() {} + // First deal with the parents of the coarse tree. These are different + // as they may exist on multiple nodes, so receive a different label. + + let mut leaf_keys: HashSet = + HashSet::from_iter(self.leaf_tree().iter().copied()); + + for &key in self.coarse_tree() { + // Need to distingush if coarse tree node is already a leaf or not. + if leaf_keys.contains(&key) { + all_keys.insert(key, KeyStatus::LocalLeaf); + leaf_keys.remove(&key); + } else { + all_keys.insert(key, KeyStatus::LocalInterior); + } + + // We now iterate the parents of the coarse tree. There is no guarantee + // that the parents only exist on a single rank. Hence, they get the `Global` + // tag. + + let mut parent = key.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + all_keys.insert(parent, KeyStatus::Global); + parent = parent.parent(); + } + } + + // We now deal with the fine leafs and their ancestors. + + for leaf in leaf_keys { + debug_assert!(!all_keys.contains_key(&leaf)); + all_keys.insert(leaf, KeyStatus::LocalLeaf); + let mut parent = leaf.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + all_keys.insert(parent, KeyStatus::LocalInterior); + parent = parent.parent(); + } + } + + // This maps from rank to the keys that we want to send to the ranks + let mut rank_key = HashMap::>::new(); + for index in 0..size - 1 { + rank_key.insert(index, Vec::::new()); + } + + for (&key, &status) in all_keys.iter() { + // We need not send around global keys to neighbors. + if status == KeyStatus::Global { + continue; + } + for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { + // Get rank of the neighbour + let neighbour_rank = get_key_index(&self.coarse_tree_bounds(), neighbor); + rank_key.entry(neighbour); + } + } + + // We now know which key needs to be sent to which rank. + // Turn to array, get the counts and send around. + + let (arr, counts) = { + let mut arr = Vec::::new(); + let mut counts = Vec::::new(); + for index in 0..size - 1 { + let value = rank_key.get(&index).unwrap(); + arr.extend(value.iter()); + counts.push(value.len()); + } + (arr, counts) + } + + all_keys } } diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs index 2981a08..28949c3 100644 --- a/src/octree/parallel.rs +++ b/src/octree/parallel.rs @@ -19,30 +19,6 @@ use itertools::{izip, Itertools}; use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; use rand::Rng; -/// Structure to store ghost keys and their original ranks. -/// -/// The status is -/// - 0 for a local interior node. -/// - 1 for a local leaf node. -/// - 2 for a global node. -/// - 3 for a ghost node. -#[derive(Copy, Clone, Equivalence)] -pub struct KeyWithStatus { - key: MortonKey, - // Ideally we would use a typed enum that - // combines rank and status. But this is not - // supported by the rsmpi Equivalence Macro. - status: usize, - rank: usize, -} - -impl KeyWithStatus { - /// Create a new ghost. - pub fn new(key: MortonKey, status: usize, rank: usize) -> Self { - Self { key, status, rank } - } -} - /// Compute the global bounding box across all points on all processes. 
pub fn compute_global_bounding_box( points: &[Point], From 17d44c60e7eae0919ad57e7befac81f7ce82c166 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Tue, 1 Oct 2024 12:32:14 +0100 Subject: [PATCH 34/42] WIP: Neighbour keys --- src/octree.rs | 57 ++++++++++++++++++++++++++---------------- src/octree/parallel.rs | 8 +++--- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/src/octree.rs b/src/octree.rs index b0ac95c..8a40463 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -1,9 +1,10 @@ pub mod parallel; use std::collections::{HashMap, HashSet}; +use itertools::Itertools; use mpi::traits::{CommunicatorCollectives, Equivalence}; pub use parallel::*; -use rand::{Rng, SeedableRng}; +use rand::SeedableRng; use rand_chacha::ChaCha8Rng; use crate::{ @@ -95,6 +96,8 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { // Duplicate the coarse tree across all nodes + // let coarse_tree = gather_to_all(&coarse_tree, comm); + Self { points: points.to_vec(), point_keys, @@ -154,36 +157,40 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { let size = self.comm().size() as usize; let mut all_keys = HashMap::::new(); + let mut leaf_keys: HashSet = + HashSet::from_iter(self.leaf_tree().iter().copied()); - // Start from the leafs and work up the tree. + let mut global_keys = HashSet::::new(); // First deal with the parents of the coarse tree. These are different // as they may exist on multiple nodes, so receive a different label. - let mut leaf_keys: HashSet = - HashSet::from_iter(self.leaf_tree().iter().copied()); - for &key in self.coarse_tree() { - // Need to distingush if coarse tree node is already a leaf or not. - if leaf_keys.contains(&key) { - all_keys.insert(key, KeyStatus::LocalLeaf); - leaf_keys.remove(&key); - } else { - all_keys.insert(key, KeyStatus::LocalInterior); - } - - // We now iterate the parents of the coarse tree. There is no guarantee - // that the parents only exist on a single rank. Hence, they get the `Global` - // tag. - let mut parent = key.parent(); while parent.level() > 0 && !all_keys.contains_key(&parent) { - all_keys.insert(parent, KeyStatus::Global); + global_keys.insert(parent); parent = parent.parent(); } } + // We now send around the parents of the coarse tree to every node. These will + // be global keys. + + let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), self.comm()); + + // We can now insert the global keys into `all_keys` with the `Global` label. + // There may be duplicates in the `global_keys` array. So need to check for that. + + for &key in &global_keys { + if !all_keys.contains_key(&key) { + all_keys.insert(key, KeyStatus::Global); + } + } + // We now deal with the fine leafs and their ancestors. + // The leafs of the coarse tree will also be either part + // of the fine tree leafs or will be interior keys. In either + // case the following loop catches them. for leaf in leaf_keys { debug_assert!(!all_keys.contains_key(&leaf)); @@ -207,9 +214,17 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { continue; } for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { + // If the neighbour is a global key then continue. 
+ if let Some(&value) = all_keys.get(&neighbor) { + if value == KeyStatus::Global { + continue; + } + } // Get rank of the neighbour - let neighbour_rank = get_key_index(&self.coarse_tree_bounds(), neighbor); - rank_key.entry(neighbour); + let neighbor_rank = get_key_index(&self.coarse_tree_bounds(), neighbor); + rank_key + .entry(neighbor_rank) + .and_modify(|keys| keys.push(key)); } } @@ -225,7 +240,7 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { counts.push(value.len()); } (arr, counts) - } + }; all_keys } diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs index 28949c3..5f8348f 100644 --- a/src/octree/parallel.rs +++ b/src/octree/parallel.rs @@ -336,14 +336,14 @@ pub fn create_local_tree( // through and locally refine for each block that requires it. let mut remainder = sorted_fine_keys; - let mut new_coarse_keys = Vec::::new(); + let mut refined_keys = Vec::::new(); for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) { let current; (current, remainder) = remainder.split_at(count); if coarse_key.level() < max_level && current.len() > max_keys { // We need to refine the current split. - new_coarse_keys.extend_from_slice( + refined_keys.extend_from_slice( create_local_tree( current, coarse_key.children().as_slice(), @@ -353,11 +353,11 @@ pub fn create_local_tree( .as_slice(), ); } else { - new_coarse_keys.push(coarse_key) + refined_keys.push(coarse_key) } } - new_coarse_keys + refined_keys } /// Linearize a set of weighted Morton keys. From baa5bb91062241e20ad808e7b3ffa8e560db5a0f Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Thu, 3 Oct 2024 21:02:34 +0100 Subject: [PATCH 35/42] Testing creation of all keys. --- examples/mpi_coarse_tree.rs | 5 +- examples/mpi_complete_tree.rs | 2 +- examples/mpi_cumsum.rs | 4 +- src/octree.rs | 113 ++++---------------------- src/octree/parallel.rs | 146 ++++++++++++++++++++++++++++++---- src/tools.rs | 10 +-- 6 files changed, 159 insertions(+), 121 deletions(-) diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs index a79626d..b8f1095 100644 --- a/examples/mpi_coarse_tree.rs +++ b/examples/mpi_coarse_tree.rs @@ -10,7 +10,10 @@ use bempp_octree::{ }, tools::{communicate_back, generate_random_points, global_size, is_sorted_array}, }; -use mpi::{collective::SystemOperation, traits::*}; +use mpi::{ + collective::SystemOperation, + traits::{Communicator, CommunicatorCollectives}, +}; use rand::prelude::*; use rand_chacha::ChaCha8Rng; diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index d47fc6a..b0a5f43 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -5,7 +5,7 @@ use bempp_octree::{ octree::{complete_tree, is_complete_linear_tree, linearize, points_to_morton}, tools::generate_random_points, }; -use mpi::traits::*; +use mpi::traits::Communicator; use rand::prelude::*; use rand_chacha::ChaCha8Rng; diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs index ab9e1b1..1b3fcc3 100644 --- a/examples/mpi_cumsum.rs +++ b/examples/mpi_cumsum.rs @@ -2,7 +2,7 @@ use bempp_octree::tools::{gather_to_root, global_inclusive_cumsum}; use itertools::{izip, Itertools}; -use mpi::traits::*; +use mpi::traits::Communicator; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -42,7 +42,7 @@ pub fn main() { let expected_cum_sum = original_array .iter() .scan(0, |state, x| { - *state = *x + *state; + *state += *x; Some(*state) }) .collect_vec(); diff --git a/src/octree.rs b/src/octree.rs index 8a40463..537fa1f 100644 --- a/src/octree.rs +++ b/src/octree.rs 
@@ -1,8 +1,8 @@ +//! Definition of Octree. pub mod parallel; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; -use itertools::Itertools; -use mpi::traits::{CommunicatorCollectives, Equivalence}; +use mpi::traits::CommunicatorCollectives; pub use parallel::*; use rand::SeedableRng; use rand_chacha::ChaCha8Rng; @@ -11,14 +11,18 @@ use crate::{ constants::DEEPEST_LEVEL, geometry::{PhysicalBox, Point}, morton::MortonKey, - tools::gather_to_all, }; +/// Stores what type of key it is. #[derive(PartialEq, Eq, Hash, Copy, Clone)] -pub enum KeyStatus { +pub enum KeyType { + /// A local leaf. LocalLeaf, + /// A local interior key. LocalInterior, + /// A global key. Global, + /// A ghost key from a specific process. Ghost(usize), } @@ -29,6 +33,7 @@ pub struct Octree<'o, C> { coarse_tree: Vec, leaf_tree: Vec, coarse_tree_bounds: Vec, + all_keys: HashMap, bounding_box: PhysicalBox, comm: &'o C, } @@ -98,12 +103,15 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { // let coarse_tree = gather_to_all(&coarse_tree, comm); + let all_keys = generate_all_keys(&leaf_tree, &coarse_tree, &coarse_tree_bounds, comm); + Self { points: points.to_vec(), point_keys, coarse_tree, leaf_tree, coarse_tree_bounds, + all_keys, bounding_box, comm, } @@ -151,97 +159,8 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { self.comm } - /// Generate all leaf and interior keys. - pub fn generate_all_keys(&self) -> HashMap { - let rank = self.comm().rank() as usize; - let size = self.comm().size() as usize; - - let mut all_keys = HashMap::::new(); - let mut leaf_keys: HashSet = - HashSet::from_iter(self.leaf_tree().iter().copied()); - - let mut global_keys = HashSet::::new(); - - // First deal with the parents of the coarse tree. These are different - // as they may exist on multiple nodes, so receive a different label. - - for &key in self.coarse_tree() { - let mut parent = key.parent(); - while parent.level() > 0 && !all_keys.contains_key(&parent) { - global_keys.insert(parent); - parent = parent.parent(); - } - } - - // We now send around the parents of the coarse tree to every node. These will - // be global keys. - - let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), self.comm()); - - // We can now insert the global keys into `all_keys` with the `Global` label. - // There may be duplicates in the `global_keys` array. So need to check for that. - - for &key in &global_keys { - if !all_keys.contains_key(&key) { - all_keys.insert(key, KeyStatus::Global); - } - } - - // We now deal with the fine leafs and their ancestors. - // The leafs of the coarse tree will also be either part - // of the fine tree leafs or will be interior keys. In either - // case the following loop catches them. - - for leaf in leaf_keys { - debug_assert!(!all_keys.contains_key(&leaf)); - all_keys.insert(leaf, KeyStatus::LocalLeaf); - let mut parent = leaf.parent(); - while parent.level() > 0 && !all_keys.contains_key(&parent) { - all_keys.insert(parent, KeyStatus::LocalInterior); - parent = parent.parent(); - } - } - - // This maps from rank to the keys that we want to send to the ranks - let mut rank_key = HashMap::>::new(); - for index in 0..size - 1 { - rank_key.insert(index, Vec::::new()); - } - - for (&key, &status) in all_keys.iter() { - // We need not send around global keys to neighbors. - if status == KeyStatus::Global { - continue; - } - for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { - // If the neighbour is a global key then continue. 
- if let Some(&value) = all_keys.get(&neighbor) { - if value == KeyStatus::Global { - continue; - } - } - // Get rank of the neighbour - let neighbor_rank = get_key_index(&self.coarse_tree_bounds(), neighbor); - rank_key - .entry(neighbor_rank) - .and_modify(|keys| keys.push(key)); - } - } - - // We now know which key needs to be sent to which rank. - // Turn to array, get the counts and send around. - - let (arr, counts) = { - let mut arr = Vec::::new(); - let mut counts = Vec::::new(); - for index in 0..size - 1 { - let value = rank_key.get(&index).unwrap(); - arr.extend(value.iter()); - counts.push(value.len()); - } - (arr, counts) - }; - - all_keys + /// Return a map of all keys. + pub fn all_keys(&self) -> &HashMap { + &self.all_keys } } diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs index 5f8348f..1b873fb 100644 --- a/src/octree/parallel.rs +++ b/src/octree/parallel.rs @@ -19,6 +19,8 @@ use itertools::{izip, Itertools}; use mpi::{collective::SystemOperation, traits::CommunicatorCollectives}; use rand::Rng; +use super::KeyType; + /// Compute the global bounding box across all points on all processes. pub fn compute_global_bounding_box( points: &[Point], @@ -184,7 +186,7 @@ pub fn compute_coarse_tree_weights( // intersect with each node of the coarse tree. We then sum up the local weight for each // coarse tree node across all nodes to get the weight. - let global_coarse_tree = gather_to_all(&coarse_tree, comm); + let global_coarse_tree = gather_to_all(coarse_tree, comm); // We also want to send around a corresponding array of ranks so that for each global coarse tree key // we have the rank of where it originates from. @@ -192,7 +194,7 @@ pub fn compute_coarse_tree_weights( let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm); // We now compute the local weights. - let mut local_weight_contribution = vec![0 as usize; global_coarse_tree.len()]; + let mut local_weight_contribution = vec![0; global_coarse_tree.len()]; // In the following loop we want to be a bit smart. We do not iterate through all the local elements. // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region @@ -232,7 +234,7 @@ pub fn compute_coarse_tree_weights( // We now need to sum up the weights across all processes. - let mut global_weights = vec![0 as usize; global_coarse_tree.len()]; + let mut global_weights = vec![0; global_coarse_tree.len()]; comm.all_reduce_into( &local_weight_contribution, @@ -291,7 +293,7 @@ pub fn redistribute_with_respect_to_coarse_tree( // We now have the counts for each rank. Let's redistribute accordingly and return. - let result = redistribute(&linear_keys, &rank_counts, comm); + let result = redistribute(linear_keys, &rank_counts, comm); #[cfg(debug_assertions)] { @@ -330,7 +332,7 @@ pub fn create_local_tree( let bins = coarse_keys.to_vec(); - let counts = sort_to_bins(&sorted_fine_keys, &bins); + let counts = sort_to_bins(sorted_fine_keys, &bins); // We now know how many fine keys are associated with each coarse block. We iterate // through and locally refine for each block that requires it. @@ -378,7 +380,7 @@ pub fn linearize( // We are first sorting the keys. Then in a linear process across all processors we // go through the arrays and delete ancestors of nodes. - let sorted_keys = parsort(&keys, comm, rng); + let sorted_keys = parsort(keys, comm, rng); // Each process needs to send its first element to the previous process. 
Each process // then goes through its own list and retains elements that are not ancestors of the @@ -438,7 +440,7 @@ pub fn load_balance( // of each array to get the global sums and then we update the array of each rank // with the sum from the previous ranks. - let scan = global_inclusive_cumsum(&weights, comm); + let scan = global_inclusive_cumsum(weights, comm); // Now broadcast the total weight to all processes. @@ -477,7 +479,7 @@ pub fn load_balance( // We create a vector of how many elements to send to each process and // then send the actual data. - let mut recvbuffer = redistribute(&sorted_keys, &counts, comm); + let mut recvbuffer = redistribute(sorted_keys, &counts, comm); recvbuffer.sort_unstable(); recvbuffer @@ -673,9 +675,9 @@ pub fn redistribute_points_with_respect_to_coarse_tree(arr: &[MortonKey], co // Now check that at the first rank we include the deepest first. - if comm.rank() == 0 { - if !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { - complete_linear = false; - } + if comm.rank() == 0 && !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) { + complete_linear = false; } // Now communicate everything together. @@ -858,6 +858,122 @@ pub fn key_on_current_rank( } } +/// Generate all leaf and interior keys. +pub fn generate_all_keys( + leaf_tree: &[MortonKey], + coarse_tree: &[MortonKey], + coarse_tree_bounds: &[MortonKey], + comm: &C, +) -> HashMap { + /// This struct combines rank and key information for sending ghosts to neighbors. + #[derive(Copy, Clone, Equivalence)] + struct KeyWithRank { + key: MortonKey, + rank: usize, + } + + let rank = comm.rank() as usize; + let size = comm.size() as usize; + + let mut all_keys = HashMap::::new(); + let leaf_keys: HashSet = HashSet::from_iter(leaf_tree.iter().copied()); + + let mut global_keys = HashSet::::new(); + + // First deal with the parents of the coarse tree. These are different + // as they may exist on multiple nodes, so receive a different label. + + for &key in coarse_tree { + let mut parent = key.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + global_keys.insert(parent); + parent = parent.parent(); + } + } + + // We now send around the parents of the coarse tree to every node. These will + // be global keys. + + let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), comm); + + // We can now insert the global keys into `all_keys` with the `Global` label. + + for &key in &global_keys { + all_keys.entry(key).or_insert(KeyType::Global); + } + + // We now deal with the fine leafs and their ancestors. + // The leafs of the coarse tree will also be either part + // of the fine tree leafs or will be interior keys. In either + // case the following loop catches them. + + for leaf in leaf_keys { + debug_assert!(!all_keys.contains_key(&leaf)); + all_keys.insert(leaf, KeyType::LocalLeaf); + let mut parent = leaf.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + all_keys.insert(parent, KeyType::LocalInterior); + parent = parent.parent(); + } + } + + // This maps from rank to the keys that we want to send to the ranks + let mut rank_send_ghost = HashMap::>::new(); + for index in 0..size - 1 { + rank_send_ghost.insert(index, Vec::::new()); + } + + for (&key, &status) in all_keys.iter() { + // We need not send around global keys to neighbors. + if status == KeyType::Global { + continue; + } + for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { + // If the neighbour is a global key then continue. 
+ if let Some(&value) = all_keys.get(&neighbor) { + if value == KeyType::Global { + continue; + } + } + // Get rank of the neighbour + let neighbor_rank = get_key_index(coarse_tree_bounds, neighbor); + rank_send_ghost + .entry(neighbor_rank) + .and_modify(|keys| keys.push(KeyWithRank { key, rank })); + } + } + + // We now know which key needs to be sent to which rank. + // Turn to array, get the counts and send around. + + let (arr, counts) = { + let mut arr = Vec::::new(); + let mut counts = Vec::::new(); + for index in 0..size - 1 { + let keys = rank_send_ghost.get(&index).unwrap(); + arr.extend(keys.iter()); + counts.push(keys.len() as i32); + } + (arr, counts) + }; + + // These are all the keys that are neighbors to our keys. We now go through + // and store those that do not live on our tree as into `all_keys` with a label + // of `Ghost`. + let ghost_keys = redistribute(&arr, &counts, comm); + + for key in &ghost_keys { + if key.rank == rank { + // Don't need to add the keys that are already on the rank. + continue; + } + debug_assert!(!all_keys.contains_key(&key.key)); + all_keys.insert(key.key, KeyType::Ghost(key.rank)); + } + + all_keys +} + #[cfg(test)] mod test { use crate::{ diff --git a/src/tools.rs b/src/tools.rs index dec8779..a404f88 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -25,7 +25,7 @@ pub fn gather_to_all(arr: &[T], comm let local_len = arr.len() as i32; - let mut sizes = vec![0 as i32; size as usize]; + let mut sizes = vec![0; size as usize]; comm.all_gather_into(&local_len, &mut sizes); @@ -170,7 +170,7 @@ pub fn communicate_back( if rank == size - 1 { comm.process_at_rank(rank - 1).send(arr.first().unwrap()); - return None; + None } else { let (new_last, _status) = if rank > 0 { p2p::send_receive( @@ -221,9 +221,9 @@ pub fn redistribute( // First send the counts around via an alltoall operation. - let mut recv_counts = vec![0 as i32; counts.len()]; + let mut recv_counts = vec![0; counts.len()]; - comm.all_to_all_into(&counts[..], &mut recv_counts); + comm.all_to_all_into(counts, &mut recv_counts); // We have the recv_counts. Allocate space and setup the partitions. @@ -288,7 +288,7 @@ pub fn sort_to_bins(sorted_keys: &[T], bins: &[T]) -> Vec { return vec![sorted_keys.len(); 1]; } - let mut bin_counts = vec![0 as usize; nbins]; + let mut bin_counts = vec![0; nbins]; // This iterates over each possible bin and returns also the associated rank. // The last bin position is not iterated over since for an array with p elements From 6f01f6c585a51b98cb7afd73a32796bb2e537dac Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 11:06:41 +0100 Subject: [PATCH 36/42] Testing parallel octree --- examples/mpi_complete_tree.rs | 75 ++++++++++++++--- src/octree.rs | 6 +- src/octree/parallel.rs | 151 ++++++++++++++++++++-------------- src/tools.rs | 4 + 4 files changed, 162 insertions(+), 74 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index b0a5f43..78a0c23 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -1,10 +1,11 @@ -//! Test the computation of a global bounding box across MPI ranks. +//! Test the computation of a complete octree. 
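//!
//! The example builds a distributed octree from random points and checks that every
//! interior key and all of its neighbours are available locally, that the root key is
//! present, and that all ranks agree on the number of global keys.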
use bempp_octree::{ - constants::DEEPEST_LEVEL, - octree::{complete_tree, is_complete_linear_tree, linearize, points_to_morton}, - tools::generate_random_points, + morton::MortonKey, + octree::{is_complete_linear_and_balanced, KeyType, Octree}, + tools::{gather_to_all, generate_random_points}, }; +use itertools::Itertools; use mpi::traits::Communicator; use rand::prelude::*; use rand_chacha::ChaCha8Rng; @@ -20,21 +21,73 @@ pub fn main() { let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); // Create `npoints` per rank. - let npoints = 10; + let npoints = 10000; // Generate random points. let points = generate_random_points(npoints, &mut rng, &comm); - // Compute the Morton keys on the deepest level - let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); + let tree = Octree::new(&points, 15, 50, &comm); - let linear_keys = linearize(&keys, &mut rng, &comm); + // We now check that each node of the tree has all its neighbors available. - // Generate a complete tree - let distributed_complete_tree = complete_tree(&linear_keys, &comm); + let leaf_tree = tree.leaf_tree(); + let all_keys = tree.all_keys(); - assert!(is_complete_linear_tree(&distributed_complete_tree, &comm)); + assert!(is_complete_linear_and_balanced(leaf_tree, &comm)); + for &key in leaf_tree { + let mut parent = key; + while parent.level() > 0 { + // Check that the key itself is there. + assert!(all_keys.contains_key(&key)); + // Check that all its neighbours are there. + for neighbor in parent.neighbours().iter().filter(|&key| key.is_valid()) { + if !all_keys.contains_key(neighbor) { + println!( + "Missing neighbor: {}. Key type {:#?}", + neighbor, + all_keys.get(&parent).unwrap() + ); + } + assert!(all_keys.contains_key(neighbor)); + } + parent = parent.parent(); + // Check that the parent is there. + assert!(all_keys.contains_key(&parent)); + } + } + + // At the end check that the root of the tree is also contained. + assert!(all_keys.contains_key(&MortonKey::root())); + + // Count the number of ghosts on each rank + + // Count the number of global keys on each rank. + + // Assert that all ghosts are from a different rank and count them. + + let nghosts = all_keys + .iter() + .filter_map(|(_, &value)| { + if let KeyType::Ghost(rank) = value { + assert!(rank != comm.size() as usize); + Some(rank) + } else { + None + } + }) + .count(); + + let nglobal = all_keys + .iter() + .filter(|(_, &value)| matches!(value, KeyType::Global)) + .count(); + + // Assert that all globals across all ranks have the same count. + + let nglobals = gather_to_all(std::slice::from_ref(&nglobal), &comm); + + assert_eq!(nglobals.iter().unique().count(), 1); if comm.rank() == 0 { println!("Distributed tree is complete and linear."); diff --git a/src/octree.rs b/src/octree.rs index 537fa1f..5dbd076 100644 --- a/src/octree.rs +++ b/src/octree.rs @@ -14,7 +14,7 @@ use crate::{ }; /// Stores what type of key it is. -#[derive(PartialEq, Eq, Hash, Copy, Clone)] +#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)] pub enum KeyType { /// A local leaf. LocalLeaf, @@ -55,7 +55,10 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { let linear_keys = linearize(&point_keys, &mut rng, comm); // Compute the first version of the coarse tree without load balancing. + // We want to ensure that it is 2:1 balanced. 
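        // (2:1 balanced: adjacent leaf octants differ by at most one refinement level.)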
let coarse_tree = compute_coarse_tree(&linear_keys, comm); + + let coarse_tree = balance(&coarse_tree, &mut rng, comm); debug_assert!(is_complete_linear_tree(&coarse_tree, comm)); // We now compute the weights for the initial coarse tree. @@ -66,7 +69,6 @@ impl<'o, C: CommunicatorCollectives> Octree<'o, C> { // that is used from now on. let coarse_tree = load_balance(&coarse_tree, &weights, comm); - // We also want to redistribute the fine keys with respect to the load balanced coarse trees. let fine_keys = diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs index 1b873fb..4a8a80f 100644 --- a/src/octree/parallel.rs +++ b/src/octree/parallel.rs @@ -559,6 +559,13 @@ pub fn balance( rng: &mut R, comm: &C, ) -> Vec { + // Treat the case that the length of the keys is one and is only the root. + // This would lead to an empty output below as we only iterate up to level 1. + + if linear_keys.len() == 1 && *linear_keys.first().unwrap() == MortonKey::root() { + return vec![MortonKey::root()]; + } + let deepest_level = deepest_level(linear_keys, comm); // Start with keys at deepest level @@ -602,7 +609,6 @@ pub fn balance( ); work_list = new_work_list; - // Now extend the work list with the } let result = linearize(&result, rng, comm); @@ -653,6 +659,10 @@ pub fn redistribute_points_with_respect_to_coarse_tree (Vec, Vec) { + if comm.size() == 1 { + return (points.to_vec(), morton_keys_for_points.to_vec()); + } + pub fn argsort(arr: &[T]) -> Vec { let mut sort_indices = (0..arr.len()).collect_vec(); sort_indices.sort_unstable_by_key(|&index| arr[index]); @@ -878,28 +888,32 @@ pub fn generate_all_keys( let mut all_keys = HashMap::::new(); let leaf_keys: HashSet = HashSet::from_iter(leaf_tree.iter().copied()); - let mut global_keys = HashSet::::new(); + // If size == 1 we simply create locally the keys, so don't need to treat the global keys. - // First deal with the parents of the coarse tree. These are different - // as they may exist on multiple nodes, so receive a different label. + if size > 1 { + let mut global_keys = HashSet::::new(); - for &key in coarse_tree { - let mut parent = key.parent(); - while parent.level() > 0 && !all_keys.contains_key(&parent) { - global_keys.insert(parent); - parent = parent.parent(); + // First deal with the parents of the coarse tree. These are different + // as they may exist on multiple nodes, so receive a different label. + + for &key in coarse_tree { + let mut parent = key.parent(); + while parent.level() > 0 && !all_keys.contains_key(&parent) { + global_keys.insert(parent); + parent = parent.parent(); + } } - } - // We now send around the parents of the coarse tree to every node. These will - // be global keys. + // We now send around the parents of the coarse tree to every node. These will + // be global keys. - let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), comm); + let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), comm); - // We can now insert the global keys into `all_keys` with the `Global` label. + // We can now insert the global keys into `all_keys` with the `Global` label. - for &key in &global_keys { - all_keys.entry(key).or_insert(KeyType::Global); + for &key in &global_keys { + all_keys.entry(key).or_insert(KeyType::Global); + } } // We now deal with the fine leafs and their ancestors. 
@@ -917,58 +931,73 @@ pub fn generate_all_keys( } } - // This maps from rank to the keys that we want to send to the ranks - let mut rank_send_ghost = HashMap::>::new(); - for index in 0..size - 1 { - rank_send_ghost.insert(index, Vec::::new()); - } + // Need to explicitly add the root at the end. + all_keys.entry(MortonKey::root()).or_insert(KeyType::Global); - for (&key, &status) in all_keys.iter() { - // We need not send around global keys to neighbors. - if status == KeyType::Global { - continue; - } - for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { - // If the neighbour is a global key then continue. - if let Some(&value) = all_keys.get(&neighbor) { - if value == KeyType::Global { - continue; - } - } - // Get rank of the neighbour - let neighbor_rank = get_key_index(coarse_tree_bounds, neighbor); - rank_send_ghost - .entry(neighbor_rank) - .and_modify(|keys| keys.push(KeyWithRank { key, rank })); - } - } + // We only need to deal with ghosts if the size is larger than 1. - // We now know which key needs to be sent to which rank. - // Turn to array, get the counts and send around. + if size > 1 { + // This maps from rank to the keys that we want to send to the ranks - let (arr, counts) = { - let mut arr = Vec::::new(); - let mut counts = Vec::::new(); - for index in 0..size - 1 { - let keys = rank_send_ghost.get(&index).unwrap(); - arr.extend(keys.iter()); - counts.push(keys.len() as i32); + let mut rank_send_ghost = HashMap::>::new(); + for index in 0..size { + rank_send_ghost.insert(index, Vec::::new()); } - (arr, counts) - }; - // These are all the keys that are neighbors to our keys. We now go through - // and store those that do not live on our tree as into `all_keys` with a label - // of `Ghost`. - let ghost_keys = redistribute(&arr, &counts, comm); + let mut send_to_all = Vec::::new(); - for key in &ghost_keys { - if key.rank == rank { - // Don't need to add the keys that are already on the rank. - continue; + for (&key, &status) in all_keys.iter() { + // We need not send around global keys to neighbors. + if status == KeyType::Global { + continue; + } + for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) { + // If the neighbour is a global key then continue. + if all_keys + .get(&neighbor) + .is_some_and(|&value| value == KeyType::Global) + { + // Global keys exist on all nodes, so need to send their neighbors to all nodes. + send_to_all.push(KeyWithRank { key, rank }); + } else { + // Get rank of the neighbour + let neighbor_rank = get_key_index(coarse_tree_bounds, neighbor); + rank_send_ghost + .entry(neighbor_rank) + .and_modify(|keys| keys.push(KeyWithRank { key, rank })); + } + } + } + + let send_ghost_to_all = gather_to_all(&send_to_all, comm); + // We now know which key needs to be sent to which rank. + // Turn to array, get the counts and send around. + + let (arr, counts) = { + let mut arr = Vec::::new(); + let mut counts = Vec::::new(); + for index in 0..size { + let keys = rank_send_ghost.get(&index).unwrap(); + arr.extend(keys.iter()); + counts.push(keys.len() as i32); + } + (arr, counts) + }; + + // These are all the keys that are neighbors to our keys. We now go through + // and store those that do not live on our tree as into `all_keys` with a label + // of `Ghost`. + let mut ghost_keys = redistribute(&arr, &counts, comm); + // Add the neighbors of any global key. 
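        // These were gathered to all ranks above because a global key has a copy on
        // every rank, so its neighbours may be needed anywhere.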
+ ghost_keys.extend(send_ghost_to_all.iter()); + + for key in &ghost_keys { + if key.rank == rank { + // Don't need to add the keys that are already on the rank. + continue; + } + all_keys.insert(key.key, KeyType::Ghost(key.rank)); } - debug_assert!(!all_keys.contains_key(&key.key)); - all_keys.insert(key.key, KeyType::Ghost(key.rank)); } all_keys diff --git a/src/tools.rs b/src/tools.rs index a404f88..ad4d8b5 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -168,6 +168,10 @@ pub fn communicate_back( let rank = comm.rank(); let size = comm.size(); + if size == 1 { + return None; + } + if rank == size - 1 { comm.process_at_rank(rank - 1).send(arr.first().unwrap()); None From 5edcfb0d80955b974bc4837b386525075767cb0a Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 11:26:59 +0100 Subject: [PATCH 37/42] Dealing with neighbours --- examples/mpi_complete_tree.rs | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index 78a0c23..3c727ab 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -36,19 +36,14 @@ pub fn main() { assert!(is_complete_linear_and_balanced(leaf_tree, &comm)); for &key in leaf_tree { - let mut parent = key; + // We only check interior keys. Leaf keys may not have a neighbor + // on the same level. + let mut parent = key.parent(); while parent.level() > 0 { // Check that the key itself is there. assert!(all_keys.contains_key(&key)); // Check that all its neighbours are there. for neighbor in parent.neighbours().iter().filter(|&key| key.is_valid()) { - if !all_keys.contains_key(neighbor) { - println!( - "Missing neighbor: {}. Key type {:#?}", - neighbor, - all_keys.get(&parent).unwrap() - ); - } assert!(all_keys.contains_key(neighbor)); } parent = parent.parent(); From de1b130a4d36c754c8e67b9e8ca7a3ad5fc34d55 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 12:33:19 +0100 Subject: [PATCH 38/42] Fixed doc bugs --- examples/mpi_complete_tree.rs | 6 ++++++ src/tools.rs | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index 3c727ab..bcff3b8 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -73,6 +73,12 @@ pub fn main() { }) .count(); + if comm.size() == 0 { + assert_eq!(nghosts, 0); + } else { + assert!(nghosts > 0); + } + let nglobal = all_keys .iter() .filter(|(_, &value)| matches!(value, KeyType::Global)) diff --git a/src/tools.rs b/src/tools.rs index ad4d8b5..dbe8edd 100644 --- a/src/tools.rs +++ b/src/tools.rs @@ -347,9 +347,9 @@ pub fn sort_to_bins(sorted_keys: &[T], bins: &[T]) -> Vec { /// /// - The array `sorted_keys` is assumed to be sorted within each process. It needs not be globally sorted. /// - If there are `r` ranks in the communicator, the size of `bins` must be `r`. -/// - The bins are defined through half-open intervals (bin[0], bin[1]), .... This defines r-1 bins. The -/// last bin is the half-open interval [bin[r-1], \infty). -/// - All array elements must be larger or equal bin[0]. This means that each element can be sorted into a bin. +/// - The bins are defined through half-open intervals `(bin[0], bin[1])`, .... This defines r-1 bins. The +/// last bin is the half-open interval `[bin[r-1], \infty)`. +/// - All array elements must be larger or equal `bin[0]`. This means that each element can be sorted into a bin. 
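/// - For example, with three ranks and `bins = [0, 10, 20]`, keys in `[0, 10)` go to
///   rank 0, keys in `[10, 20)` to rank 1, and keys of at least `20` to rank 2.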
pub fn redistribute_by_bins( sorted_keys: &[T], bins: &[T], From ff8ecfe9a6eebc4cd3bab63ca757f03606344a24 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 16:31:24 +0100 Subject: [PATCH 39/42] Removed unnecessary dependency --- Cargo.toml | 9 +++++---- examples/mpi_complete_tree.rs | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a75d666..ffd3a3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,9 @@ battleship = [] name = "bempp-octree" version = "0.0.1-dev" edition = "2021" -authors = ["Srinath Kailasa , Timo Betcke "] +authors = [ + "Srinath Kailasa , Timo Betcke ", +] description = "A library to create Octrees" license = "BSD-3-Clause" homepage = "https://github.com/bempp/octree" @@ -27,12 +29,11 @@ rand_chacha = "0.3.*" num = "0.4.*" bytemuck = "1.*" vtkio = "0.6.*" -mpi = {version = "0.8.*", features = ["derive", "user-operations"] } -once_cell = "*" +mpi = { version = "0.8.*", features = ["derive", "user-operations"] } [profile.release] debug = 1 - + [dev-dependencies] rand_distr = "0.4.3" #criterion = { version = "0.5.*", features = ["html_reports"]} diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index bcff3b8..a99a5ca 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -56,7 +56,6 @@ pub fn main() { assert!(all_keys.contains_key(&MortonKey::root())); // Count the number of ghosts on each rank - // Count the number of global keys on each rank. // Assert that all ghosts are from a different rank and count them. From 70eebe7bc3e1898fe502d62a040948060bb28ecf Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 16:38:20 +0100 Subject: [PATCH 40/42] Removed bytemuck --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ffd3a3e..63eb3ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,7 +27,6 @@ itertools = "0.13.*" rand = { version = "0.8.5", features = ["alloc"] } rand_chacha = "0.3.*" num = "0.4.*" -bytemuck = "1.*" vtkio = "0.6.*" mpi = { version = "0.8.*", features = ["derive", "user-operations"] } From c8e5ac7b1ee1623ee794bd81782fe2def62197f0 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 16:41:37 +0100 Subject: [PATCH 41/42] Fixed error --- examples/mpi_complete_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs index a99a5ca..68633e1 100644 --- a/examples/mpi_complete_tree.rs +++ b/examples/mpi_complete_tree.rs @@ -72,7 +72,7 @@ pub fn main() { }) .count(); - if comm.size() == 0 { + if comm.size() == 1 { assert_eq!(nghosts, 0); } else { assert!(nghosts > 0); From eacb84899ce74ccc711f64c27989e472a19a03b5 Mon Sep 17 00:00:00 2001 From: Timo Betcke Date: Fri, 4 Oct 2024 16:45:27 +0100 Subject: [PATCH 42/42] Deleted coarse tree example --- examples/mpi_coarse_tree.rs | 140 ------------------------------------ 1 file changed, 140 deletions(-) delete mode 100644 examples/mpi_coarse_tree.rs diff --git a/examples/mpi_coarse_tree.rs b/examples/mpi_coarse_tree.rs deleted file mode 100644 index b8f1095..0000000 --- a/examples/mpi_coarse_tree.rs +++ /dev/null @@ -1,140 +0,0 @@ -//! Test the computation of a global bounding box across MPI ranks. 
- -use bempp_octree::{ - constants::DEEPEST_LEVEL, - morton::MortonKey, - octree::{ - balance, compute_coarse_tree, compute_coarse_tree_weights, create_local_tree, - is_complete_linear_tree, linearize, load_balance, points_to_morton, - redistribute_points_with_respect_to_coarse_tree, redistribute_with_respect_to_coarse_tree, - }, - tools::{communicate_back, generate_random_points, global_size, is_sorted_array}, -}; -use mpi::{ - collective::SystemOperation, - traits::{Communicator, CommunicatorCollectives}, -}; -use rand::prelude::*; -use rand_chacha::ChaCha8Rng; - -pub fn main() { - // Initialise MPI - let universe = mpi::initialize().unwrap(); - - // Get the world communicator - let comm = universe.world(); - - // Initialise a seeded Rng. - let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); - - // Create `npoints` per rank. - let npoints = 10000; - - // Generate random points. - - let points = generate_random_points(npoints, &mut rng, &comm); - - // Compute the Morton keys on the deepest level - let (keys, _) = points_to_morton(&points, DEEPEST_LEVEL as usize, &comm); - - // linearize the keys - let linear_keys = linearize(&keys, &mut rng, &comm); - - // Generate the coarse tree - let coarse_tree = compute_coarse_tree(&linear_keys, &comm); - assert!(is_complete_linear_tree(&coarse_tree, &comm)); - - // We now compute the weights for the coarse tree. - - let weights = compute_coarse_tree_weights(&linear_keys, &coarse_tree, &comm); - - // Assert that the global sum of the weights is identical to the number of linearized keys. - - let mut global_weight: usize = 0; - - comm.all_reduce_into( - &(weights.iter().sum::()), - &mut global_weight, - SystemOperation::sum(), - ); - - assert_eq!(global_weight, global_size(&linear_keys, &comm)); - - // Now load balance the coarse tree - - let load_balanced_coarse_keys = load_balance(&coarse_tree, &weights, &comm); - - // Compute the weights of the balanced keys - - let load_balanced_weights = - compute_coarse_tree_weights(&linear_keys, &load_balanced_coarse_keys, &comm); - - let mut global_balanced_weight: usize = 0; - comm.all_reduce_into( - &(load_balanced_weights.iter().sum::()), - &mut global_balanced_weight, - SystemOperation::sum(), - ); - - // The global weight of the non-balanced keys should be identical - // to the global weigth of the balanced keys. - - assert_eq!(global_weight, global_balanced_weight); - - // Now compute the new fine keys. - - let load_balanced_fine_keys = - redistribute_with_respect_to_coarse_tree(&linear_keys, &load_balanced_coarse_keys, &comm); - - assert_eq!( - global_size(&load_balanced_fine_keys, &comm), - global_size(&linear_keys, &comm) - ); - - let refined_tree = - create_local_tree(&load_balanced_fine_keys, &load_balanced_coarse_keys, 6, 100); - - assert!(is_complete_linear_tree(&refined_tree, &comm)); - - // Now balance the tree. - - let balanced_tree = balance(&refined_tree, &mut rng, &comm); - - // redistribute the balanced tree according to coarse tree - - let balanced_tree = - redistribute_with_respect_to_coarse_tree(&balanced_tree, &load_balanced_coarse_keys, &comm); - - assert!(is_complete_linear_tree(&balanced_tree, &comm)); - - // Redistribute original keys and points with respect to balanced coarse tree. 
- - let (balanced_points, balanced_keys) = redistribute_points_with_respect_to_coarse_tree( - &points, - &keys, - &load_balanced_coarse_keys, - &comm, - ); - - let upper_bound; - - if let Some(next_key) = communicate_back(&load_balanced_coarse_keys, &comm) { - upper_bound = next_key; - } else { - upper_bound = MortonKey::upper_bound(); - } - - assert!(load_balanced_coarse_keys.first().unwrap() <= balanced_keys.first().unwrap()); - assert!(*balanced_keys.last().unwrap() < upper_bound); - assert!(is_sorted_array(&balanced_keys, &comm)); - - println!( - "Rank {} has {} balanced points.", - comm.rank(), - balanced_points.len(), - ); - - if comm.rank() == 0 { - println!("Coarse tree successfully created and weights computed."); - } -}
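Taken together, these patches leave the crate with a distributed `Octree` type whose constructor linearizes, load balances, 2:1 balances and completes the leaf tree, and which exposes the resulting key classification through `all_keys()`. The sketch below is a minimal usage example assembled from the APIs exercised in the examples above (`Octree::new`, `leaf_tree`, `all_keys`, `KeyType`, `generate_random_points`); the point count and the tree parameters are illustrative choices, not values prescribed by the library.

//! Minimal usage sketch; run under MPI, e.g. `mpirun -n 4 ...`.
use bempp_octree::{
    octree::{KeyType, Octree},
    tools::generate_random_points,
};
use mpi::traits::Communicator;
use rand::SeedableRng;
use rand_chacha::ChaCha8Rng;

pub fn main() {
    let universe = mpi::initialize().unwrap();
    let comm = universe.world();

    // Seed the rng differently on each rank so that the point clouds differ.
    let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64);

    // 1000 points per rank; refine to at most level 15 with at most 50 points
    // per leaf box (illustrative parameters).
    let points = generate_random_points(1000, &mut rng, &comm);
    let tree = Octree::new(&points, 15, 50, &comm);

    // Count the locally owned leaves and the ghost keys received from other ranks.
    let nleaves = tree.leaf_tree().len();
    let nghosts = tree
        .all_keys()
        .values()
        .filter(|&&status| matches!(status, KeyType::Ghost(_)))
        .count();

    println!(
        "Rank {}: {} leaf keys, {} ghost keys.",
        comm.rank(),
        nleaves,
        nghosts
    );
}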