diff --git a/Cargo.toml b/Cargo.toml index e089311..63eb3ca 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,9 @@ battleship = [] name = "bempp-octree" version = "0.0.1-dev" edition = "2021" -authors = ["Srinath Kailasa , Timo Betcke "] +authors = [ + "Srinath Kailasa , Timo Betcke ", +] description = "A library to create Octrees" license = "BSD-3-Clause" homepage = "https://github.com/bempp/octree" @@ -22,14 +24,15 @@ crate-type = ["cdylib", "lib"] [dependencies] itertools = "0.13.*" -rand = "0.8.5" -bytemuck = "1.*" +rand = { version = "0.8.5", features = ["alloc"] } +rand_chacha = "0.3.*" +num = "0.4.*" vtkio = "0.6.*" -mpi = {version = "0.8.*", features = ["derive", "user-operations"] } +mpi = { version = "0.8.*", features = ["derive", "user-operations"] } [profile.release] debug = 1 - + [dev-dependencies] rand_distr = "0.4.3" #criterion = { version = "0.5.*", features = ["html_reports"]} diff --git a/examples/battleship.rs b/examples/battleship.rs index 97f7c81..50587d9 100644 --- a/examples/battleship.rs +++ b/examples/battleship.rs @@ -4,7 +4,7 @@ use std::time::Instant; #[cfg(feature = "battleship")] -use bempp_octree::octree::Octree; +use bempp_octree::serial::Octree; #[cfg(feature = "battleship")] use vtkio::model::*; diff --git a/examples/mpi_complete_tree.rs b/examples/mpi_complete_tree.rs new file mode 100644 index 0000000..68633e1 --- /dev/null +++ b/examples/mpi_complete_tree.rs @@ -0,0 +1,95 @@ +//! Test the computation of a complete octree. + +use bempp_octree::{ + morton::MortonKey, + octree::{is_complete_linear_and_balanced, KeyType, Octree}, + tools::{gather_to_all, generate_random_points}, +}; +use itertools::Itertools; +use mpi::traits::Communicator; +use rand::prelude::*; +use rand_chacha::ChaCha8Rng; + +pub fn main() { + // Initialise MPI + let universe = mpi::initialize().unwrap(); + + // Get the world communicator + let comm = universe.world(); + + // Initialise a seeded Rng. + let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64); + + // Create `npoints` per rank. + let npoints = 10000; + + // Generate random points. + + let points = generate_random_points(npoints, &mut rng, &comm); + + let tree = Octree::new(&points, 15, 50, &comm); + + // We now check that each node of the tree has all its neighbors available. + + let leaf_tree = tree.leaf_tree(); + let all_keys = tree.all_keys(); + + assert!(is_complete_linear_and_balanced(leaf_tree, &comm)); + for &key in leaf_tree { + // We only check interior keys. Leaf keys may not have a neighbor + // on the same level. + let mut parent = key.parent(); + while parent.level() > 0 { + // Check that the key itself is there. + assert!(all_keys.contains_key(&key)); + // Check that all its neighbours are there. + for neighbor in parent.neighbours().iter().filter(|&key| key.is_valid()) { + assert!(all_keys.contains_key(neighbor)); + } + parent = parent.parent(); + // Check that the parent is there. + assert!(all_keys.contains_key(&parent)); + } + } + + // At the end check that the root of the tree is also contained. + assert!(all_keys.contains_key(&MortonKey::root())); + + // Count the number of ghosts on each rank + // Count the number of global keys on each rank. + + // Assert that all ghosts are from a different rank and count them. 
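+    // Note (added for clarity): a `Ghost(rank)` entry marks a key that lives on
+    // another rank but is replicated here so that neighbour lookups can be
+    // resolved locally.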
+
+    let nghosts = all_keys
+        .iter()
+        .filter_map(|(_, &value)| {
+            if let KeyType::Ghost(rank) = value {
+                assert!(rank != comm.rank() as usize);
+                Some(rank)
+            } else {
+                None
+            }
+        })
+        .count();
+
+    if comm.size() == 1 {
+        assert_eq!(nghosts, 0);
+    } else {
+        assert!(nghosts > 0);
+    }
+
+    let nglobal = all_keys
+        .iter()
+        .filter(|(_, &value)| matches!(value, KeyType::Global))
+        .count();
+
+    // Assert that the count of global keys is the same across all ranks.
+
+    let nglobals = gather_to_all(std::slice::from_ref(&nglobal), &comm);
+
+    assert_eq!(nglobals.iter().unique().count(), 1);
+
+    if comm.rank() == 0 {
+        println!("Distributed tree is complete and linear.");
+    }
+}
diff --git a/examples/mpi_cumsum.rs b/examples/mpi_cumsum.rs
new file mode 100644
index 0000000..1b3fcc3
--- /dev/null
+++ b/examples/mpi_cumsum.rs
@@ -0,0 +1,62 @@
+//! Test the computation of a global inclusive cumulative sum across MPI ranks.
+
+use bempp_octree::tools::{gather_to_root, global_inclusive_cumsum};
+use itertools::{izip, Itertools};
+use mpi::traits::Communicator;
+use rand::prelude::*;
+use rand_chacha::ChaCha8Rng;
+
+pub fn main() {
+    // Initialise MPI
+    let universe = mpi::initialize().unwrap();
+
+    // Get the world communicator
+    let comm = universe.world();
+
+    // Initialise a seeded Rng.
+    let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64);
+
+    // Create `nelems` elements per rank.
+    let nelems = 10;
+
+    // Generate random numbers.
+
+    let mut elems = Vec::<usize>::with_capacity(nelems);
+
+    for _ in 0..nelems {
+        elems.push(rng.gen_range(0..100));
+    }
+
+    // Compute the cumulative sum.
+
+    let global_cum_sum = global_inclusive_cumsum(&elems, &comm);
+
+    // Copy array to root and compare with inclusive scan there.
+
+    if let (Some(cum_sum_root), Some(original_array)) = (
+        gather_to_root(&global_cum_sum, &comm),
+        gather_to_root(&elems, &comm),
+    ) {
+        // Scan on root.
+
+        let expected_cum_sum = original_array
+            .iter()
+            .scan(0, |state, x| {
+                *state += *x;
+                Some(*state)
+            })
+            .collect_vec();
+
+        // Check that the first element is not modified (inclusive cumsum).
+        assert_eq!(
+            original_array.first().unwrap(),
+            cum_sum_root.first().unwrap()
+        );
+
+        for (actual, expected) in izip!(cum_sum_root.iter(), expected_cum_sum.iter()) {
+            assert_eq!(*actual, *expected);
+        }
+
+        println!("Cumulative sum computed.");
+    }
+}
diff --git a/examples/mpi_global_bounding_box.rs b/examples/mpi_global_bounding_box.rs
new file mode 100644
index 0000000..144d0e5
--- /dev/null
+++ b/examples/mpi_global_bounding_box.rs
@@ -0,0 +1,40 @@
+//! Test the computation of a global bounding box across MPI ranks.
+
+use bempp_octree::{
+    geometry::PhysicalBox,
+    octree::compute_global_bounding_box,
+    tools::{gather_to_root, generate_random_points},
+};
+use rand::prelude::*;
+use rand_chacha::ChaCha8Rng;
+
+pub fn main() {
+    // Initialise MPI
+    let universe = mpi::initialize().unwrap();
+
+    // Get the world communicator
+    let comm = universe.world();
+
+    // Initialise a seeded Rng.
+    let mut rng = ChaCha8Rng::seed_from_u64(2);
+
+    // Create `npoints` per rank.
+    let npoints = 10;
+
+    // Generate random points.
+
+    let points = generate_random_points(npoints, &mut rng, &comm);
+
+    // Compute the distributed bounding box.
+
+    let bounding_box = compute_global_bounding_box(&points, &comm);
+
+    // Copy all points to root and compare local bounding box there.
+
+    if let Some(points_root) = gather_to_root(&points, &comm) {
+        // Compute the bounding box on root.
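+        // `PhysicalBox::from_points` computes the box serially from the gathered
+        // points, so it serves as the reference result for the comparison below.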
+
+        let expected = PhysicalBox::from_points(&points_root);
+        assert_eq!(expected.coordinates(), bounding_box.coordinates());
+    }
+}
diff --git a/examples/parsort.rs b/examples/parsort.rs
index 6361f6c..5de9c07 100644
--- a/examples/parsort.rs
+++ b/examples/parsort.rs
@@ -1,37 +1,25 @@
 //! Testing the hyksort component.
-use bempp_octree::parsort::{array_to_root, parsort};
-use itertools::Itertools;
+use bempp_octree::{
+    parsort::parsort,
+    tools::{generate_random_keys, is_sorted_array},
+};
 use mpi::traits::Communicator;
 use rand::prelude::*;
 
 pub fn main() {
     let universe = mpi::initialize().unwrap();
     let world = universe.world();
-    let rank = world.rank() as u64;
 
     let n_per_rank = 1000;
 
     let mut rng = rand::rngs::StdRng::seed_from_u64(0);
 
-    let mut arr = Vec::<u64>::new();
+    let keys = generate_random_keys(n_per_rank, &mut rng);
 
-    for _ in 0..n_per_rank {
-        arr.push(rng.gen());
-    }
-
-    // let splitters = get_splitters(&arr, &world, &mut rng);
-
-    // let bin_displs = get_bin_displacements(&arr, &splitters);
-
-    let arr = parsort(&arr, &world, &mut rng);
-    let arr = array_to_root(&arr, &world);
+    let arr = parsort(&keys, &world, &mut rng);
 
-    if rank == 0 {
-        let arr = arr.unwrap();
+    assert!(is_sorted_array(&arr, &world));
 
-        for (elem1, elem2) in arr.iter().tuple_windows() {
-            assert!(elem1 <= elem2);
-        }
-        println!("Sorted {} elements.", arr.len());
-        println!("Finished.");
+    if world.rank() == 0 {
+        println!("Array is sorted.");
     }
 }
diff --git a/src/geometry.rs b/src/geometry.rs
index 303fdf3..62b9753 100644
--- a/src/geometry.rs
+++ b/src/geometry.rs
@@ -1,9 +1,33 @@
 //! Geometry information
-use bytemuck;
+use mpi::traits::Equivalence;
 
 use crate::constants::DEEPEST_LEVEL;
 
+/// Definition of a point.
+#[derive(Clone, Copy, Equivalence)]
+pub struct Point {
+    coords: [f64; 3],
+    global_id: usize,
+}
+
+impl Point {
+    /// Create a new point from coordinates and global id.
+    pub fn new(coords: [f64; 3], global_id: usize) -> Self {
+        Self { coords, global_id }
+    }
+
+    /// Return the coordinates of a point.
+    pub fn coords(&self) -> [f64; 3] {
+        self.coords
+    }
+
+    /// Return the global id of the point.
+    pub fn global_id(&self) -> usize {
+        self.global_id
+    }
+}
+
 /// A bounding box describes geometry in which an Octree lives.
 pub struct PhysicalBox {
     coords: [f64; 6],
@@ -18,11 +42,7 @@ impl PhysicalBox {
     }
 
     /// Given a slice of points, compute an associated bounding box.
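+    /// # Example
+    ///
+    /// A minimal sketch with made-up values (marked `ignore` since it is
+    /// illustrative only):
+    ///
+    /// ```ignore
+    /// let points = [Point::new([0.0, 0.0, 0.0], 0), Point::new([1.0, 2.0, 3.0], 1)];
+    /// let bbox = PhysicalBox::from_points(&points);
+    /// println!("{:?}", bbox.coordinates());
+    /// ```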
-    pub fn from_points(points: &[f64]) -> PhysicalBox {
-        assert_eq!(points.len() % 3, 0);
-
-        let points: &[[f64; 3]] = bytemuck::cast_slice(points);
-
+    pub fn from_points(points: &[Point]) -> PhysicalBox {
         let mut xmin = f64::MAX;
         let mut xmax = f64::MIN;
 
@@ -33,9 +53,9 @@ impl PhysicalBox {
         let mut zmax = f64::MIN;
 
         for point in points {
-            let x = point[0];
-            let y = point[1];
-            let z = point[2];
+            let x = point.coords()[0];
+            let y = point.coords()[1];
+            let z = point.coords()[2];
 
             xmin = f64::min(xmin, x);
             xmax = f64::max(xmax, x);
diff --git a/src/lib.rs b/src/lib.rs
index 3f5fbcc..c94ad07 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,4 +7,6 @@ pub mod geometry;
 pub mod morton;
 pub mod octree;
 pub mod parsort;
+pub mod serial;
+pub mod tools;
 pub mod types;
diff --git a/src/morton.rs b/src/morton.rs
index 6701ead..5fee9a3 100644
--- a/src/morton.rs
+++ b/src/morton.rs
@@ -5,16 +5,17 @@ use crate::constants::{
     LEVEL_SIZE, NINE_BIT_MASK, NSIBLINGS, X_LOOKUP_DECODE, X_LOOKUP_ENCODE, Y_LOOKUP_DECODE,
     Y_LOOKUP_ENCODE, Z_LOOKUP_DECODE, Z_LOOKUP_ENCODE,
 };
-use crate::geometry::PhysicalBox;
+use crate::geometry::{PhysicalBox, Point};
 use itertools::izip;
 use itertools::Itertools;
+use mpi::traits::Equivalence;
 use std::collections::HashSet;
 
 /// A morton key
 ///
 /// This is a distinct type to distinguish from u64
 /// numbers.
-#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Equivalence)]
 pub struct MortonKey {
     value: u64,
 }
@@ -34,6 +35,13 @@ impl MortonKey {
         key
     }
 
+    /// A key that is not valid or well-formed but is guaranteed to compare greater than any valid key.
+    ///
+    /// This is useful when a guaranteed upper bound is needed.
+    pub fn upper_bound() -> Self {
+        Self { value: u64::MAX }
+    }
+
     /// Check if a key is invalid.
     pub fn invalid_key() -> Self {
         Self { value: 1 << 63 }
@@ -147,9 +155,9 @@ impl MortonKey {
 
     /// Map a physical point within a bounding box to a Morton key on a given level.
     /// It is assumed that points are strictly contained within the bounding box.
-    pub fn from_physical_point(point: [f64; 3], bounding_box: &PhysicalBox, level: usize) -> Self {
+    pub fn from_physical_point(point: Point, bounding_box: &PhysicalBox, level: usize) -> Self {
         let level_size = 1 << level;
-        let reference = bounding_box.physical_to_reference(point);
+        let reference = bounding_box.physical_to_reference(point.coords());
         let x = (reference[0] * level_size as f64) as usize;
         let y = (reference[1] * level_size as f64) as usize;
         let z = (reference[2] * level_size as f64) as usize;
@@ -381,6 +389,10 @@ impl MortonKey {
         let mut result = [MortonKey::default(); 26];
 
         let (level, [x, y, z]) = self.decode();
+
+        if level == 0 {
+            return result;
+        }
         let level_size = 1 << level;
 
         for (direction, res) in izip!(DIRECTIONS, result.iter_mut()) {
@@ -409,6 +421,69 @@ impl MortonKey {
         result
     }
 
+    /// Return the index of the key as a child of the parent, i.e. 0, 1, ..., 7.
+    #[inline(always)]
+    pub fn child_index(&self) -> usize {
+        if *self == MortonKey::root() {
+            return 0;
+        }
+        let level = self.level() as u64;
+
+        let shift = LEVEL_DISPLACEMENT + 3 * (DEEPEST_LEVEL - level);
+
+        ((self.value >> shift) % 8) as usize
+    }
+
+    /// Return the finest descendent that sits in the corner of `self` opposite
+    /// to the corner shared with its siblings.
+    pub fn finest_outer_descendent(&self) -> MortonKey {
+        // First find out which child the current key is.
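+        // The same child index is then re-appended at every finer level below,
+        // which drives the key into the outermost corner of the subtree.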
+
+        let level = self.level() as u64;
+
+        if level == DEEPEST_LEVEL {
+            return *self;
+        }
+
+        let mut child_level = 1 + level;
+        let mut key = *self;
+        let outer_index = self.child_index() as u64;
+
+        while child_level <= DEEPEST_LEVEL {
+            let shift = LEVEL_DISPLACEMENT + 3 * (DEEPEST_LEVEL - child_level);
+            key = MortonKey::new(1 + (key.value | outer_index << shift));
+            child_level += 1;
+        }
+
+        key
+    }
+
+    /// Return the next possible Morton key on the deepest level that is not a descendent of the current key.
+    ///
+    /// If the key is already the last possible key then return None.
+    pub fn next_non_descendent_key(&self) -> Option<MortonKey> {
+        // If we are an ancestor of deepest_last we return None as then there
+        // is no next key.
+
+        if self.is_ancestor(MortonKey::deepest_last()) {
+            return None;
+        }
+
+        let level = self.level() as u64;
+
+        let level_diff = DEEPEST_LEVEL - level;
+        let shift = LEVEL_DISPLACEMENT + 3 * level_diff;
+
+        // Need to know which sibling we are.
+        let child_index = ((self.value >> shift) % 8) as usize;
+        // If we are between 0 and 6 take the next sibling and go to deepest level.
+        if child_index < 7 {
+            Some(MortonKey::new(self.value + (1 << shift) + level_diff))
+        } else {
+            // If we are the last child go to the parent and take the next key from there.
+            self.parent().next_non_descendent_key()
+        }
+    }
+
     /// Linearize by sorting and removing overlaps.
     pub fn linearize(keys: &[MortonKey]) -> Vec<MortonKey> {
         let mut new_keys = Vec::<MortonKey>::new();
@@ -476,8 +551,10 @@ impl MortonKey {
         result
     }
 
-    /// Complete a region ensuring that the given keys are part of the leafs.
-    pub fn complete_region(keys: &[MortonKey]) -> Vec<MortonKey> {
+    /// Complete a tree ensuring that the given keys are part of the leafs.
+    ///
+    /// The given keys must not overlap.
+    pub fn complete_tree(keys: &[MortonKey]) -> Vec<MortonKey> {
         // First make sure that the input sequence is sorted.
         let mut keys = keys.to_vec();
         keys.sort_unstable();
@@ -490,20 +567,13 @@ impl MortonKey {
             return result;
         }
 
-        // If a single element is given then just return the result if it is the root of the tree.
-        if keys.len() == 1 && result[0] == MortonKey::from_index_and_level([0, 0, 0], 0) {
-            return result;
+        // If just the root is given return that.
+        if keys.len() == 1 && *keys.first().unwrap() == MortonKey::root() {
+            return keys.to_vec();
         }
 
-        let deepest_first = MortonKey::from_index_and_level([0, 0, 0], DEEPEST_LEVEL as usize);
-        let deepest_last = MortonKey::from_index_and_level(
-            [
-                LEVEL_SIZE as usize - 1,
-                LEVEL_SIZE as usize - 1,
-                LEVEL_SIZE as usize - 1,
-            ],
-            DEEPEST_LEVEL as usize,
-        );
+        let deepest_first = MortonKey::deepest_first();
+        let deepest_last = MortonKey::deepest_last();
 
         // If the first key is not an ancestor of the deepest possible first element in the
         // tree get the finest ancestor between the two and use the first child of that.
@@ -646,13 +716,7 @@
                     );
                 }
             }
-            new_work_list.extend_from_slice(
-                keys.iter()
-                    .copied()
-                    .filter(|&key| key.level() == level - 1)
-                    .collect_vec()
-                    .as_slice(),
-            );
+            new_work_list.extend(keys.iter().copied().filter(|&key| key.level() == level - 1));
 
             work_list = new_work_list;
             // Now extend the work list with the
@@ -961,8 +1025,6 @@ mod test {
 
         let keys = children[1].fill_between_keys(children[2]);
         assert!(keys.is_empty());
-
-        // Correct result for two keys at deepest level
     }
 
     #[test]
@@ -1011,13 +1073,13 @@ mod test {
 
         let keys = [key1, key2, key3];
 
-        let complete_region = MortonKey::complete_region(keys.as_slice());
+        let complete_region = MortonKey::complete_tree(keys.as_slice());
 
         sanity_checks(keys.as_slice(), complete_region.as_slice());
 
-        // For an empty slice the complete region method should just add the root of the tree.
+        // For an empty slice the complete tree method should just add the root of the tree.
 
         let keys = Vec::<MortonKey>::new();
-        let complete_region = MortonKey::complete_region(keys.as_slice());
+        let complete_region = MortonKey::complete_tree(keys.as_slice());
 
         assert_eq!(complete_region.len(), 1);
         sanity_checks(keys.as_slice(), complete_region.as_slice());
@@ -1026,7 +1088,7 @@ mod test {
 
         let keys = [MortonKey::deepest_first(), MortonKey::deepest_last()];
 
-        let complete_region = MortonKey::complete_region(keys.as_slice());
+        let complete_region = MortonKey::complete_tree(keys.as_slice());
 
         sanity_checks(keys.as_slice(), complete_region.as_slice());
     }
@@ -1222,7 +1284,7 @@ mod test {
     pub fn test_from_physical_point() {
         let bounding_box = PhysicalBox::new([-2.0, -3.0, -1.0, 4.0, 5.0, 6.0]);
 
-        let point = [1.5, -2.5, 5.0];
+        let point = Point::new([1.5, -2.5, 5.0], 0);
 
         let level = 10;
         let key = MortonKey::from_physical_point(point, &bounding_box, level);
@@ -1231,10 +1293,79 @@ mod test {
 
         let coords = physical_box.coordinates();
 
-        assert!(coords[0] <= point[0] && point[0] < coords[3]);
-        assert!(coords[1] <= point[1] && point[1] < coords[4]);
-        assert!(coords[2] <= point[2] && point[2] < coords[5]);
+        assert!(coords[0] <= point.coords()[0] && point.coords()[0] < coords[3]);
+        assert!(coords[1] <= point.coords()[1] && point.coords()[1] < coords[4]);
+        assert!(coords[2] <= point.coords()[2] && point.coords()[2] < coords[5]);
 
         // Now compute the box.
     }
+
+    #[test]
+    pub fn test_child_index() {
+        let key = MortonKey::from_index_and_level([1, 501, 718], 10);
+
+        let children = key.children();
+
+        for (index, child) in children.iter().enumerate() {
+            assert_eq!(index, child.child_index());
+        }
+    }
+
+    #[test]
+    pub fn test_finest_outer_descendent() {
+        let key = MortonKey::from_index_and_level([0, 0, 0], 1);
+
+        let finest_outer_descendent = key.finest_outer_descendent();
+
+        assert_eq!(
+            finest_outer_descendent,
+            MortonKey::from_index_and_level([0, 0, 0], DEEPEST_LEVEL as usize)
+        );
+
+        let key = MortonKey::from_index_and_level([1, 1, 0], 1);
+        let finest_outer_descendent = key.finest_outer_descendent();
+
+        assert_eq!(
+            finest_outer_descendent,
+            MortonKey::from_index_and_level(
+                [LEVEL_SIZE as usize - 1, LEVEL_SIZE as usize - 1, 0],
+                DEEPEST_LEVEL as usize
+            )
+        );
+    }
+
+    #[test]
+    pub fn test_next_nondescendent_key() {
+        let key = MortonKey::from_index_and_level([25, 17, 6], 5);
+
+        let children = key.children();
+
+        // Check the next nondescendent key for the first seven children.
+
+        for (child, next_child) in children.iter().tuple_windows() {
+            let next_key = child.next_non_descendent_key().unwrap();
+            assert_eq!(next_key.level(), DEEPEST_LEVEL as usize);
+            assert!(!child.is_ancestor(next_key));
+            assert!(next_child.is_ancestor(next_key));
+        }
+
+        // Now check the next nondescendent key from the last child.
+
+        let next_child = children.last().unwrap().next_non_descendent_key();
+
+        // Check that the next nondescendent key from the parent is the same as that of the last child.
+
+        assert_eq!(key.next_non_descendent_key(), next_child);
+
+        // Check that it is not a descendent of the parent and that its level is correct.
+
+        assert_eq!(next_child.unwrap().level(), DEEPEST_LEVEL as usize);
+        assert!(!key.is_ancestor(next_child.unwrap()));
+
+        // Finally make sure that an ancestor of deepest last returns None.
+
+        assert!(MortonKey::deepest_last()
+            .next_non_descendent_key()
+            .is_none());
+    }
 }
diff --git a/src/octree.rs b/src/octree.rs
index e68297d..5dbd076 100644
--- a/src/octree.rs
+++ b/src/octree.rs
@@ -1,325 +1,168 @@
-//! Definition of a linear octree
+//! Definition of Octree.
 
+pub mod parallel;
+use std::collections::HashMap;
+
+use mpi::traits::CommunicatorCollectives;
+pub use parallel::*;
+use rand::SeedableRng;
+use rand_chacha::ChaCha8Rng;
 use crate::{
-    constants::{DEEPEST_LEVEL, NLEVELS},
-    geometry::PhysicalBox,
+    constants::DEEPEST_LEVEL,
+    geometry::{PhysicalBox, Point},
     morton::MortonKey,
 };
-use bytemuck;
-use std::collections::HashMap;
-use vtkio;
-
-/// A neighbour
-pub struct Neighbour {
-    /// Direction
-    pub direction: [i64; 3],
-    /// Level
-    pub level: usize,
-    /// Morton key
-    pub key: MortonKey,
+
+/// Stores what type of key it is.
+#[derive(PartialEq, Eq, Hash, Copy, Clone, Debug)]
+pub enum KeyType {
+    /// A local leaf.
+    LocalLeaf,
+    /// A local interior key.
+    LocalInterior,
+    /// A global key.
+    Global,
+    /// A ghost key from a specific process.
+    Ghost(usize),
 }
 
-/// An octree
-pub struct Octree {
-    leaf_keys: Vec<MortonKey>,
-    points: Vec<[f64; 3]>,
-    point_to_level_keys: [Vec<MortonKey>; NLEVELS],
+/// A general structure for octrees.
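+///
+/// A usage sketch, mirroring the `mpi_complete_tree` example in this patch
+/// (`generate_random_points` is the helper from the `tools` module; marked
+/// `ignore` since it needs a running MPI environment):
+///
+/// ```ignore
+/// let universe = mpi::initialize().unwrap();
+/// let comm = universe.world();
+/// let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64);
+/// let points = generate_random_points(10000, &mut rng, &comm);
+/// // Refine to at most level 15 with at most 50 points per leaf.
+/// let tree = Octree::new(&points, 15, 50, &comm);
+/// ```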
+pub struct Octree<'o, C> {
+    points: Vec<Point>,
+    point_keys: Vec<MortonKey>,
+    coarse_tree: Vec<MortonKey>,
+    leaf_tree: Vec<MortonKey>,
+    coarse_tree_bounds: Vec<MortonKey>,
+    all_keys: HashMap<MortonKey, KeyType>,
     bounding_box: PhysicalBox,
-    key_counts: HashMap<MortonKey, usize>,
-    max_leaf_level: usize,
-    max_points_in_leaf: usize,
+    comm: &'o C,
 }
 
-impl Octree {
-    /// Create octress from points
-    pub fn from_points(points: &[f64], max_level: usize, max_points_per_box: usize) -> Self {
-        // Make sure that the points array is a multiple of 3.
-        assert_eq!(points.len() % 3, 0);
-
-        // Make sure that max level never exceeds DEEPEST_LEVEL
-        let max_level = if max_level > DEEPEST_LEVEL as usize {
-            DEEPEST_LEVEL as usize
-        } else {
-            max_level
-        };
+impl<'o, C: CommunicatorCollectives> Octree<'o, C> {
+    /// Create a new distributed Octree.
+    pub fn new(points: &[Point], max_level: usize, max_leaf_points: usize, comm: &'o C) -> Self {
+        // We need a random number generator for sorting. For simplicity we use a ChaCha8 random number generator
+        // seeded with the rank of the process.
+        let mut rng = ChaCha8Rng::seed_from_u64(comm.rank() as u64);
 
-        // Compute the physical bounding box.
+        // First compute the Morton keys of the points.
+        let (point_keys, bounding_box) = points_to_morton(points, DEEPEST_LEVEL as usize, comm);
 
-        let bounding_box = PhysicalBox::from_points(points);
+        // Generate the coarse tree.
 
-        // Bunch the points in arrays of 3.
+        let (coarse_tree, leaf_tree) = {
+            // Linearize the keys.
+            let linear_keys = linearize(&point_keys, &mut rng, comm);
 
-        let points: &[[f64; 3]] = bytemuck::cast_slice(points);
-        let npoints = points.len();
+            // Compute the first version of the coarse tree without load balancing.
+            // We want to ensure that it is 2:1 balanced.
+            let coarse_tree = compute_coarse_tree(&linear_keys, comm);
 
-        // We create a vector of keys for each point on each level. We compute the
-        // keys on the deepest level and fill the other levels by going from
-        // parent to parent.
+            let coarse_tree = balance(&coarse_tree, &mut rng, comm);
+            debug_assert!(is_complete_linear_tree(&coarse_tree, comm));
 
-        let mut point_to_level_keys: [Vec<MortonKey>; NLEVELS] = Default::default();
-        point_to_level_keys[DEEPEST_LEVEL as usize] = points
-            .iter()
-            .map(|&point| {
-                MortonKey::from_physical_point(point, &bounding_box, DEEPEST_LEVEL as usize)
-            })
-            .collect::<Vec<_>>();
+            // We now compute the weights for the initial coarse tree.
 
-        for index in (1..=DEEPEST_LEVEL as usize).rev() {
-            let mut new_vec = Vec::<MortonKey>::with_capacity(npoints);
-            for &key in &point_to_level_keys[index] {
-                new_vec.push(key.parent());
-            }
-            point_to_level_keys[index - 1] = new_vec;
-        }
+            let weights = compute_coarse_tree_weights(&linear_keys, &coarse_tree, comm);
 
-        // We now have to create level keys. We are starting at the root and recursing
-        // down until each box has fewer than max_points_per_box keys.
+            // We now load balance the initial coarse tree. This forms our final coarse tree
+            // that is used from now on.
 
-        // First we compute the counts of each key on each level. For that we create
-        // for each level a Hashmap for the keys and then add up.
+            let coarse_tree = load_balance(&coarse_tree, &weights, comm);
+            // We also want to redistribute the fine keys with respect to the load balanced coarse tree.
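+            // This keeps every fine key on the rank that owns its coarse ancestor.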
-        let mut key_counts: HashMap<MortonKey, usize> = Default::default();
+            let fine_keys =
+                redistribute_with_respect_to_coarse_tree(&linear_keys, &coarse_tree, comm);
 
-        for keys in &point_to_level_keys {
-            for key in keys {
-                *key_counts.entry(*key).or_default() += 1;
-            }
-        }
+            // We now create the refined tree by recursing the coarse tree until we are at max level
+            // or the fine tree keys per coarse tree box is small enough.
+            let refined_tree =
+                create_local_tree(&fine_keys, &coarse_tree, max_level, max_leaf_points);
 
-        // We can now easily create an adaptive tree by subdividing. We do this by
-        // a recursive function.
-
-        let mut leaf_keys = Vec::<MortonKey>::new();
-
-        fn recurse_keys(
-            key: MortonKey,
-            key_counts: &HashMap<MortonKey, usize>,
-            leaf_keys: &mut Vec<MortonKey>,
-            max_points_per_box: usize,
-            max_level: usize,
-        ) {
-            let level = key.level();
-            // A key may have not be associated with points. This happens if one of the children on
-            // the previous level has no points in its physical box. However, we want to create a
-            // complete tree. So we still add this one empty child.
-            if let Some(&count) = key_counts.get(&key) {
-                if count > max_points_per_box && level < max_level {
-                    for child in key.children() {
-                        recurse_keys(child, key_counts, leaf_keys, max_points_per_box, max_level);
-                    }
-                } else {
-                    leaf_keys.push(key)
-                }
-            } else {
-                leaf_keys.push(key)
-            }
-        }
+            // We now need to 2:1 balance the refined tree and then redistribute again with respect to the coarse tree.
+
+            let refined_tree = redistribute_with_respect_to_coarse_tree(
+                &balance(&refined_tree, &mut rng, comm),
+                &coarse_tree,
+                comm,
+            );
+
+            (coarse_tree, refined_tree)
 
-        // Now execute the recursion starting from root
+            // redistribute the balanced tree according to coarse tree
+        };
 
-        recurse_keys(
-            MortonKey::root(),
-            &key_counts,
-            &mut leaf_keys,
-            max_points_per_box,
-            max_level,
+        let (points, point_keys) = redistribute_points_with_respect_to_coarse_tree(
+            points,
+            &point_keys,
+            &coarse_tree,
+            comm,
         );
 
-        // The leaf keys are now a complete linear tree. But they are not yet balanced.
-        // In the final step we balance the leafs.
+        let coarse_tree_bounds = get_tree_bins(&coarse_tree, comm);
 
-        let leaf_keys = MortonKey::balance(&leaf_keys, MortonKey::root());
+        // Duplicate the coarse tree across all nodes
 
-        let mut max_leaf_level = 0;
-        let mut max_points_in_leaf = 0;
+        // let coarse_tree = gather_to_all(&coarse_tree, comm);
 
-        for key in &leaf_keys {
-            max_leaf_level = max_leaf_level.max(key.level());
-            max_points_in_leaf =
-                max_points_in_leaf.max(if let Some(&count) = key_counts.get(key) {
-                    count
-                } else {
-                    0
-                });
-        }
+        let all_keys = generate_all_keys(&leaf_tree, &coarse_tree, &coarse_tree_bounds, comm);
 
         Self {
-            leaf_keys,
             points: points.to_vec(),
-            point_to_level_keys,
+            point_keys,
+            coarse_tree,
+            leaf_tree,
+            coarse_tree_bounds,
+            all_keys,
             bounding_box,
-            key_counts,
-            max_leaf_level,
-            max_points_in_leaf,
+            comm,
         }
     }
 
-    /// Leaf keys
-    pub fn leaf_keys(&self) -> &Vec<MortonKey> {
-        &self.leaf_keys
-    }
-
-    /// Points
-    pub fn points(&self) -> &Vec<[f64; 3]> {
-        &self.points
-    }
-
-    /// Get level keys for each point
-    pub fn point_to_level_keys(&self) -> &[Vec<MortonKey>; NLEVELS] {
-        &self.point_to_level_keys
+    /// Return the keys associated with the redistributed points.
+    pub fn point_keys(&self) -> &Vec<MortonKey> {
+        &self.point_keys
     }
 
-    /// Bounding box
+    /// Return the bounding box.
     pub fn bounding_box(&self) -> &PhysicalBox {
         &self.bounding_box
     }
 
-    /// Maximum leaf level
-    pub fn maximum_leaf_level(&self) -> usize {
-        self.max_leaf_level
+    /// Return the associated coarse tree.
+    pub fn coarse_tree(&self) -> &Vec<MortonKey> {
+        &self.coarse_tree
     }
 
-    /// Maximum number of points in a leaf box
-    pub fn max_points_in_leaf_box(&self) -> usize {
-        self.max_points_in_leaf
-    }
-
-    /// Number of points in the box indexed by a key
-    pub fn number_of_points_in_key(&self, key: MortonKey) -> usize {
-        if let Some(&count) = self.key_counts.get(&key) {
-            count
-        } else {
-            0
-        }
+    /// Return the points.
+    ///
+    /// Points are distributed across the nodes as part of the tree generation.
+    /// This function returns the redistributed points.
+    pub fn points(&self) -> &Vec<Point> {
+        &self.points
     }
 
-    /// Export the tree to vtk
-    pub fn export_to_vtk(&self, file_path: &str) {
-        use vtkio::model::{
-            Attributes, ByteOrder, CellType, Cells, DataSet, IOBuffer, UnstructuredGridPiece,
-            Version, VertexNumbers,
-        };
-
-        // Each box has 8 corners with 3 coordinates each, hence 24 floats per key.
-        let mut points = Vec::<f64>::new();
-        // 8 coords per box, hence 8 * nkeys values in connectivity.
-        let mut connectivity = Vec::<u64>::new();
-        // Store the vtk offset for each box.
-        let mut offsets = Vec::<u64>::new();
-
-        let bounding_box = self.bounding_box();
-
-        // Go through the keys and add coordinates and connectivity.
-        // Box coordinates are already in the right order, so connectivity
-        // just counts up. We don't mind doubly counted vertices from two boxes.
-        let mut point_count = 0;
-        let mut key_count = 0;
-
-        for key in self.leaf_keys().iter() {
-            // We only want to export non-empty boxes.
-            if self.number_of_points_in_key(*key) == 0 {
-                continue;
-            }
-            let coords = key.physical_box(bounding_box).corners();
-
-            key_count += 1;
-            offsets.push(8 * key_count);
-
-            for coord in &coords {
-                points.push(coord[0]);
-                points.push(coord[1]);
-                points.push(coord[2]);
-
-                connectivity.push(point_count);
-                point_count += 1;
-            }
-        }
-
-        let vtk_file = vtkio::Vtk {
-            version: Version::new((1, 0)),
-            title: String::new(),
-            byte_order: ByteOrder::LittleEndian,
-            file_path: None,
-            data: DataSet::inline(UnstructuredGridPiece {
-                points: IOBuffer::F64(points),
-                cells: Cells {
-                    cell_verts: VertexNumbers::XML {
-                        connectivity,
-                        offsets,
-                    },
-                    types: vec![CellType::Hexahedron; key_count as usize],
-                },
-                data: Attributes {
-                    point: vec![],
-                    cell: vec![],
-                },
-            }),
-        };
-
-        vtk_file.export_ascii(file_path).unwrap();
+    /// Return the leaf tree.
+    pub fn leaf_tree(&self) -> &Vec<MortonKey> {
+        &self.leaf_tree
     }
 
-    // We can now create the vtk object.
-}
-
-#[cfg(test)]
-mod test {
-    use super::Octree;
-    use rand::prelude::*;
-
-    fn get_points_on_sphere(npoints: usize) -> Vec<f64> {
-        let mut rng = rand::rngs::StdRng::seed_from_u64(0);
-        let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
-
-        let mut points = Vec::<f64>::with_capacity(3 * npoints);
-        for _ in 0..(npoints) {
-            let x: f64 = normal.sample(&mut rng);
-            let y: f64 = normal.sample(&mut rng);
-            let z: f64 = normal.sample(&mut rng);
-
-            let norm = (x * x + y * y + z * z).sqrt();
-
-            points.push(x / norm);
-            points.push(y / norm);
-            points.push(z / norm);
-        }
-
-        points
+    /// Get the coarse tree bounds.
+    ///
+    /// This returns an array whose length is the number of ranks,
+    /// where each element is the smallest Morton key on
+    /// the corresponding rank.
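+    ///
+    /// Together with `get_key_index` this can be used to look up the rank
+    /// that owns a given key.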
+    pub fn coarse_tree_bounds(&self) -> &Vec<MortonKey> {
+        &self.coarse_tree_bounds
     }
 
-    #[test]
-    fn test_octree() {
-        use std::time::Instant;
-
-        let npoints = 1000000;
-        let points = get_points_on_sphere(npoints);
-        let max_level = 7;
-        let max_points_per_box = 100;
-
-        let start = Instant::now();
-        let octree = Octree::from_points(&points, max_level, max_points_per_box);
-        let duration = start.elapsed();
-
-        println!("Creation time: {}", duration.as_millis());
-        println!("Number of leaf keys: {}", octree.leaf_keys().len());
-        println!("Bounding box: {}", octree.bounding_box());
+    /// Return the communicator.
+    pub fn comm(&self) -> &C {
+        self.comm
     }
 
-    #[test]
-    fn test_export() {
-        let fname = "_test_sphere.vtk";
-        let npoints = 1000000;
-        let points = get_points_on_sphere(npoints);
-        let max_level = 7;
-        let max_points_per_box = 100;
-
-        let octree = Octree::from_points(&points, max_level, max_points_per_box);
-
-        octree.export_to_vtk(fname);
-        println!("Maximum leaf level: {}", octree.maximum_leaf_level());
-        println!(
-            "Maximum number of points in leaf box: {}",
-            octree.max_points_in_leaf_box()
-        );
+    /// Return a map of all keys.
+    pub fn all_keys(&self) -> &HashMap<MortonKey, KeyType> {
+        &self.all_keys
     }
 }
diff --git a/src/octree/parallel.rs b/src/octree/parallel.rs
new file mode 100644
index 0000000..4a8a80f
--- /dev/null
+++ b/src/octree/parallel.rs
@@ -0,0 +1,1033 @@
+//! Parallel Octree structure
+
+use std::collections::{HashMap, HashSet};
+
+use crate::{
+    constants::{DEEPEST_LEVEL, NSIBLINGS},
+    geometry::{PhysicalBox, Point},
+    morton::MortonKey,
+    parsort::parsort,
+    tools::{
+        communicate_back, gather_to_all, gather_to_root, global_inclusive_cumsum, redistribute,
+        sort_to_bins,
+    },
+};
+
+use mpi::traits::{Equivalence, Root};
+
+use itertools::{izip, Itertools};
+use mpi::{collective::SystemOperation, traits::CommunicatorCollectives};
+use rand::Rng;
+
+use super::KeyType;
+
+/// Compute the global bounding box across all points on all processes.
+pub fn compute_global_bounding_box<C: CommunicatorCollectives>(
+    points: &[Point],
+    comm: &C,
+) -> PhysicalBox {
+    // Now compute the minimum and maximum across each dimension.
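+    // The local extrema computed here are combined into global extrema below
+    // via `all_reduce_into` with min/max operations.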
+
+    let mut xmin = f64::MAX;
+    let mut xmax = f64::MIN;
+
+    let mut ymin = f64::MAX;
+    let mut ymax = f64::MIN;
+
+    let mut zmin = f64::MAX;
+    let mut zmax = f64::MIN;
+
+    for point in points {
+        let x = point.coords()[0];
+        let y = point.coords()[1];
+        let z = point.coords()[2];
+
+        xmin = f64::min(xmin, x);
+        xmax = f64::max(xmax, x);
+
+        ymin = f64::min(ymin, y);
+        ymax = f64::max(ymax, y);
+
+        zmin = f64::min(zmin, z);
+        zmax = f64::max(zmax, z);
+    }
+
+    let mut global_xmin = 0.0;
+    let mut global_xmax = 0.0;
+
+    let mut global_ymin = 0.0;
+    let mut global_ymax = 0.0;
+
+    let mut global_zmin = 0.0;
+    let mut global_zmax = 0.0;
+
+    comm.all_reduce_into(&xmin, &mut global_xmin, SystemOperation::min());
+    comm.all_reduce_into(&xmax, &mut global_xmax, SystemOperation::max());
+
+    comm.all_reduce_into(&ymin, &mut global_ymin, SystemOperation::min());
+    comm.all_reduce_into(&ymax, &mut global_ymax, SystemOperation::max());
+
+    comm.all_reduce_into(&zmin, &mut global_zmin, SystemOperation::min());
+    comm.all_reduce_into(&zmax, &mut global_zmax, SystemOperation::max());
+
+    let xdiam = global_xmax - global_xmin;
+    let ydiam = global_ymax - global_ymin;
+    let zdiam = global_zmax - global_zmin;
+
+    let xmean = global_xmin + 0.5 * xdiam;
+    let ymean = global_ymin + 0.5 * ydiam;
+    let zmean = global_zmin + 0.5 * zdiam;
+
+    // We increase diameters by box size on deepest level
+    // and use the maximum diameter to compute a
+    // cubic bounding box.
+
+    let deepest_box_diam = 1.0 / (1 << DEEPEST_LEVEL) as f64;
+
+    let max_diam = [xdiam, ydiam, zdiam].into_iter().reduce(f64::max).unwrap();
+
+    let max_diam = max_diam * (1.0 + deepest_box_diam);
+
+    PhysicalBox::new([
+        xmean - 0.5 * max_diam,
+        ymean - 0.5 * max_diam,
+        zmean - 0.5 * max_diam,
+        xmean + 0.5 * max_diam,
+        ymean + 0.5 * max_diam,
+        zmean + 0.5 * max_diam,
+    ])
+}
+
+/// Convert points to Morton keys on specified level.
+pub fn points_to_morton<C: CommunicatorCollectives>(
+    points: &[Point],
+    max_level: usize,
+    comm: &C,
+) -> (Vec<MortonKey>, PhysicalBox) {
+    // Make sure that max level never exceeds DEEPEST_LEVEL.
+    let max_level = if max_level > DEEPEST_LEVEL as usize {
+        DEEPEST_LEVEL as usize
+    } else {
+        max_level
+    };
+
+    // Compute the physical bounding box.
+
+    let bounding_box = compute_global_bounding_box(points, comm);
+
+    // Map each point to its Morton key on the chosen level.
+
+    let keys = points
+        .iter()
+        .map(|&point| MortonKey::from_physical_point(point, &bounding_box, max_level))
+        .collect_vec();
+
+    (keys, bounding_box)
+}
+
+/// Take a linear sequence of Morton keys and compute a complete linear associated coarse tree.
+/// The result can subsequently be load balanced according to the number of linear keys in each coarse block.
+pub fn compute_coarse_tree<C: CommunicatorCollectives>(
+    linear_keys: &[MortonKey],
+    comm: &C,
+) -> Vec<MortonKey> {
+    let size = comm.size();
+
+    debug_assert!(is_linear_tree(linear_keys, comm));
+
+    // On a single node a complete coarse tree is simply the root.
+    if size == 1 {
+        return vec![MortonKey::root()];
+    }
+
+    let mut completed_region = linear_keys
+        .first()
+        .unwrap()
+        .fill_between_keys(*linear_keys.last().unwrap());
+
+    completed_region.insert(0, *linear_keys.first().unwrap());
+    completed_region.push(*linear_keys.last().unwrap());
+
+    // Get the smallest level members of the completed region.
+
+    let min_level = completed_region
+        .iter()
+        .map(|elem| elem.level())
+        .min()
+        .unwrap();
+
+    // Each process selects its largest boxes. These are used to create
+    // a coarse tree.
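+    // These coarsest boxes act as seeds; `complete_tree` below fills the gaps
+    // between the seeds across all ranks.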
+
+    let largest_boxes = completed_region
+        .iter()
+        .filter(|elem| elem.level() == min_level)
+        .copied()
+        .collect_vec();
+
+    debug_assert!(is_linear_tree(&largest_boxes, comm));
+
+    complete_tree(&largest_boxes, comm)
+}
+
+/// Compute the weights of each coarse tree block as the number of linear keys associated with each coarse block.
+pub fn compute_coarse_tree_weights<C: CommunicatorCollectives>(
+    linear_keys: &[MortonKey],
+    coarse_tree: &[MortonKey],
+    comm: &C,
+) -> Vec<usize> {
+    let rank = comm.rank();
+    // We want to partition the coarse tree. But we need the correct weights. The idea
+    // is that we use the number of original leafs that intersect with each coarse tree
+    // key as its weight. In order to compute this we send the coarse tree around to all processes
+    // so that each process computes how many of its keys intersect with each node of the
+    // coarse tree. We then sum up the local weight for each
+    // coarse tree node across all nodes to get the weight.
+
+    let global_coarse_tree = gather_to_all(coarse_tree, comm);
+
+    // We also want to send around a corresponding array of ranks so that for each global coarse tree key
+    // we have the rank of where it originates from.
+
+    let coarse_tree_ranks = gather_to_all(&vec![rank as usize; coarse_tree.len()], comm);
+
+    // We now compute the local weights.
+    let mut local_weight_contribution = vec![0; global_coarse_tree.len()];
+
+    // In the following loop we want to be a bit smart. We do not iterate through all the local elements.
+    // We know that our keys are sorted and also that the coarse tree keys are sorted. So we find the region
+    // of our sorted keys that overlaps with the coarse tree region.
+
+    // Let's find the start of our region. The start of our region is a coarse key that is an ancestor
+    // of our current key. This works because the coarse tree has levels at most as high as the sorted keys.
+
+    let first_key = *linear_keys.first().unwrap();
+
+    let first_coarse_index = global_coarse_tree
+        .iter()
+        .take_while(|coarse_key| !coarse_key.is_ancestor(first_key))
+        .count();
+
+    // Now we need to find the end index of our region. For this again we find the index of our coarse tree that
+    // is an ancestor of our last key.
+    let last_key = *linear_keys.last().unwrap();
+
+    let last_coarse_index = global_coarse_tree
+        .iter()
+        .take_while(|coarse_key| !coarse_key.is_ancestor(last_key))
+        .count();
+
+    // We now only need to iterate between the first and last coarse index in the coarse tree.
+    // In the way we have computed the indices, the last coarse index is inclusive (it is the ancestor of our last key).
+
+    for (w, &global_coarse_key) in izip!(
+        local_weight_contribution[first_coarse_index..=last_coarse_index].iter_mut(),
+        global_coarse_tree[first_coarse_index..=last_coarse_index].iter()
+    ) {
+        *w += linear_keys
+            .iter()
+            .filter(|&&key| global_coarse_key.is_ancestor(key))
+            .count();
+    }
+
+    // We now need to sum up the weights across all processes.
+
+    let mut global_weights = vec![0; global_coarse_tree.len()];
+
+    comm.all_reduce_into(
+        &local_weight_contribution,
+        &mut global_weights,
+        SystemOperation::sum(),
+    );
+
+    // Each process now has all weights. However, we only need the ones for the current process.
+    // So we just filter the rest out.
+
+    izip!(coarse_tree_ranks, global_weights)
+        .filter_map(|(r, weight)| {
+            if r == rank as usize {
+                Some(weight)
+            } else {
+                None
+            }
+        })
+        .collect_vec()
+}
+
+/// Redistribute sorted keys with respect to a linear coarse tree.
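+///
+/// Afterwards each rank holds exactly those keys that are descendents of its
+/// local coarse tree keys.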
+pub fn redistribute_with_respect_to_coarse_tree<C: CommunicatorCollectives>(
+    linear_keys: &[MortonKey],
+    coarse_tree: &[MortonKey],
+    comm: &C,
+) -> Vec<MortonKey> {
+    let size = comm.size();
+
+    if size == 1 {
+        return linear_keys.to_vec();
+    }
+
+    // We want to globally redistribute keys so that the keys on each process are descendents
+    // of the local coarse tree keys.
+
+    // We are using here the fact that the coarse tree is complete and sorted.
+    // We are sending around to each process the first local index. This
+    // defines bins in which we sort our keys. The keys are then sent around to the correct
+    // processes via an alltoallv operation.
+
+    let my_first = coarse_tree.first().unwrap();
+
+    let global_bins = gather_to_all(std::slice::from_ref(my_first), comm);
+
+    // We now have our bins. We go through our keys and store how
+    // many keys are assigned to each rank. We are using here that
+    // our keys and the coarse tree are both sorted.
+
+    // This will store for each rank how many keys will be assigned to it.
+
+    let rank_counts = sort_to_bins(linear_keys, &global_bins)
+        .iter()
+        .map(|&elem| elem as i32)
+        .collect_vec();
+
+    // We now have the counts for each rank. Let's redistribute accordingly and return.
+
+    let result = redistribute(linear_keys, &rank_counts, comm);
+
+    #[cfg(debug_assertions)]
+    {
+        // Check that the first and last key of the result are descendents
+        // of the first and last coarse block.
+        debug_assert!(coarse_tree
+            .first()
+            .unwrap()
+            .is_ancestor(*result.first().unwrap()));
+        debug_assert!(coarse_tree
+            .last()
+            .unwrap()
+            .is_ancestor(*result.last().unwrap()));
+    }
+
+    result
+}
+
+/// Return a complete tree generated from local keys and associated coarse keys.
+///
+/// The coarse keys are refined until the maximum level is reached or until each coarse key
+/// is the ancestor of at most `max_keys` fine keys.
+/// It is assumed that the level of the fine keys is at least as large as `max_level`.
+pub fn create_local_tree(
+    sorted_fine_keys: &[MortonKey],
+    coarse_keys: &[MortonKey],
+    mut max_level: usize,
+    max_keys: usize,
+) -> Vec<MortonKey> {
+    if max_level > DEEPEST_LEVEL as usize {
+        max_level = DEEPEST_LEVEL as usize;
+    }
+
+    // We split the sorted fine keys into subslices so that each subslice
+    // is associated with a coarse slice.
+
+    let bins = coarse_keys.to_vec();
+
+    let counts = sort_to_bins(sorted_fine_keys, &bins);
+
+    // We now know how many fine keys are associated with each coarse block. We iterate
+    // through and locally refine for each block that requires it.
+
+    let mut remainder = sorted_fine_keys;
+    let mut refined_keys = Vec::<MortonKey>::new();
+
+    for (&count, &coarse_key) in izip!(counts.iter(), coarse_keys.iter()) {
+        let current;
+        (current, remainder) = remainder.split_at(count);
+        if coarse_key.level() < max_level && current.len() > max_keys {
+            // We need to refine the current split.
+            refined_keys.extend_from_slice(
+                create_local_tree(
+                    current,
+                    coarse_key.children().as_slice(),
+                    max_level,
+                    max_keys,
+                )
+                .as_slice(),
+            );
+        } else {
+            refined_keys.push(coarse_key)
+        }
+    }
+
+    refined_keys
+}
+
+/// Linearize a distributed set of Morton keys.
+pub fn linearize<C: CommunicatorCollectives, R: Rng + ?Sized>(
+    keys: &[MortonKey],
+    rng: &mut R,
+    comm: &C,
+) -> Vec<MortonKey> {
+    let size = comm.size();
+    let rank = comm.rank();
+
+    // If we only have one process we use the standard serial linearization.
+
+    if size == 1 {
+        return MortonKey::linearize(keys);
+    }
+
+    // We are first sorting the keys. Then in a linear process across all processors we
+    // go through the arrays and delete ancestors of nodes.
+
+    let sorted_keys = parsort(keys, comm, rng);
+
+    // Each process needs to send its first element to the previous process. Each process
+    // then goes through its own list and retains elements that are not ancestors of the
+    // next element.
+
+    let mut result = Vec::<MortonKey>::new();
+
+    let next_key = communicate_back(&sorted_keys, comm);
+
+    // Treat the local keys.
+    for (&m1, &m2) in sorted_keys.iter().tuple_windows() {
+        // m1 is also an ancestor of m2 if they are identical.
+        if m1.is_ancestor(m2) {
+            continue;
+        } else {
+            result.push(m1);
+        }
+    }
+
+    // If we are at the last process simply push the last key.
+    // Otherwise check whether it might be the ancestor of `next_key`,
+    // the first key on the next process. If yes, don't push it. Otherwise do.
+
+    if rank == size - 1 {
+        result.push(*sorted_keys.last().unwrap());
+    } else {
+        let last = *sorted_keys.last().unwrap();
+        if !last.is_ancestor(next_key.unwrap()) {
+            result.push(last);
+        }
+    }
+
+    debug_assert!(is_linear_tree(&result, comm));
+
+    result
+}
+
+/// Balance a sorted list of Morton keys across processors given an array of corresponding weights.
+pub fn load_balance<C: CommunicatorCollectives>(
+    sorted_keys: &[MortonKey],
+    weights: &[usize],
+    comm: &C,
+) -> Vec<MortonKey> {
+    assert_eq!(sorted_keys.len(), weights.len());
+
+    let size = comm.size();
+    let rank = comm.rank();
+
+    // If we only have one process we simply return.
+
+    if size == 1 {
+        return sorted_keys.to_vec();
+    }
+
+    // First scan the weights.
+    // We scan the local arrays, then use a global scan operation on the last element
+    // of each array to get the global sums and then we update the array of each rank
+    // with the sum from the previous ranks.
+
+    let scan = global_inclusive_cumsum(weights, comm);
+
+    // Now broadcast the total weight to all processes.
+
+    let mut total_weight = if rank == size - 1 {
+        *scan.last().unwrap()
+    } else {
+        0
+    };
+
+    comm.process_at_rank(size - 1)
+        .broadcast_into(&mut total_weight);
+
+    let w = total_weight / (size as usize);
+    let k = total_weight % (size as usize);
+
+    // Sort the elements into bins according to which process they should be sent.
+    // We do not need to sort the Morton keys themselves into bins but the scanned weights.
+    // The corresponding counts are the right counts for the Morton keys.
+
+    let mut bins = Vec::<usize>::with_capacity(size as usize);
+
+    for p in 1..=size as usize {
+        if p <= k {
+            bins.push((p - 1) * (1 + w));
+        } else {
+            bins.push((p - 1) * w + k);
+        }
+    }
+
+    let counts = sort_to_bins(&scan, &bins)
+        .iter()
+        .map(|elem| *elem as i32)
+        .collect_vec();
+
+    // Now distribute the data with an all to all v.
+    // We create a vector of how many elements to send to each process and
+    // then send the actual data.
+
+    let mut recvbuffer = redistribute(sorted_keys, &counts, comm);
+
+    recvbuffer.sort_unstable();
+    recvbuffer
+}
+
+/// Given a distributed set of linear keys, generate a complete tree.
+pub fn complete_tree<C: CommunicatorCollectives>(
+    linear_keys: &[MortonKey],
+    comm: &C,
+) -> Vec<MortonKey> {
+    let mut linear_keys = linear_keys.to_vec();
+
+    debug_assert!(is_linear_tree(&linear_keys, comm));
+
+    let size = comm.size();
+    let rank = comm.rank();
+
+    if size == 1 {
+        return MortonKey::complete_tree(linear_keys.as_slice());
+    }
+
+    // Now insert on the first and last process the first and last child of the
+    // finest ancestor of first/last box on deepest level.
+
+    // Send first element to previous rank and insert into local keys.
+    // On the first process we also need to insert the first child of the finest
+    // ancestor of the deepest first key and first element. Correspondingly on the last process
+    // we need to insert the last child of the finest ancestor of the deepest last key and last element.
+
+    let next_key = communicate_back(&linear_keys, comm);
+
+    if rank < size - 1 {
+        linear_keys.push(next_key.unwrap());
+    }
+
+    // Now fix the first key on the first rank.
+
+    if rank == 0 {
+        let first_key = linear_keys.first().unwrap();
+        let deepest_first = MortonKey::deepest_first();
+        if !first_key.is_ancestor(deepest_first) {
+            let ancestor = deepest_first.finest_common_ancestor(*first_key);
+            linear_keys.insert(0, ancestor.children()[0]);
+        }
+    }
+
+    if rank == size - 1 {
+        let last_key = linear_keys.last().unwrap();
+        let deepest_last = MortonKey::deepest_last();
+        if !last_key.is_ancestor(deepest_last) {
+            let ancestor = deepest_last.finest_common_ancestor(*last_key);
+            linear_keys.push(ancestor.children()[NSIBLINGS - 1]);
+        }
+    }
+
+    // Now complete the regions defined by the keys on each process.
+
+    let mut result = Vec::<MortonKey>::new();
+
+    for (&key1, &key2) in linear_keys.iter().tuple_windows() {
+        result.push(key1);
+        result.extend_from_slice(key1.fill_between_keys(key2).as_slice());
+    }
+
+    if rank == size - 1 {
+        result.push(*linear_keys.last().unwrap());
+    }
+
+    debug_assert!(is_complete_linear_tree(&result, comm));
+
+    result
+}
+
+/// Balance a distributed tree.
+pub fn balance<C: CommunicatorCollectives, R: Rng + ?Sized>(
+    linear_keys: &[MortonKey],
+    rng: &mut R,
+    comm: &C,
+) -> Vec<MortonKey> {
+    // Treat the case that the length of the keys is one and is only the root.
+    // This would lead to an empty output below as we only iterate up to level 1.
+
+    if linear_keys.len() == 1 && *linear_keys.first().unwrap() == MortonKey::root() {
+        return vec![MortonKey::root()];
+    }
+
+    let deepest_level = deepest_level(linear_keys, comm);
+
+    // Start with keys at deepest level.
+    let mut work_list = linear_keys
+        .iter()
+        .copied()
+        .filter(|&key| key.level() == deepest_level)
+        .collect_vec();
+
+    let mut result = Vec::<MortonKey>::new();
+
+    // Now go through and make sure that for each key siblings and neighbours of parents are added.
+
+    for level in (1..=deepest_level).rev() {
+        let mut parents = HashSet::<MortonKey>::new();
+        let mut new_work_list = Vec::<MortonKey>::new();
+        // We filter the work list by level and also make sure that
+        // the sibling group of each parent is only added once to
+        // our current level list.
+        for key in work_list.iter() {
+            let parent = key.parent();
+            if !parents.contains(&parent) {
+                parents.insert(parent);
+                result.extend_from_slice(key.siblings().as_slice());
+                new_work_list.extend_from_slice(
+                    parent
+                        .neighbours()
+                        .iter()
+                        .copied()
+                        .filter(|&key| key.is_valid())
+                        .collect_vec()
+                        .as_slice(),
+                );
+            }
+        }
+        new_work_list.extend(
+            linear_keys
+                .iter()
+                .copied()
+                .filter(|&key| key.level() == level - 1),
+        );
+
+        work_list = new_work_list;
+    }
+
+    let result = linearize(&result, rng, comm);
+
+    debug_assert!(is_complete_linear_and_balanced(&result, comm));
+    result
+}
+
+/// Return true if the keys are linear.
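+///
+/// Linear means: globally sorted in ascending order, with no duplicates and
+/// no key an ancestor of another key.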
+pub fn is_linear_tree<C: CommunicatorCollectives>(arr: &[MortonKey], comm: &C) -> bool {
+    let mut is_linear = true;
+
+    for (&key1, &key2) in arr.iter().tuple_windows() {
+        if key1 >= key2 || key1.is_ancestor(key2) {
+            is_linear = false;
+            break;
+        }
+    }
+
+    if comm.size() == 1 {
+        return is_linear;
+    }
+
+    // Now check the interfaces.
+
+    if let Some(next_key) = communicate_back(arr, comm) {
+        let last = *arr.last().unwrap();
+        if last >= next_key || last.is_ancestor(next_key) {
+            is_linear = false;
+        }
+    }
+
+    let mut global_is_linear = false;
+
+    comm.all_reduce_into(
+        &is_linear,
+        &mut global_is_linear,
+        SystemOperation::logical_and(),
+    );
+
+    global_is_linear
+}
+
+/// Redistribute points with respect to a given coarse tree.
+pub fn redistribute_points_with_respect_to_coarse_tree<C: CommunicatorCollectives>(
+    points: &[Point],
+    morton_keys_for_points: &[MortonKey],
+    coarse_tree: &[MortonKey],
+    comm: &C,
+) -> (Vec<Point>, Vec<MortonKey>) {
+    if comm.size() == 1 {
+        return (points.to_vec(), morton_keys_for_points.to_vec());
+    }
+
+    pub fn argsort<T: Ord + Copy>(arr: &[T]) -> Vec<usize> {
+        let mut sort_indices = (0..arr.len()).collect_vec();
+        sort_indices.sort_unstable_by_key(|&index| arr[index]);
+        sort_indices
+    }
+
+    pub fn reorder<T: Copy>(arr: &[T], permutation: &[usize]) -> Vec<T> {
+        let mut reordered = Vec::<T>::with_capacity(arr.len());
+        for &index in permutation.iter() {
+            reordered.push(arr[index])
+        }
+        reordered
+    }
+
+    assert_eq!(points.len(), morton_keys_for_points.len());
+
+    let sort_indices = argsort(morton_keys_for_points);
+    let sorted_keys = reorder(morton_keys_for_points, &sort_indices);
+    let sorted_points = reorder(points, &sort_indices);
+
+    // Now get the bins.
+
+    let my_first = coarse_tree.first().unwrap();
+
+    let global_bins = gather_to_all(std::slice::from_ref(my_first), comm);
+
+    // We now sort the morton indices into the bins.
+
+    // This will store for each rank how many keys will be assigned to it.
+
+    let counts = sort_to_bins(&sorted_keys, &global_bins)
+        .iter()
+        .map(|&elem| elem as i32)
+        .collect_vec();
+
+    // We now redistribute the points and the corresponding keys.
+
+    let (distributed_points, distributed_keys) = (
+        redistribute(&sorted_points, &counts, comm),
+        redistribute(&sorted_keys, &counts, comm),
+    );
+
+    // Now sort the distributed points and keys internally again.
+
+    let sort_indices = argsort(&distributed_keys);
+    let sorted_keys = reorder(&distributed_keys, &sort_indices);
+    let sorted_points = reorder(&distributed_points, &sort_indices);
+
+    (sorted_points, sorted_keys)
+}
+
+/// Return true on all ranks if the distributed tree is complete. Otherwise, return false.
+pub fn is_complete_linear_tree<C: CommunicatorCollectives>(arr: &[MortonKey], comm: &C) -> bool {
+    // First check that the local tree on each node is complete.
+
+    let mut complete_linear = true;
+    for (key1, key2) in arr.iter().tuple_windows() {
+        // Make sure that the keys are sorted and not duplicated.
+        if key1 >= key2 {
+            complete_linear = false;
+            break;
+        }
+        // The next key should be an ancestor of the current key's next non-descendent key.
+        if let Some(expected_next) = key1.next_non_descendent_key() {
+            if !key2.is_ancestor(expected_next) {
+                complete_linear = false;
+                break;
+            }
+        } else {
+            // Only for the very last key there should not be a next non-descendent key.
+            complete_linear = false;
+        }
+    }
+
+    // We now check the interfaces.
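+    // `communicate_back` makes the first key of the next rank available here,
+    // so the boundary pair can be validated like the local pairs above.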
+
+    if let Some(next_first) = communicate_back(arr, comm) {
+        // We are on any but the last rank.
+        let last_key = arr.last().unwrap();
+
+        // Check that the keys are sorted and not duplicated.
+        if *last_key >= next_first {
+            complete_linear = false;
+        }
+
+        // Check that the first key of the next rank is an ancestor of the next non-descendent key.
+        if let Some(expected_next) = last_key.next_non_descendent_key() {
+            if !next_first.is_ancestor(expected_next) {
+                complete_linear = false;
+            }
+        } else {
+            complete_linear = false;
+        }
+    } else {
+        // We are on the last rank.
+        // Check that the last key is an ancestor of deepest last.
+        if !arr.last().unwrap().is_ancestor(MortonKey::deepest_last()) {
+            complete_linear = false;
+        }
+    }
+
+    // Now check that at the first rank we include the deepest first.
+
+    if comm.rank() == 0 && !arr.first().unwrap().is_ancestor(MortonKey::deepest_first()) {
+        complete_linear = false;
+    }
+
+    // Now communicate everything together.
+
+    let mut result = false;
+    comm.all_reduce_into(
+        &complete_linear,
+        &mut result,
+        SystemOperation::logical_and(),
+    );
+
+    result
+}
+
+/// Return the deepest level of a distributed list of Morton keys.
+pub fn deepest_level<C: CommunicatorCollectives>(keys: &[MortonKey], comm: &C) -> usize {
+    let local_deepest_level = keys.iter().map(|elem| elem.level()).max().unwrap();
+
+    if comm.size() == 1 {
+        return local_deepest_level;
+    }
+
+    let mut global_deepest_level: usize = 0;
+
+    comm.all_reduce_into(
+        &local_deepest_level,
+        &mut global_deepest_level,
+        SystemOperation::max(),
+    );
+
+    global_deepest_level
+}
+
+/// Check if the distributed tree is complete, linear and balanced.
+pub fn is_complete_linear_and_balanced<C: CommunicatorCollectives>(
+    arr: &[MortonKey],
+    comm: &C,
+) -> bool {
+    // Send the tree to the root node and check there that it is balanced.
+
+    let mut balanced = false;
+
+    if let Some(arr) = gather_to_root(arr, comm) {
+        balanced = MortonKey::is_complete_linear_and_balanced(&arr);
+    }
+
+    comm.process_at_rank(0).broadcast_into(&mut balanced);
+
+    balanced
+}
+
+/// For a complete linear tree get on each process the first key of all processes.
+///
+/// This information can be used to query on which process a key is living.
+pub fn get_tree_bins<C: CommunicatorCollectives>(
+    complete_linear_tree: &[MortonKey],
+    comm: &C,
+) -> Vec<MortonKey> {
+    gather_to_all(
+        std::slice::from_ref(complete_linear_tree.first().unwrap()),
+        comm,
+    )
+}
+
+/// For a sorted array return either the position of the key or the position directly before the search key.
+pub fn get_key_index(arr: &[MortonKey], key: MortonKey) -> usize {
+    // Performs a binary search for the key. If the key is found with Ok(..)
+    // the exact index of the found key is returned. If the key is not found
+    // the closest larger index is returned. So we subtract one to get the closest
+    // smaller index.
+
+    match arr.binary_search(&key) {
+        Ok(index) => index,
+        Err(index) => index - 1,
+    }
+}
+
+/// Check if a key is associated with the current rank.
+///
+/// Note that the key does not need to exist as a leaf. It just needs
+/// to be a descendent of a coarse key on the current rank.
+pub fn key_on_current_rank(
+    key: MortonKey,
+    coarse_tree_bounds: &[MortonKey],
+    rank: usize,
+    size: usize,
+) -> bool {
+    if rank == size - 1 {
+        key >= *coarse_tree_bounds.last().unwrap()
+    } else {
+        coarse_tree_bounds[rank] <= key && key < coarse_tree_bounds[rank + 1]
+    }
+}
+
+/// Generate all leaf and interior keys.
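+///
+/// The returned map labels each key as `LocalLeaf`, `LocalInterior`, `Global`
+/// (replicated on all ranks) or `Ghost(rank)` with its originating rank.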
+pub fn generate_all_keys<C: CommunicatorCollectives>(
+    leaf_tree: &[MortonKey],
+    coarse_tree: &[MortonKey],
+    coarse_tree_bounds: &[MortonKey],
+    comm: &C,
+) -> HashMap<MortonKey, KeyType> {
+    /// This struct combines rank and key information for sending ghosts to neighbors.
+    #[derive(Copy, Clone, Equivalence)]
+    struct KeyWithRank {
+        key: MortonKey,
+        rank: usize,
+    }
+
+    let rank = comm.rank() as usize;
+    let size = comm.size() as usize;
+
+    let mut all_keys = HashMap::<MortonKey, KeyType>::new();
+    let leaf_keys: HashSet<MortonKey> = HashSet::from_iter(leaf_tree.iter().copied());
+
+    // If size == 1 we simply create the keys locally, so we don't need to treat the global keys.
+
+    if size > 1 {
+        let mut global_keys = HashSet::<MortonKey>::new();
+
+        // First deal with the parents of the coarse tree. These are different
+        // as they may exist on multiple nodes, so receive a different label.
+
+        for &key in coarse_tree {
+            let mut parent = key.parent();
+            while parent.level() > 0 && !all_keys.contains_key(&parent) {
+                global_keys.insert(parent);
+                parent = parent.parent();
+            }
+        }
+
+        // We now send around the parents of the coarse tree to every node. These will
+        // be global keys.
+
+        let global_keys = gather_to_all(&global_keys.iter().copied().collect_vec(), comm);
+
+        // We can now insert the global keys into `all_keys` with the `Global` label.
+
+        for &key in &global_keys {
+            all_keys.entry(key).or_insert(KeyType::Global);
+        }
+    }
+
+    // We now deal with the fine leafs and their ancestors.
+    // The leafs of the coarse tree will also be either part
+    // of the fine tree leafs or will be interior keys. In either
+    // case the following loop catches them.
+
+    for leaf in leaf_keys {
+        debug_assert!(!all_keys.contains_key(&leaf));
+        all_keys.insert(leaf, KeyType::LocalLeaf);
+        let mut parent = leaf.parent();
+        while parent.level() > 0 && !all_keys.contains_key(&parent) {
+            all_keys.insert(parent, KeyType::LocalInterior);
+            parent = parent.parent();
+        }
+    }
+
+    // Need to explicitly add the root at the end.
+    all_keys.entry(MortonKey::root()).or_insert(KeyType::Global);
+
+    // We only need to deal with ghosts if the size is larger than 1.
+
+    if size > 1 {
+        // This maps from rank to the keys that we want to send to the ranks.
+
+        let mut rank_send_ghost = HashMap::<usize, Vec<KeyWithRank>>::new();
+        for index in 0..size {
+            rank_send_ghost.insert(index, Vec::<KeyWithRank>::new());
+        }
+
+        let mut send_to_all = Vec::<KeyWithRank>::new();
+
+        for (&key, &status) in all_keys.iter() {
+            // We need not send around global keys to neighbors.
+            if status == KeyType::Global {
+                continue;
+            }
+            for &neighbor in key.neighbours().iter().filter(|&&key| key.is_valid()) {
+                // Check whether the neighbour is a global key.
+                if all_keys
+                    .get(&neighbor)
+                    .is_some_and(|&value| value == KeyType::Global)
+                {
+                    // Global keys exist on all nodes, so need to send their neighbors to all nodes.
+                    send_to_all.push(KeyWithRank { key, rank });
+                } else {
+                    // Get the rank of the neighbour.
+                    let neighbor_rank = get_key_index(coarse_tree_bounds, neighbor);
+                    rank_send_ghost
+                        .entry(neighbor_rank)
+                        .and_modify(|keys| keys.push(KeyWithRank { key, rank }));
+                }
+            }
+        }
+
+        let send_ghost_to_all = gather_to_all(&send_to_all, comm);
+        // We now know which key needs to be sent to which rank.
+        // Turn to array, get the counts and send around.
+
+        let (arr, counts) = {
+            let mut arr = Vec::<KeyWithRank>::new();
+            let mut counts = Vec::<i32>::new();
+            for index in 0..size {
+                let keys = rank_send_ghost.get(&index).unwrap();
+                arr.extend(keys.iter());
+                counts.push(keys.len() as i32);
+            }
+            (arr, counts)
+        };
+
+        // These are all the keys that are neighbors to our keys. We now go through
We now go through
+        // and store those that do not live on our tree into `all_keys` with a label
+        // of `Ghost`.
+        let mut ghost_keys = redistribute(&arr, &counts, comm);
+        // Add the neighbors of any global key.
+        ghost_keys.extend(send_ghost_to_all.iter());
+
+        for key in &ghost_keys {
+            if key.rank == rank {
+                // Don't need to add the keys that are already on the rank.
+                continue;
+            }
+            all_keys.insert(key.key, KeyType::Ghost(key.rank));
+        }
+    }
+
+    all_keys
+}
+
+#[cfg(test)]
+mod test {
+    use crate::{
+        octree::get_key_index,
+        tools::{generate_random_keys, seeded_rng},
+    };
+
+    #[test]
+    fn test_get_key_rank() {
+        let mut rng = seeded_rng(0);
+
+        let mut keys = generate_random_keys(50, &mut rng);
+
+        keys.sort_unstable();
+
+        let mid = keys[25];
+
+        assert_eq!(25, get_key_index(&keys, mid));
+
+        // Now remove the mid index and do the same again.
+
+        keys.remove(25);
+
+        // The result should be 24.
+
+        assert_eq!(24, get_key_index(&keys, mid));
+    }
+}
diff --git a/src/parsort.rs b/src/parsort.rs
index 16518cc..7191783 100644
--- a/src/parsort.rs
+++ b/src/parsort.rs
@@ -1,107 +1,28 @@
 //! Implementation of a parallel samplesort.
 
 use std::fmt::Display;
-use std::mem::offset_of;
 
 use itertools::Itertools;
-use mpi::datatype::{UncommittedDatatypeRef, UncommittedUserDatatype, UserDatatype};
-use mpi::traits::{Equivalence, Root};
-use mpi::{
-    datatype::{Partition, PartitionMut},
-    traits::CommunicatorCollectives,
-};
+use mpi::traits::CommunicatorCollectives;
+use mpi::traits::Equivalence;
 use rand::{seq::SliceRandom, Rng};
 
-const OVERSAMPLING: usize = 8;
+use crate::morton::MortonKey;
+use crate::tools::{gather_to_all, global_max, global_min, redistribute_by_bins};
 
-/// Sortable trait that each type fed into parsort needs to satisfy.
-pub trait ParallelSortable:
-    MinValue
-    + MaxValue
-    + Equivalence
-    + Copy
-    + Clone
-    + Default
-    + PartialEq
-    + Eq
-    + PartialOrd
-    + Ord
-    + Display
-    + Sized
-{
-}
-
-impl<
-        T: MinValue
-            + MaxValue
-            + Equivalence
-            + Copy
-            + Clone
-            + Default
-            + PartialEq
-            + Eq
-            + PartialOrd
-            + Ord
-            + Display
-            + Sized,
-    > ParallelSortable for T
-{
-}
+const OVERSAMPLING: usize = 8;
 
 /// An internal struct. We convert every array element
 /// into this struct. The idea is that this is guaranteed to be unique
 /// as it encodes not only the element but also its rank and index.
-#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord)]
-struct UniqueItem {
-    pub value: T,
+#[derive(Copy, Clone, Default, PartialEq, Eq, PartialOrd, Ord, Equivalence)]
+struct UniqueItem {
+    pub value: MortonKey,
     pub rank: usize,
     pub index: usize,
 }
 
-unsafe impl Equivalence for UniqueItem {
-    type Out = UserDatatype;
-
-    // Depending on the MPI implementation the below offset needs
-    // to be an i64 or isize. If it is an i64 Clippy warns about
-    // a useless conversion. But this warning is MPI implementation
-    // dependent. So switch off here.
-
-    #[allow(clippy::useless_conversion)]
-    fn equivalent_datatype() -> Self::Out {
-        UserDatatype::structured::(
-            &[1, 1, 1],
-            &[
-                (offset_of!(UniqueItem, value) as i64)
-                    .try_into()
-                    .unwrap(),
-                (offset_of!(UniqueItem, rank) as i64).try_into().unwrap(),
-                (offset_of!(UniqueItem, index) as i64)
-                    .try_into()
-                    .unwrap(),
-            ],
-            &[
-                UncommittedUserDatatype::contiguous(1, &::equivalent_datatype())
-                    .as_ref(),
-                usize::equivalent_datatype().into(),
-                usize::equivalent_datatype().into(),
-            ],
-        )
-    }
-}
-
-/// Return the minimum possible value of a type.
-pub trait MinValue {
-    /// Return the min value.
-    fn min_value() -> Self;
-}
-
-/// Return the maximum possible value of a type.
-pub trait MaxValue {
-    /// Return the max value.
-    fn max_value() -> Self;
-}
-
-impl Display for UniqueItem {
+impl Display for UniqueItem {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
             f,
@@ -111,34 +32,21 @@ impl Display for UniqueItem {
     }
 }
 
-impl MinValue for UniqueItem {
-    fn min_value() -> Self {
-        UniqueItem::new(::min_value(), 0, 0)
-    }
-}
-
-impl MaxValue for UniqueItem {
-    fn max_value() -> Self {
-        UniqueItem::new(::max_value(), 0, 0)
-    }
-}
-
-impl UniqueItem {
-    pub fn new(value: T, rank: usize, index: usize) -> Self {
+impl UniqueItem {
+    pub fn new(value: MortonKey, rank: usize, index: usize) -> Self {
         Self { value, rank, index }
     }
 }
 
-fn to_unique_item(arr: &[T], rank: usize) -> Vec> {
+fn to_unique_item(arr: &[MortonKey], rank: usize) -> Vec<UniqueItem> {
    arr.iter()
        .enumerate()
        .map(|(index, &item)| UniqueItem::new(item, rank, index))
        .collect()
 }
 
-fn get_buckets(arr: &[UniqueItem], comm: &C, rng: &mut R) -> Vec>
+fn get_buckets<C, R>(arr: &[UniqueItem], comm: &C, rng: &mut R) -> Vec<UniqueItem>
 where
-    T: ParallelSortable,
     C: CommunicatorCollectives,
     R: Rng + ?Sized,
 {
@@ -152,42 +60,22 @@ where
         OVERSAMPLING
     };
 
-    // We are choosing unique splitters that neither contain
-    // zero nor u64::max.
+    // We get the globally smallest and largest elements. We do not want those
+    // in the splitters, so filter out their occurrence.
+
+    let global_min_elem = global_min(arr, comm);
+    let global_max_elem = global_max(arr, comm);
+
+    // We do not want the global smallest element in the splitter.
     let splitters = arr
         .choose_multiple(rng, oversampling)
         .copied()
         .collect::<Vec<_>>();
 
-    // We use an all_gatherv so that each process receives all splitters.
-    // For that we first communicate how many splitters each process has
-    // and then we send the splitters themselves.
-
-    let nsplitters = splitters.len();
-    let mut splitters_per_rank = vec![0_usize; size];
-
-    comm.all_gather_into(&nsplitters, &mut splitters_per_rank);
-
-    // We now know how many splitters each process has. We now create space
-    // for the splitters and send them all around.
-
-    let n_all_splitters = splitters_per_rank.iter().sum();
+    // We gather the splitters into all ranks so that each rank has all splitters.
 
-    let mut all_splitters = vec![Default::default(); n_all_splitters];
-    let splitters_per_rank = splitters_per_rank.iter().map(|&x| x as i32).collect_vec();
-
-    let displs: Vec = splitters_per_rank
-        .iter()
-        .scan(0, |acc, &x| {
-            let tmp = *acc;
-            *acc += x;
-            Some(tmp)
-        })
-        .collect();
-
-    let mut partition = PartitionMut::new(&mut all_splitters[..], splitters_per_rank, &displs[..]);
-    comm.all_gather_varcount_into(&splitters, &mut partition);
+    let mut all_splitters = gather_to_all(&splitters, comm);
 
     // We now have all splitters available on each process.
     // We can now sort the splitters. Every process will then have the same list of sorted splitters.
@@ -197,97 +85,31 @@ where
 
     // We now insert the smallest and largest possible element if they are not already
     // in the splitter collection.
-    if *all_splitters.first().unwrap() != UniqueItem::min_value() {
-        all_splitters.insert(0, UniqueItem::min_value())
+    if *all_splitters.first().unwrap() != global_min_elem {
+        all_splitters.insert(0, global_min_elem)
     }
 
-    if *all_splitters.last().unwrap() != UniqueItem::max_value() {
-        all_splitters.push(UniqueItem::max_value());
+    if *all_splitters.last().unwrap() != global_max_elem {
+        all_splitters.push(global_max_elem);
     }
 
     // We now define p buckets (p is number of processors) and we return
-    // a p + 1 element array containing the first element of each bucket
-    // concluded with the largest possible element.
+    // a p element array containing the first element of each bucket.
 
     all_splitters = split(&all_splitters, size)
         .map(|slice| slice.first().unwrap())
         .copied()
         .collect::<Vec<_>>();
-    all_splitters.push(UniqueItem::max_value());
 
     all_splitters
 }
 
-fn get_counts(arr: &[UniqueItem], buckets: &[UniqueItem]) -> Vec {
-    // The following array will store the counts for each bucket.
-
-    let mut counts = vec![0_usize; buckets.len() - 1];
-
-    // We are iterating through the array. Whenever an element is larger or equal than
-    // the current splitter we store the current position in `bin_displs` and advance `splitter_iter`
-    // by 1.
-
-    // In the following iterator we skip the first bin displacement position as this must be the default
-    // zero (start of the bins).
-
-    // Note that bucket iterator has as many elements as counts as the tuple_windows has length
-    // 1 smaller than the original array length.
-    let mut bucket_iter = buckets.iter().tuple_windows::<(_, _)>();
-
-    // We skip the first element as this is always zero.
-    let mut count_iter = counts.iter_mut();
-
-    let mut count: usize = 0;
-    let mut current_count = count_iter.next().unwrap();
-
-    let (mut first, mut last) = bucket_iter.next().unwrap();
-
-    for elem in arr {
-        // The test after the or sorts out the case that our set includes the maximum possible
-        // item and we are in the last bucket. The biggest item should be counted as belonging
-        // to the bucket.
-        if (first <= elem && elem < last)
-            || (*last == UniqueItem::max_value() && *elem == UniqueItem::max_value())
-        {
-            // Element is in the right bucket.
-            count += 1;
-            continue;
-        } else {
-            // Element is not in the right bucket.
-            // Store counts and find the correct bucket.
-            *current_count = count;
-            loop {
-                (first, last) = bucket_iter.next().unwrap();
-                current_count = count_iter.next().unwrap();
-                if (first <= elem && elem < last)
-                    || (*last == UniqueItem::max_value() && *elem == UniqueItem::max_value())
-                {
-                    break;
-                }
-            }
-            // Now have the right bucket. Reset count and continue.
-            count = 1;
-        }
-    }
-
-    // Need to store the count for the last bucket in the iterator.
-    // This is always necessary as last iterator is half open interval.
-    // So we don't go into the else part of the for loop.
-
-    *current_count = count;
-
-    // We don't need to fill the remaining counts entries with zero
-    // since the array is already initialized with zero.
-
-    counts
-}
-
 /// Parallel sort
-pub fn parsort(
-    arr: &[T],
+pub fn parsort<C: CommunicatorCollectives, R: Rng + ?Sized>(
+    arr: &[MortonKey],
     comm: &C,
     rng: &mut R,
-) -> Vec {
+) -> Vec<MortonKey> {
     let size = comm.size() as usize;
     let rank = comm.rank() as usize;
 
     // If we only have a single rank simply sort the local array and return
@@ -313,50 +135,8 @@ pub fn parsort
     let buckets = get_buckets(&arr, comm, rng);
 
-    // We now compute how many elements of our array go into each bucket.
- - let counts = get_counts(&arr, &buckets) - .iter() - .map(|&elem| elem as i32) - .collect::>(); - - // We now do an all_to_allv to communicate the array elements to the right processors. - - // First we need to communicate how many elements everybody gets from each processor. - - let mut counts_from_processor = vec![0_i32; size]; - - comm.all_to_all_into(&counts, &mut counts_from_processor); - - // Each processor now knows how much he gets from all the others. - - // We can now send around the actual elements with an alltoallv. - let send_displs: Vec = counts - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let send_partition = Partition::new(&arr, counts, &send_displs[..]); - - let mut recvbuffer = - vec![UniqueItem::default(); counts_from_processor.iter().sum::() as usize]; - - let recv_displs: Vec = counts_from_processor - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut receiv_partition = - PartitionMut::new(&mut recvbuffer[..], counts_from_processor, &recv_displs[..]); - comm.all_to_all_varcount_into(&send_partition, &mut receiv_partition); + // We now redistribute with respect to these buckets. + let mut recvbuffer = redistribute_by_bins(&arr, &buckets, comm); // We now have everything in the receive buffer. Now sort the local elements and return @@ -396,72 +176,3 @@ impl<'a, T> Iterator for Split<'a, T> { Some(chunk) } } - -/// Array to root -pub fn array_to_root( - arr: &[T], - comm: &C, -) -> Option> { - let n = arr.len() as i32; - let rank = comm.rank(); - let size = comm.size(); - let root_process = comm.process_at_rank(0); - - // We first communicate the length of the array to root. - - if rank == 0 { - // We are at root. - - let mut ranks = vec![0_i32; size as usize]; - root_process.gather_into_root(&n, &mut ranks); - - // We now have all ranks at root. Can now a varcount gather to get - // the array elements. - - let nelements = ranks.iter().sum::(); - - let mut new_arr = vec![::default(); nelements as usize]; - - let displs: Vec = ranks - .iter() - .scan(0, |acc, &x| { - let tmp = *acc; - *acc += x; - Some(tmp) - }) - .collect(); - - let mut partition = PartitionMut::new(&mut new_arr[..], ranks, &displs[..]); - - root_process.gather_varcount_into_root(arr, &mut partition); - Some(new_arr) - } else { - root_process.gather_into(&n); - root_process.gather_varcount_into(arr); - None - } -} - -macro_rules! impl_min_max_value { - ($type:ty) => { - impl MinValue for $type { - fn min_value() -> Self { - <$type>::MIN - } - } - - impl MaxValue for $type { - fn max_value() -> Self { - <$type>::MAX - } - } - }; -} - -impl_min_max_value!(usize); -impl_min_max_value!(i8); -impl_min_max_value!(i32); -impl_min_max_value!(i64); -impl_min_max_value!(u8); -impl_min_max_value!(u32); -impl_min_max_value!(u64); diff --git a/src/serial.rs b/src/serial.rs new file mode 100644 index 0000000..0249bb8 --- /dev/null +++ b/src/serial.rs @@ -0,0 +1,322 @@ +//! 
Definition of a linear octree
+
+use crate::{
+    constants::{DEEPEST_LEVEL, NLEVELS},
+    geometry::{PhysicalBox, Point},
+    morton::MortonKey,
+};
+use std::collections::HashMap;
+use vtkio;
+
+/// A neighbour
+pub struct Neighbour {
+    /// Direction
+    pub direction: [i64; 3],
+    /// Level
+    pub level: usize,
+    /// Morton key
+    pub key: MortonKey,
+}
+
+/// An octree
+pub struct Octree {
+    leaf_keys: Vec<MortonKey>,
+    points: Vec<Point>,
+    point_to_level_keys: [Vec<MortonKey>; NLEVELS],
+    bounding_box: PhysicalBox,
+    key_counts: HashMap<MortonKey, usize>,
+    max_leaf_level: usize,
+    max_points_in_leaf: usize,
+}
+
+impl Octree {
+    /// Create octree from points
+    pub fn from_points(points: &[Point], max_level: usize, max_points_per_box: usize) -> Self {
+        // Make sure that max level never exceeds DEEPEST_LEVEL.
+        let max_level = if max_level > DEEPEST_LEVEL as usize {
+            DEEPEST_LEVEL as usize
+        } else {
+            max_level
+        };
+
+        // Compute the physical bounding box.
+
+        let bounding_box = PhysicalBox::from_points(points);
+
+        let npoints = points.len();
+
+        // We create a vector of keys for each point on each level. We compute the
+        // keys on the deepest level and fill the other levels by going from
+        // parent to parent.
+
+        let mut point_to_level_keys: [Vec<MortonKey>; NLEVELS] = Default::default();
+        point_to_level_keys[DEEPEST_LEVEL as usize] = points
+            .iter()
+            .map(|&point| {
+                MortonKey::from_physical_point(point, &bounding_box, DEEPEST_LEVEL as usize)
+            })
+            .collect::<Vec<_>>();
+
+        for index in (1..=DEEPEST_LEVEL as usize).rev() {
+            let mut new_vec = Vec::<MortonKey>::with_capacity(npoints);
+            for &key in &point_to_level_keys[index] {
+                new_vec.push(key.parent());
+            }
+            point_to_level_keys[index - 1] = new_vec;
+        }
+
+        // We now have to create the leaf keys. We are starting at the root and recursing
+        // down until each box has fewer than max_points_per_box points.
+
+        // First we compute the counts of each key on each level. For that we count
+        // the keys from all levels in a single hashmap.
+
+        let mut key_counts: HashMap<MortonKey, usize> = Default::default();
+
+        for keys in &point_to_level_keys {
+            for key in keys {
+                *key_counts.entry(*key).or_default() += 1;
+            }
+        }
+
+        // We can now easily create an adaptive tree by subdividing. We do this
+        // with a recursive function.
+
+        let mut leaf_keys = Vec::<MortonKey>::new();
+
+        fn recurse_keys(
+            key: MortonKey,
+            key_counts: &HashMap<MortonKey, usize>,
+            leaf_keys: &mut Vec<MortonKey>,
+            max_points_per_box: usize,
+            max_level: usize,
+        ) {
+            let level = key.level();
+            // A key may not be associated with any points. This happens if one of the children on
+            // the previous level has no points in its physical box. However, we want to create a
+            // complete tree. So we still add this one empty child.
+            if let Some(&count) = key_counts.get(&key) {
+                if count > max_points_per_box && level < max_level {
+                    for child in key.children() {
+                        recurse_keys(child, key_counts, leaf_keys, max_points_per_box, max_level);
+                    }
+                } else {
+                    leaf_keys.push(key)
+                }
+            } else {
+                leaf_keys.push(key)
+            }
+        }
+
+        // Now execute the recursion starting from root.
+
+        recurse_keys(
+            MortonKey::root(),
+            &key_counts,
+            &mut leaf_keys,
+            max_points_per_box,
+            max_level,
+        );
+
+        // The leaf keys are now a complete linear tree. But they are not yet balanced.
+        // In the final step we balance the leaves.
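+        // Balancing subdivides leaves until neighbouring leaves differ by at
+        // most one level (the usual 2:1 condition for octrees).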
+
+        let leaf_keys = MortonKey::balance(&leaf_keys, MortonKey::root());
+
+        let mut max_leaf_level = 0;
+        let mut max_points_in_leaf = 0;
+
+        for key in &leaf_keys {
+            max_leaf_level = max_leaf_level.max(key.level());
+            max_points_in_leaf =
+                max_points_in_leaf.max(if let Some(&count) = key_counts.get(key) {
+                    count
+                } else {
+                    0
+                });
+        }
+
+        Self {
+            leaf_keys,
+            points: points.to_vec(),
+            point_to_level_keys,
+            bounding_box,
+            key_counts,
+            max_leaf_level,
+            max_points_in_leaf,
+        }
+    }
+
+    /// Leaf keys
+    pub fn leaf_keys(&self) -> &Vec<MortonKey> {
+        &self.leaf_keys
+    }
+
+    /// Points
+    pub fn points(&self) -> &Vec<Point> {
+        &self.points
+    }
+
+    /// Get level keys for each point
+    pub fn point_to_level_keys(&self) -> &[Vec<MortonKey>; NLEVELS] {
+        &self.point_to_level_keys
+    }
+
+    /// Bounding box
+    pub fn bounding_box(&self) -> &PhysicalBox {
+        &self.bounding_box
+    }
+
+    /// Maximum leaf level
+    pub fn maximum_leaf_level(&self) -> usize {
+        self.max_leaf_level
+    }
+
+    /// Maximum number of points in a leaf box
+    pub fn max_points_in_leaf_box(&self) -> usize {
+        self.max_points_in_leaf
+    }
+
+    /// Number of points in the box indexed by a key
+    pub fn number_of_points_in_key(&self, key: MortonKey) -> usize {
+        if let Some(&count) = self.key_counts.get(&key) {
+            count
+        } else {
+            0
+        }
+    }
+
+    /// Export the tree to vtk
+    pub fn export_to_vtk(&self, file_path: &str) {
+        use vtkio::model::{
+            Attributes, ByteOrder, CellType, Cells, DataSet, IOBuffer, UnstructuredGridPiece,
+            Version, VertexNumbers,
+        };
+
+        // Each box has 8 corners with 3 coordinates each, hence 24 floats per key.
+        let mut points = Vec::<f64>::new();
+        // 8 corner indices per box, hence 8 * nkeys values in connectivity.
+        let mut connectivity = Vec::<u64>::new();
+        // Store the vtk offset for each box.
+        let mut offsets = Vec::<u64>::new();
+
+        let bounding_box = self.bounding_box();
+
+        // Go through the keys and add coordinates and connectivity.
+        // Box coordinates are already in the right order, so connectivity
+        // just counts up. We don't mind that vertices shared by two boxes
+        // are counted twice.
+        let mut point_count = 0;
+        let mut key_count = 0;
+
+        for key in self.leaf_keys().iter() {
+            // We only want to export non-empty boxes.
+            if self.number_of_points_in_key(*key) == 0 {
+                continue;
+            }
+            let coords = key.physical_box(bounding_box).corners();
+
+            key_count += 1;
+            offsets.push(8 * key_count);
+
+            for coord in &coords {
+                points.push(coord[0]);
+                points.push(coord[1]);
+                points.push(coord[2]);
+
+                connectivity.push(point_count);
+                point_count += 1;
+            }
+        }
+
+        let vtk_file = vtkio::Vtk {
+            version: Version::new((1, 0)),
+            title: String::new(),
+            byte_order: ByteOrder::LittleEndian,
+            file_path: None,
+            data: DataSet::inline(UnstructuredGridPiece {
+                points: IOBuffer::F64(points),
+                cells: Cells {
+                    cell_verts: VertexNumbers::XML {
+                        connectivity,
+                        offsets,
+                    },
+                    types: vec![CellType::Hexahedron; key_count as usize],
+                },
+                data: Attributes {
+                    point: vec![],
+                    cell: vec![],
+                },
+            }),
+        };
+
+        vtk_file.export_ascii(file_path).unwrap();
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use crate::geometry::Point;
+
+    use super::Octree;
+    use rand::prelude::*;
+
+    fn get_points_on_sphere(npoints: usize) -> Vec<Point> {
+        let mut rng = rand::rngs::StdRng::seed_from_u64(0);
+        let normal = rand_distr::Normal::new(0.0, 1.0).unwrap();
+
+        let mut points = Vec::<Point>::with_capacity(npoints);
+        for _ in 0..npoints {
+            let x: f64 = normal.sample(&mut rng);
+            let y: f64 = normal.sample(&mut rng);
+            let z: f64 = normal.sample(&mut rng);
+
+            let norm = (x * x + y * y + z * z).sqrt();
+
+            points.push(Point::new([x / norm, y / norm, z / norm], 0));
+        }
+
+        points
+    }
+
+    #[test]
+    fn test_octree() {
+        use std::time::Instant;
+
+        let npoints = 10000;
+        let points = get_points_on_sphere(npoints);
+        let max_level = 7;
+        let max_points_per_box = 100;
+
+        let start = Instant::now();
+        let octree = Octree::from_points(&points, max_level, max_points_per_box);
+        let duration = start.elapsed();
+
+        println!("Creation time: {}", duration.as_millis());
+        println!("Number of leaf keys: {}", octree.leaf_keys().len());
+        println!("Bounding box: {}", octree.bounding_box());
+    }
+
+    #[test]
+    fn test_export() {
+        let fname = "_test_sphere.vtk";
+        let npoints = 10000;
+        let points = get_points_on_sphere(npoints);
+        let max_level = 7;
+        let max_points_per_box = 100;
+
+        let octree = Octree::from_points(&points, max_level, max_points_per_box);
+
+        octree.export_to_vtk(fname);
+        println!("Maximum leaf level: {}", octree.maximum_leaf_level());
+        println!(
+            "Maximum number of points in leaf box: {}",
+            octree.max_points_in_leaf_box()
+        );
+    }
+}
diff --git a/src/tools.rs b/src/tools.rs
new file mode 100644
index 0000000..dbe8edd
--- /dev/null
+++ b/src/tools.rs
@@ -0,0 +1,438 @@
+//! Utility routines.
+
+use itertools::{izip, Itertools};
+use mpi::{
+    collective::{SystemOperation, UserOperation},
+    datatype::{Partition, PartitionMut},
+    point_to_point as p2p,
+    traits::{CommunicatorCollectives, Destination, Equivalence, Root, Source},
+};
+use num::traits::Zero;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+
+use crate::{
+    constants::{DEEPEST_LEVEL, LEVEL_SIZE},
+    geometry::Point,
+    morton::MortonKey,
+};
+
+/// Gather array to all processes
+pub fn gather_to_all<T: Equivalence, C: CommunicatorCollectives>(arr: &[T], comm: &C) -> Vec<T> {
+    // First we need to broadcast the individual sizes on each process.
+
+    let size = comm.size();
+
+    let local_len = arr.len() as i32;
+
+    let mut sizes = vec![0; size as usize];
+
+    comm.all_gather_into(&local_len, &mut sizes);
+
+    let recv_len = sizes.iter().sum::<i32>() as usize;
+
+    // Now we have the size of each local contribution.
+    let mut recvbuffer = Vec::<T>::with_capacity(recv_len);
+    let buf: &mut [T] = unsafe { std::mem::transmute(recvbuffer.spare_capacity_mut()) };
+
+    let recv_displs: Vec<i32> = sizes
+        .iter()
+        .scan(0, |acc, &x| {
+            let tmp = *acc;
+            *acc += x;
+            Some(tmp)
+        })
+        .collect();
+
+    let mut receiv_partition = PartitionMut::new(buf, sizes, &recv_displs[..]);
+
+    comm.all_gather_varcount_into(arr, &mut receiv_partition);
+
+    unsafe { recvbuffer.set_len(recv_len) };
+
+    recvbuffer
+}
+
+/// Gather distributed array to the root rank.
+///
+/// The result is a `Vec<T>` on root and `None` on all other ranks.
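+///
+/// A usage sketch, with `local_arr` standing in for any local slice
+/// (cf. the `mpi_cumsum` example):
+///
+/// ```ignore
+/// if let Some(gathered) = gather_to_root(&local_arr, &comm) {
+///     // Only rank 0 receives `Some`; all other ranks get `None`.
+/// }
+/// ```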
+pub fn gather_to_root<T: Equivalence, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> Option<Vec<T>> {
+    let n = arr.len() as i32;
+    let rank = comm.rank();
+    let size = comm.size();
+    let root_process = comm.process_at_rank(0);
+
+    // We first communicate the length of the array to root.
+
+    if rank == 0 {
+        // We are at root.
+
+        let mut counts = vec![0_i32; size as usize];
+        root_process.gather_into_root(&n, &mut counts);
+
+        // We now have all counts at root. We can now do a varcount gather to get
+        // the array elements.
+
+        let nelements = counts.iter().sum::<i32>();
+        let mut new_arr = Vec::<T>::with_capacity(nelements as usize);
+        let new_arr_buf: &mut [T] = unsafe { std::mem::transmute(new_arr.spare_capacity_mut()) };
+
+        let displs = displacements(counts.as_slice());
+
+        let mut partition = PartitionMut::new(new_arr_buf, counts, &displs[..]);
+
+        root_process.gather_varcount_into_root(arr, &mut partition);
+
+        unsafe { new_arr.set_len(nelements as usize) };
+        Some(new_arr)
+    } else {
+        root_process.gather_into(&n);
+        root_process.gather_varcount_into(arr);
+        None
+    }
+}
+
+/// Get global size of a distributed array.
+///
+/// Computes the size and broadcasts it to all ranks.
+pub fn global_size<T, C: CommunicatorCollectives>(arr: &[T], comm: &C) -> usize {
+    let local_size = arr.len();
+    let mut global_size = 0;
+
+    comm.all_reduce_into(&local_size, &mut global_size, SystemOperation::sum());
+
+    global_size
+}
+
+/// Get the maximum value across all ranks
+pub fn global_max<T: Equivalence + Copy + Ord, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> T {
+    let local_max = arr.iter().max().unwrap();
+
+    // Just need to initialize global_max with something.
+    let mut global_max = *local_max;
+
+    comm.all_reduce_into(
+        local_max,
+        &mut global_max,
+        &UserOperation::commutative(|x, y| {
+            let x: &[T] = x.downcast().unwrap();
+            let y: &mut [T] = y.downcast().unwrap();
+            for (&x_i, y_i) in x.iter().zip(y) {
+                *y_i = x_i.max(*y_i);
+            }
+        }),
+    );
+
+    global_max
+}
+
+/// Get the minimum value across all ranks
+pub fn global_min<T: Equivalence + Copy + Ord, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> T {
+    let local_min = *arr.iter().min().unwrap();
+
+    // Just need to initialize global_min with something.
+    let mut global_min = local_min;
+
+    comm.all_reduce_into(
+        &local_min,
+        &mut global_min,
+        &UserOperation::commutative(|x, y| {
+            let x: &[T] = x.downcast().unwrap();
+            let y: &mut [T] = y.downcast().unwrap();
+            for (&x_i, y_i) in x.iter().zip(y) {
+                *y_i = x_i.min(*y_i);
+            }
+        }),
+    );
+
+    global_min
+}
+
+/// Communicate the first element of each local array back to the previous rank.
+pub fn communicate_back<T: Equivalence, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> Option<T> {
+    let rank = comm.rank();
+    let size = comm.size();
+
+    if size == 1 {
+        return None;
+    }
+
+    if rank == size - 1 {
+        comm.process_at_rank(rank - 1).send(arr.first().unwrap());
+        None
+    } else {
+        let (new_last, _status) = if rank > 0 {
+            p2p::send_receive(
+                arr.first().unwrap(),
+                &comm.process_at_rank(rank - 1),
+                &comm.process_at_rank(rank + 1),
+            )
+        } else {
+            comm.process_at_rank(1).receive::<T>()
+        };
+        Some(new_last)
+    }
+}
+
+/// Check if an array is sorted.
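+///
+/// The check is global: every rank checks its local slice, the boundary
+/// between consecutive ranks is checked via `communicate_back`, and the
+/// results are combined with a logical AND reduction. A sketch:
+///
+/// ```ignore
+/// let sorted_keys = parsort(&keys, &comm, &mut rng);
+/// assert!(is_sorted_array(&sorted_keys, &comm));
+/// ```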
+pub fn is_sorted_array<T: Equivalence + Ord, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> bool {
+    let mut sorted = true;
+    for (elem1, elem2) in arr.iter().tuple_windows() {
+        if elem1 > elem2 {
+            sorted = false;
+        }
+    }
+
+    if comm.size() == 1 {
+        return sorted;
+    }
+
+    if let Some(next_first) = communicate_back(arr, comm) {
+        // Combine the local result with the check across the rank boundary.
+        sorted = sorted && *arr.last().unwrap() <= next_first;
+    }
+
+    let mut global_sorted: bool = false;
+    comm.all_reduce_into(&sorted, &mut global_sorted, SystemOperation::logical_and());
+
+    global_sorted
+}
+
+/// Redistribute an array via an all_to_all_varcount operation.
+pub fn redistribute<T: Equivalence, C: CommunicatorCollectives>(
+    arr: &[T],
+    counts: &[i32],
+    comm: &C,
+) -> Vec<T> {
+    assert_eq!(counts.len(), comm.size() as usize);
+
+    // First send the counts around via an alltoall operation.
+
+    let mut recv_counts = vec![0; counts.len()];
+
+    comm.all_to_all_into(counts, &mut recv_counts);
+
+    // We have the recv_counts. Allocate space and set up the partitions.
+
+    let nelems = recv_counts.iter().sum::<i32>() as usize;
+
+    let mut output = Vec::<T>::with_capacity(nelems);
+    let out_buf: &mut [T] = unsafe { std::mem::transmute(output.spare_capacity_mut()) };
+
+    let send_partition = Partition::new(arr, counts, displacements(counts));
+    let mut recv_partition =
+        PartitionMut::new(out_buf, &recv_counts[..], displacements(&recv_counts));
+
+    comm.all_to_all_varcount_into(&send_partition, &mut recv_partition);
+
+    unsafe { output.set_len(nelems) };
+
+    output
+}
+
+/// Perform a global inclusive cumulative sum operation.
+///
+/// For the array `[1, 3, 5, 7]` the output will be `[1, 4, 9, 16]`.
+pub fn global_inclusive_cumsum<T: Equivalence + Zero + Copy, C: CommunicatorCollectives>(
+    arr: &[T],
+    comm: &C,
+) -> Vec<T> {
+    let mut scan: Vec<T> = arr
+        .iter()
+        .scan(<T as Zero>::zero(), |state, x| {
+            *state = *x + *state;
+            Some(*state)
+        })
+        .collect_vec();
+    let scan_last = *scan.last().unwrap();
+    let mut scan_result = T::zero();
+    comm.exclusive_scan_into(&scan_last, &mut scan_result, SystemOperation::sum());
+    for elem in &mut scan {
+        *elem = *elem + scan_result;
+    }
+
+    scan
+}
+
+/// Distribute a sorted sequence into bins.
+///
+/// For an array with n elements to be distributed into p bins,
+/// the array `bins` has p elements. The bins are defined by half-open intervals
+/// of the form [b_j, b_{j+1}). The final bin is the half-open interval [b_{p-1}, \infty).
+/// It is assumed that the bins and the elements are both sorted sequences and that
+/// every element has an associated bin.
+/// The function returns a p element array with the counts of how many elements go to each bin.
+/// Since the sequence is sorted this fully defines which element goes into which bin.
+pub fn sort_to_bins<T: Ord>(sorted_keys: &[T], bins: &[T]) -> Vec<usize> {
+    let nbins = bins.len();
+
+    // Make sure that the smallest element of the sorted keys fits into the bins.
+    assert!(bins.first().unwrap() <= sorted_keys.first().unwrap());
+
+    // Deal with the special case that there is only one bin.
+    // This means that all elements are in the one bin.
+    if nbins == 1 {
+        return vec![sorted_keys.len(); 1];
+    }
+
+    let mut bin_counts = vec![0; nbins];
+
+    // This iterates over each possible bin and also returns the associated count.
+    // The last bin position is not iterated over since for an array with p elements
+    // there are p-1 tuple windows.
+    let mut bin_iter = izip!(
+        bin_counts.iter_mut(),
+        bins.iter().tuple_windows::<(&T, &T)>(),
+    );
+
+    // We take the first element of the bin iterator.
There will always be at least one since
+    // there are at least two bins (an actual one, and the last half-infinite one).
+    let mut r: &mut usize;
+    let mut bin_start: &T;
+    let mut bin_end: &T;
+    (r, (bin_start, bin_end)) = bin_iter.next().unwrap();
+
+    let mut count = 0;
+    'outer: for key in sorted_keys.iter() {
+        if bin_start <= key && key < bin_end {
+            *r += 1;
+            count += 1;
+        } else {
+            // Move the bin forward until it fits. There will always be a fitting bin.
+            loop {
+                if let Some((rn, (bsn, ben))) = bin_iter.next() {
+                    if bsn <= key && key < ben {
+                        // We have found the next fitting bin for our current element.
+                        // Can register it and go back to the outer for loop.
+                        *rn += 1;
+                        r = rn;
+                        bin_start = bsn;
+                        bin_end = ben;
+                        count += 1;
+                        break;
+                    }
+                } else {
+                    // We have no more fitting bin. So break the outer loop.
+                    break 'outer;
+                }
+            }
+        }
+    }
+
+    // We now have everything but the last bin. Just bunch the remaining elements into
+    // the last count.
+    *bin_counts.last_mut().unwrap() = sorted_keys.len() - count;
+
+    bin_counts
+}
+
+/// Redistribute locally sorted keys with respect to bins.
+///
+/// - The array `sorted_keys` is assumed to be sorted within each process. It need not be globally sorted.
+/// - If there are `r` ranks in the communicator, the size of `bins` must be `r`.
+/// - The bins are defined through half-open intervals `[bin[0], bin[1])`, .... This defines r-1 bins. The
+///   last bin is the half-open interval `[bin[r-1], \infty)`.
+/// - All array elements must be larger than or equal to `bin[0]`. This means that each element can be sorted into a bin.
+pub fn redistribute_by_bins<T: Equivalence + Ord, C: CommunicatorCollectives>(
+    sorted_keys: &[T],
+    bins: &[T],
+    comm: &C,
+) -> Vec<T> {
+    let counts = sort_to_bins(sorted_keys, bins);
+    let counts = counts.iter().map(|elem| *elem as i32).collect_vec();
+    redistribute(sorted_keys, &counts, comm)
+}
+
+/// Generate random keys for testing.
+pub fn generate_random_keys<R: Rng>(nkeys: usize, rng: &mut R) -> Vec<MortonKey> {
+    let mut result = Vec::<MortonKey>::with_capacity(nkeys);
+
+    let xindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys);
+    let yindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys);
+    let zindices = rand::seq::index::sample(rng, LEVEL_SIZE as usize, nkeys);
+
+    for (xval, yval, zval) in izip!(xindices.iter(), yindices.iter(), zindices.iter()) {
+        result.push(MortonKey::from_index_and_level(
+            [xval, yval, zval],
+            DEEPEST_LEVEL as usize,
+        ));
+    }
+
+    result
+}
+
+/// Generate random points for testing.
+pub fn generate_random_points<R: Rng, C: CommunicatorCollectives>(
+    npoints: usize,
+    rng: &mut R,
+    comm: &C,
+) -> Vec<Point> {
+    let mut points = Vec::<Point>::with_capacity(npoints);
+    let rank = comm.rank() as usize;
+
+    for index in 0..npoints {
+        points.push(Point::new(
+            [rng.gen(), rng.gen(), rng.gen()],
+            npoints * rank + index,
+        ));
+    }
+
+    points
+}
+
+/// Get a seeded rng
+pub fn seeded_rng(seed: usize) -> ChaCha8Rng {
+    ChaCha8Rng::seed_from_u64(seed as u64)
+}
+
+/// Compute displacements from a vector of counts.
+///
+/// This is useful for global MPI varcount operations. Let
+/// counts be `[3, 4, 5]`. Then the corresponding displacements are
+/// `[0, 3, 7]`. Note that the last element `5` is ignored.
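+///
+/// ```ignore
+/// assert_eq!(displacements(&[3, 4, 5]), vec![0, 3, 7]);
+/// ```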
+pub fn displacements(counts: &[i32]) -> Vec<i32> {
+    counts
+        .iter()
+        .scan(0, |acc, &x| {
+            let tmp = *acc;
+            *acc += x;
+            Some(tmp)
+        })
+        .collect()
+}
+
+#[cfg(test)]
+mod test {
+    use itertools::Itertools;
+
+    use super::sort_to_bins;
+
+    #[test]
+    fn test_sort_to_bins() {
+        let elems = (0..100).collect_vec();
+        let bins = [0, 17, 55];
+
+        let counts = sort_to_bins(&elems, &bins);
+
+        assert_eq!(counts[0], 17);
+        assert_eq!(counts[1], 38);
+        assert_eq!(counts[2], 45);
+    }
+}