diff --git a/phylo2vec/src/tree_vec/mod.rs b/phylo2vec/src/tree_vec/mod.rs index 4b45951..bb707e8 100644 --- a/phylo2vec/src/tree_vec/mod.rs +++ b/phylo2vec/src/tree_vec/mod.rs @@ -1,7 +1,13 @@ use crate::utils::sample; pub mod ops; +use ops::{ + build_vector, find_coords_of_first_leaf, order_cherries, order_cherries_no_parents, Ancestry, +}; +/// A vector representation of a phylogenetic tree +/// +/// Contains the tree structure, branch lengths, taxa, and rootedness #[derive(Debug, PartialEq, Clone)] pub struct TreeVec { n_leaf: usize, @@ -11,7 +17,17 @@ pub struct TreeVec { is_rooted: bool, } +/// Implementation of the `TreeVec` struct impl TreeVec { + /// Creates a new `TreeVec` instance + /// + /// # Arguments + /// * `data` - Vector containing the tree structure + /// * `branch_lengths` - Optional vector of branch length tuples (start, end) + /// * `taxa` - Optional vector of taxon names + /// + /// # Returns + /// A new `TreeVec` instance with the specified data and properties pub fn new( data: Vec, branch_lengths: Option>, @@ -27,25 +43,127 @@ impl TreeVec { } } + /// Creates a new random tree with specified number of leaves + /// + /// # Arguments + /// * `n_leaves` - Number of leaves in the tree + /// * `ordering` - Whether to maintain ordered structure + /// + /// # Returns + /// A new randomly generated `TreeVec` instance pub fn from_sample(n_leaves: usize, ordering: bool) -> Self { let v = sample(n_leaves, ordering); TreeVec::new(v, None, None) } + /// Converts the tree to Newick format + /// + /// # Returns + /// A String containing the Newick representation of the tree pub fn to_newick(&self) -> String { return ops::to_newick(&self.data); } - pub fn get_ancestry(&self) -> Vec<[usize; 3]> { + /// Gets the ancestry matrix representation of the tree + /// + /// # Returns + /// An `Ancestry` type containing parent-child relationships + pub fn get_ancestry(&self) -> Ancestry { return ops::get_ancestry(&self.data); } - pub fn add_leaf(leaf: usize, branch: usize) -> Self { - unimplemented!(); + /// Adds a new leaf to the tree + /// + /// # Arguments + /// * `leaf` - Index of the new leaf to add + /// * `branch` - Index of the branch to attach the leaf to + /// + /// # Side effects + /// Modifies the tree structure by adding the new leaf and updating indices + pub fn add_leaf(&mut self, leaf: usize, branch: usize) { + self.data.push(branch); + + let mut ancestry_add = self.get_ancestry(); + + println!("{:?}", ancestry_add); + let mut found_first_leaf = false; + for r in 0..ancestry_add.len() { + for c in 0..3 { + if !found_first_leaf && ancestry_add[r][c] == self.data.len() { + // Find the indices of the first leaf + // and then set the value to the new leaf + ancestry_add[r][c] = leaf; + found_first_leaf = true; + } else if ancestry_add[r][c] >= leaf { + ancestry_add[r][c] += 1; + } + } + } + + // ancestry_add[leaf_coords][leaf_col] = leaf as isize; + // let ancestry_add_ref = &mut ancestry_add; + order_cherries(&mut ancestry_add); + order_cherries_no_parents(&mut ancestry_add); + self.data = build_vector(ancestry_add); } - pub fn remove_leaf(leaf: usize) -> Self { - unimplemented!(); + /// Removes a leaf from the tree + /// + /// # Arguments + /// * `leaf` - Index of the leaf to remove + /// + /// # Returns + /// The index of the sister node of the removed leaf + /// + /// # Side effects + /// Modifies the tree structure by removing the leaf and updating indices + pub fn remove_leaf(&mut self, leaf: usize) -> usize { + let ancestry = self.get_ancestry(); + let leaf_coords = find_coords_of_first_leaf(&ancestry, leaf); + let leaf_row = leaf_coords.0; + let leaf_col = leaf_coords.1; + + // Find the parent of the leaf to remove + let parent = ancestry[leaf_row][2]; + let sister = ancestry[leaf_row][1 - leaf_col]; + let num_cherries = ancestry.len(); + + let mut ancestry_rm = Vec::with_capacity(num_cherries - 1); + + for r in 0..num_cherries - 1 { + let mut new_row = if r < leaf_row { + ancestry[r].clone() + } else { + ancestry[r + 1].clone() + }; + + for c in 0..3 { + let mut node = new_row[c]; + + if node == parent { + node = sister; + } + + // Subtract 1 for leaves > "leaf" + // (so that the vector is still valid) + if node > leaf { + node -= 1; + if node >= parent { + node -= 1; + } + } + + new_row[c] = node; + } + + ancestry_rm.push(new_row); + } + + order_cherries(&mut ancestry_rm); + order_cherries_no_parents(&mut ancestry_rm); + self.data = build_vector(ancestry_rm); + + return sister; } } @@ -71,6 +189,9 @@ mod tests { assert_eq!(tree.taxa, None); } + /// Test the creation of a new tree from a sample + /// + /// Tests are using 50 leaf tree with ordering and no ordering #[rstest] #[case(50, true)] #[case(50, false)] @@ -82,6 +203,9 @@ mod tests { assert_eq!(tree.taxa, None); } + /// Test the conversion of a tree to Newick format + /// + /// Tests are using 5 or less leaf tree with different structures #[rstest] #[case(vec![0, 0, 0, 1, 3], "(((0,(3,5)6)8,2)9,(1,4)7)10;")] #[case(vec![0, 1, 2, 3, 4], "(0,(1,(2,(3,(4,5)6)7)8)9)10;")] @@ -92,6 +216,9 @@ mod tests { assert_eq!(newick, expected); } + /// Test the retrieval of the ancestry matrix + /// + /// Tests are using 5 or less leaf tree with different structures #[rstest] #[case(vec![0, 0, 0, 1, 3], vec![[3, 5, 6], [1, 4, 7], @@ -105,9 +232,46 @@ mod tests { #[case(vec![0, 0, 1], vec![[1, 3, 4], [0, 2, 5], [5, 4, 6]])] - fn test_get_ancestry(#[case] v: Vec, #[case] expected: Vec<[usize; 3]>) { + fn test_get_ancestry(#[case] v: Vec, #[case] expected: Ancestry) { let tree = TreeVec::new(v, None, None); let ancestry = tree.get_ancestry(); assert_eq!(ancestry, expected); } + + /// Test the addition of a new leaf to the tree + /// + /// Tests are using 6 leaf tree with different leaf and branch indices + #[rstest] + #[case(vec![0, 1, 2, 5, 4, 2], 5, 3, vec![0, 1, 2, 5, 3, 4, 2])] + #[case(vec![0, 1, 2, 5, 4, 2], 7, 0, vec![0, 1, 2, 5, 4, 2, 0])] + #[case(vec![0, 1, 2, 5, 4, 2], 7, 2, vec![0, 1, 2, 5, 4, 2, 2])] + fn test_add_leaf( + #[case] v: Vec, + #[case] leaf: usize, + #[case] branch: usize, + #[case] expected: Vec, + ) { + let mut tree = TreeVec::new(v, None, None); + tree.add_leaf(leaf, branch); + assert_eq!(tree.data, expected); + } + + /// Test the removal of a leaf from the tree + /// + /// Tests are using 6 leaf tree with different leaf and sister branch indices + #[rstest] + #[case(vec![0, 1, 2, 5, 4, 2], 5, 4, vec![0, 1, 2, 5, 2])] + #[case(vec![0, 1, 2, 5, 4, 2], 6, 2, vec![0, 1, 2, 5, 4])] + #[case(vec![0, 1, 2, 5, 4, 2], 0, 11, vec![0, 1, 4, 3, 1])] + fn test_remove_leaf( + #[case] v: Vec, + #[case] leaf: usize, + #[case] branch: usize, + #[case] expected: Vec, + ) { + let mut tree = TreeVec::new(v, None, None); + let sister = tree.remove_leaf(leaf); + assert_eq!(tree.data, expected); + assert_eq!(sister, branch); + } } diff --git a/phylo2vec/src/tree_vec/ops/mod.rs b/phylo2vec/src/tree_vec/ops/mod.rs index 5b2ff2d..94e9c27 100644 --- a/phylo2vec/src/tree_vec/ops/mod.rs +++ b/phylo2vec/src/tree_vec/ops/mod.rs @@ -2,4 +2,7 @@ pub mod avl; pub mod vector; #[allow(unused_imports)] -pub use vector::{build_newick, get_ancestry, get_pairs, get_pairs_avl, to_newick, Ancestry}; +pub use vector::{ + build_newick, build_vector, find_coords_of_first_leaf, get_ancestry, get_pairs, get_pairs_avl, + order_cherries, order_cherries_no_parents, to_newick, Ancestry, +}; diff --git a/phylo2vec/src/tree_vec/ops/vector.rs b/phylo2vec/src/tree_vec/ops/vector.rs index 4b1667d..49f0518 100644 --- a/phylo2vec/src/tree_vec/ops/vector.rs +++ b/phylo2vec/src/tree_vec/ops/vector.rs @@ -1,5 +1,6 @@ use crate::tree_vec::ops::avl::{AVLTree, Pair}; use crate::utils::is_unordered; +use std::usize; /// A type alias for the Ancestry type, which is a vector of vectors representing [child1, child2, parent] pub type Ancestry = Vec<[usize; 3]>; @@ -121,25 +122,25 @@ pub fn get_ancestry(v: &Vec) -> Ancestry { // Initialize Ancestry with capacity `k` let mut ancestry: Ancestry = Vec::with_capacity(num_of_leaves); // Keep track of child->highest parent relationship - let mut parents: Vec = vec![-1; 2 * num_of_leaves + 1]; + let mut parents: Vec = vec![usize::MAX; 2 * num_of_leaves + 1]; for i in 0..num_of_leaves { let (c1, c2) = pairs[i]; - let parent_of_child1 = if parents[c1] != -1 { - parents[c1] as usize + let parent_of_child1 = if parents[c1] != usize::MAX { + parents[c1] } else { c1 }; - let parent_of_child2 = if parents[c2] != -1 { - parents[c2] as usize + let parent_of_child2 = if parents[c2] != usize::MAX { + parents[c2] } else { c2 }; // Next parent - let next_parent = (num_of_leaves + i + 1) as isize; - ancestry.push([parent_of_child1, parent_of_child2, next_parent as usize]); + let next_parent = num_of_leaves + i + 1; + ancestry.push([parent_of_child1, parent_of_child2, next_parent]); // Update the parents of current children parents[c1] = next_parent; @@ -154,8 +155,7 @@ fn _build_newick_recursive_inner(p: usize, ancestry: &Ancestry) -> String { let leaf_max = ancestry.len(); // Extract the children (c1, c2) and ignore the parent from the ancestry tuple - let c1 = ancestry[p - leaf_max - 1][0]; - let c2 = ancestry[p - leaf_max - 1][1]; + let [c1, c2, _] = ancestry[p - leaf_max - 1]; // Recursive calls for left and right children, checking if they are leaves or internal nodes let left = if c1 > leaf_max { @@ -188,3 +188,112 @@ pub fn to_newick(v: &Vec) -> String { let ancestry: Ancestry = get_ancestry(&v); build_newick(&ancestry) } + +pub fn find_coords_of_first_leaf(ancestry: &Ancestry, leaf: usize) -> (usize, usize) { + for r in 0..ancestry.len() { + for c in 0..3 { + if ancestry[r][c] == leaf { + return (r, c); + } + } + } + panic!("Leaf not found in ancestry"); +} + +pub fn order_cherries(ancestry: &mut Ancestry) { + let num_cherries = ancestry.len(); + let num_nodes = 2 * num_cherries + 2; + + let mut min_desc = vec![usize::MAX; num_nodes]; + + // Sort by the parent node (ascending order) + ancestry.sort_by_key(|x| x[2]); + + for i in 0..num_cherries { + let [c1, c2, p] = ancestry[i]; + // Get the minimum descendant of c1 and c2 (if they exist) + // min_desc[child_x] doesn't exist, min_desc_x --> child_x + let min_desc1 = if min_desc[c1] != usize::MAX { + min_desc[c1] + } else { + c1 + }; + let min_desc2 = if min_desc[c2] != usize::MAX { + min_desc[c2] + } else { + c2 + }; + + // Collect the minimum descendant and allocate it to min_desc[parent] + let desc_min = std::cmp::min(min_desc1, min_desc2); + min_desc[p] = desc_min; + + // Instead of the parent, we collect the max node + let desc_max = std::cmp::max(min_desc1, min_desc2); + ancestry[i] = [min_desc1, min_desc2, desc_max]; + } +} + +pub fn order_cherries_no_parents(ancestry: &mut Ancestry) { + let num_cherries = ancestry.len(); + + for i in 0..num_cherries { + // Find the next index to process: + // The goal is to find the row with the highest leaf + // where both leaves were previously un-visited + // why? If a leaf in a cherry already appeared in the ancestry, + // it means that leaf was already involved in a shallower cherry + let mut idx = usize::MAX; + + // Initially, all cherries have not been processed + let mut unvisited = vec![true; num_cherries + 1]; + + // Temporary max leaf + let mut max_leaf = 0; + + for j in i..num_cherries { + let [c1, c2, c_max] = ancestry[j]; + + if c_max > max_leaf { + if unvisited[c1] && unvisited[c2] { + max_leaf = c_max; + idx = j; + } + } + + // c1 and c2 have been processed + unvisited[c1] = false; + unvisited[c2] = false; + } + + if idx != i { + ancestry[i..idx + 1].rotate_right(1); + } + } +} + +pub fn build_vector(cherries: Ancestry) -> Vec { + let num_cherries = cherries.len(); + let num_leaves = num_cherries + 1; + + let mut v = vec![0; num_cherries]; + let mut idxs = vec![0; num_leaves]; + + for i in 0..num_cherries { + let [c1, c2, c_max] = cherries[i]; + + let mut idx = 0; + + for j in 1..c_max { + idx += idxs[j]; + } + // Reminder: v[i] = j --> branch i yields leaf j + v[c_max - 1] = if idx == 0 { + std::cmp::min(c1, c2) + } else { + c_max - 1 + idx + }; + idxs[c_max] = 1; + } + return v; +}