From 1262e10936b1468acde6f6ed42140a00af173419 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Fri, 28 Jun 2024 21:49:52 +0100 Subject: [PATCH 1/5] Islands() --- DESCRIPTION | 4 +- NAMESPACE | 1 + NEWS.md | 3 +- R/Islands.R | 79 +++++++++++++++++++++++++++++++++++ inst/REFERENCES.bib | 11 +++++ man/Islands.Rd | 76 +++++++++++++++++++++++++++++++++ man/MSTSegments.Rd | 1 + man/MapTrees.Rd | 1 + man/MappingQuality.Rd | 1 + man/SpectralEigens.Rd | 1 + man/cluster-statistics.Rd | 1 + man/median.multiPhylo.Rd | 1 + tests/testthat/test-Islands.R | 19 +++++++++ vignettes/landscapes.Rmd | 10 +++-- vignettes/tree-islands.Rmd | 30 +++++++++++++ vignettes/treespace.Rmd | 41 +++++++++++++++++- 16 files changed, 273 insertions(+), 7 deletions(-) create mode 100644 R/Islands.R create mode 100644 man/Islands.Rd create mode 100644 tests/testthat/test-Islands.R create mode 100644 vignettes/tree-islands.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index 9bf1deb9..749ada12 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: TreeDist Type: Package Title: Calculate and Map Distances Between Phylogenetic Trees -Version: 2.7.1.9002 +Version: 2.7.1.9003 Authors@R: c(person("Martin R.", "Smith", email = "martin.smith@durham.ac.uk", role = c("aut", "cre", "cph", "prg"), @@ -29,6 +29,8 @@ Description: Implements measures of tree similarity, including (1996) . Includes tools for visualizing mappings of tree space (Smith 2022) , + for identifying islands of trees (Silva and Wilkinson 2021) + , for calculating the median of sets of trees, and for computing the information content of trees and splits. Copyright: Jonker-Volgenant Linear Assignment Problem implementation diff --git a/NAMESPACE b/NAMESPACE index fe255511..b780b055 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -51,6 +51,7 @@ export(GeneralizedRF) export(GetParallel) export(InfoRobinsonFoulds) export(InfoRobinsonFouldsSplits) +export(Islands) export(JaccardRobinsonFoulds) export(JaccardSplitSimilarity) export(KCDiameter) diff --git a/NEWS.md b/NEWS.md index a3a8e4db..bfc51f47 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ -# branch no-phangorn (9002) +# TreeDist 2.7.1.9003 +- `Islands()` allows the identification of islands of trees. - `PathDist()` uses internal implementation of path distance. diff --git a/R/Islands.R b/R/Islands.R new file mode 100644 index 00000000..7c4a51e2 --- /dev/null +++ b/R/Islands.R @@ -0,0 +1,79 @@ +#' Find islands from distance matrix +#' +#' `Islands()` assigns a set of objects to islands, such that all elements +#' within an island can form a connected graph in which each edge is no longer +#' than `threshold` distance units \insertRef{Silva2021}{TreeDist}. +#' +#' @inheritParams SpectralEigens +#' @param threshold Elements greater than `threshold` distance units will not be +#' assigned to the same island. +#' @param dense Logical; if `FALSE`, each island will be named according to the +#' index of its lowest-indexed member; if `TRUE`, each island will be numbered +#' sequentially from `1`, ordered by the index of the lowest-indexed member. +#' @param smallest Integer; Islands comprising no more than `smallest` elements +#' will be assigned to island `NA`. +#' @return `Islands()` returns a vector listing the island to which +#' each element is assigned. +#' @references \insertAllCited{} +#' @examples +#' library("TreeTools", quietly = TRUE) +#' # Generate a set of trees +#' trees <- as.phylo(as.TreeNumber(BalancedTree(16)) + c(-(40:20), 70:105), 16) +#' +#' # Calculate distances between trees +#' distances <- ClusteringInfoDist(trees) +#' summary(distances) +#' +#' # Assign trees to islands +#' isle <- Islands(distances, quantile(distances, 0.1)) +#' table(isle) +#' +#' # Indicate island membership on 2D mapping of tree distances +#' mapping <- cmdscale(distances, 2) +#' plot(mapping, col = isle + 1, +#' asp = 1, # Preserve aspect ratio - do not distort distances +#' ann = FALSE, axes = FALSE, # Don't label axes: dimensions are meaningless) +#' pch = 16 # Plotting character: Filled circle +#' ) +#' +#' # Compare strict consensus with island consensus trees +#' oPar <- par(mfrow = c(2, 2), mai = rep(0.1, 4)) +#' plot(Consensus(trees), main = "Strict") +#' plot(Consensus(trees[isle == 1]), edge.col = 2, main = "Island 1") +#' plot(Consensus(trees[isle == 2]), edge.col = 3, main = "Island 2") +#' plot(Consensus(trees[isle == 3]), edge.col = 4, main = "Island 3") +#' +#' # Restore graphical parameters +#' par(oPar) +#' @template MRS +#' @family tree space functions +#' @export +Islands <- function(D, threshold, dense = TRUE, smallest = 0) { + linked <- as.matrix(D) <= threshold + n <- dim(linked)[[1]] + ret <- integer(n) + i <- 1 + repeat { + links <- seq_len(n) == i + repeat { + nowLinked <- colSums(linked[links, , drop = FALSE]) > 0 + if (any(nowLinked[!links])) { + # Added to island + links <- nowLinked + } else { + break + } + } + ret[links] <- i + i <- which.min(ret) + if (ret[[i]]) break + } + tab <- table(ret, dnn = NULL) + ret[ret %in% as.integer(names(tab)[tab < smallest])] <- NA + + if (dense) { + as.integer(factor(rank(ret, ties.method = "min", na.last = "keep"))) + } else { + ret + } +} \ No newline at end of file diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 0690eda5..b1c07ce3 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -459,6 +459,17 @@ @article{Shannon1948 journal = {Bell System Technical Journal} } +@article{Silva2021, + title = {On Defining and Finding Islands of Trees and Mitigating Large Island Bias}, + author = {Silva, Ana Serra and Wilkinson, Mark}, + date = {2021-11-01}, + journaltitle = {Systematic Biology}, + volume = {70}, + number = {6}, + pages = {1282--1294}, + doi = {10.1093/sysbio/syab015} +} + @article{Smith2014, author = {Smith, Martin R. and Ortega-Hern{\'{a}}ndez, Javier}, doi = {10.1038/nature13576}, diff --git a/man/Islands.Rd b/man/Islands.Rd new file mode 100644 index 00000000..56763a30 --- /dev/null +++ b/man/Islands.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Islands.R +\name{Islands} +\alias{Islands} +\title{Find islands from distance matrix} +\usage{ +Islands(D, threshold, dense = TRUE) +} +\arguments{ +\item{D}{Square matrix or \code{dist} object containing Euclidean distances +between data points.} + +\item{threshold}{Elements greater than \code{threshold} distance units will not be +assigned to the same island.} + +\item{dense}{Logical; if \code{FALSE}, each island will be named according to the +index of its lowest-indexed member; if \code{TRUE}, each island will be numbered +sequentially from \code{1}, ordered by the index of the lowest-indexed member.} +} +\value{ +\code{Islands()} returns a vector listing the island to which +each element is assigned. +} +\description{ +\code{Islands()} assigns a set of objects to islands, such that all elements +within an island can form a connected graph in which each edge is no longer +than \code{threshold} distance units \insertRef{Silva2021}{TreeDist}. +} +\examples{ +library("TreeTools", quietly = TRUE) +# Generate a set of trees +trees <- as.phylo(as.TreeNumber(BalancedTree(16)) + c(-(40:20), 70:105), 16) + +# Calculate distances between trees +distances <- ClusteringInfoDist(trees) +summary(distances) + +# Assign trees to islands +isle <- Islands(distances, quantile(distances, 0.1)) +table(isle) + +# Indicate island membership on 2D mapping of tree distances +mapping <- cmdscale(distances, 2) +plot(mapping, col = isle + 1, + asp = 1, # Preserve aspect ratio - do not distort distances + ann = FALSE, axes = FALSE, # Don't label axes: dimensions are meaningless) + pch = 16 # Plotting character: Filled circle +) + +# Compare strict consensus with island consensus trees +oPar <- par(mfrow = c(2, 2), mai = rep(0.1, 4)) +plot(Consensus(trees), main = "Strict") +plot(Consensus(trees[isle == 1]), edge.col = 2, main = "Island 1") +plot(Consensus(trees[isle == 2]), edge.col = 3, main = "Island 2") +plot(Consensus(trees[isle == 3]), edge.col = 4, main = "Island 3") + +# Restore graphical parameters +par(oPar) +} +\references{ +\insertAllCited{} +} +\seealso{ +Other tree space functions: +\code{\link{MSTSegments}()}, +\code{\link{MapTrees}()}, +\code{\link{MappingQuality}()}, +\code{\link{SpectralEigens}()}, +\code{\link{cluster-statistics}}, +\code{\link{median.multiPhylo}()} +} +\author{ +\href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} +(\href{mailto:martin.smith@durham.ac.uk}{martin.smith@durham.ac.uk}) +} +\concept{tree space functions} diff --git a/man/MSTSegments.Rd b/man/MSTSegments.Rd index 7a163192..1ac77390 100644 --- a/man/MSTSegments.Rd +++ b/man/MSTSegments.Rd @@ -71,6 +71,7 @@ PlotTools::SpectrumLegend( } \seealso{ Other tree space functions: +\code{\link{Islands}()}, \code{\link{MapTrees}()}, \code{\link{MappingQuality}()}, \code{\link{SpectralEigens}()}, diff --git a/man/MapTrees.Rd b/man/MapTrees.Rd index 0c101316..59890840 100644 --- a/man/MapTrees.Rd +++ b/man/MapTrees.Rd @@ -111,6 +111,7 @@ Full detail of tree space analysis in R is provided in the accompanying \href{https://ms609.github.io/TreeDist/articles/treespace.html}{vignette}. Other tree space functions: +\code{\link{Islands}()}, \code{\link{MSTSegments}()}, \code{\link{MappingQuality}()}, \code{\link{SpectralEigens}()}, diff --git a/man/MappingQuality.Rd b/man/MappingQuality.Rd index 9beaf509..05751dfb 100644 --- a/man/MappingQuality.Rd +++ b/man/MappingQuality.Rd @@ -42,6 +42,7 @@ MappingQuality(distances, dist(mapping), 4) } \seealso{ Other tree space functions: +\code{\link{Islands}()}, \code{\link{MSTSegments}()}, \code{\link{MapTrees}()}, \code{\link{SpectralEigens}()}, diff --git a/man/SpectralEigens.Rd b/man/SpectralEigens.Rd index 241736c6..16067450 100644 --- a/man/SpectralEigens.Rd +++ b/man/SpectralEigens.Rd @@ -38,6 +38,7 @@ plot(cmdscale(distances), pch = 15, col = clusts$cluster) } \seealso{ Other tree space functions: +\code{\link{Islands}()}, \code{\link{MSTSegments}()}, \code{\link{MapTrees}()}, \code{\link{MappingQuality}()}, diff --git a/man/cluster-statistics.Rd b/man/cluster-statistics.Rd index dbb3b5c3..14386843 100644 --- a/man/cluster-statistics.Rd +++ b/man/cluster-statistics.Rd @@ -84,6 +84,7 @@ MeanMSTEdge(points, cluster) } \seealso{ Other tree space functions: +\code{\link{Islands}()}, \code{\link{MSTSegments}()}, \code{\link{MapTrees}()}, \code{\link{MappingQuality}()}, diff --git a/man/median.multiPhylo.Rd b/man/median.multiPhylo.Rd index 1686df7a..43282ceb 100644 --- a/man/median.multiPhylo.Rd +++ b/man/median.multiPhylo.Rd @@ -80,6 +80,7 @@ Consensus methods: \code{\link[TreeTools:ConsensusWithout]{TreeTools::ConsensusWithout()}} Other tree space functions: +\code{\link{Islands}()}, \code{\link{MSTSegments}()}, \code{\link{MapTrees}()}, \code{\link{MappingQuality}()}, diff --git a/tests/testthat/test-Islands.R b/tests/testthat/test-Islands.R new file mode 100644 index 00000000..b02d3023 --- /dev/null +++ b/tests/testthat/test-Islands.R @@ -0,0 +1,19 @@ +test_that("Islands() works", { + library("TreeTools", quietly = TRUE) + trees <- as.phylo(as.TreeNumber(BalancedTree(16)) + c(70:78, -(39:30), 80:99), 16) + distances <- ClusteringInfoDist(trees) + expect_equal(Islands(distances, 2.5), rep(c(1, 2, 1), c(9, 10, 20))) + expect_equal(Islands(distances, 2.5, FALSE), rep(c(1, 10, 1), c(9, 10, 20))) + + + trees <- as.phylo(c(0:10, 1000:1005, 2000:2019), 16) + distances <- ClusteringInfoDist(trees) + expect_equal(Islands(distances, 2.5, TRUE, smallest = 6), + rep(c(1, 2, 3), c(11, 6, 20))) + expect_equal(Islands(distances, 2.5, TRUE, smallest = 7), + rep(c(1, NA, 2), c(11, 6, 20))) + expect_equal(Islands(distances, 2.5, FALSE, smallest = 6), + rep(c(1, 12, 18), c(11, 6, 20))) + expect_equal(Islands(distances, 2.5, FALSE, smallest = 7), + rep(c(1, NA, 18), c(11, 6, 20))) +}) diff --git a/vignettes/landscapes.Rmd b/vignettes/landscapes.Rmd index d3d2bd19..68bea203 100644 --- a/vignettes/landscapes.Rmd +++ b/vignettes/landscapes.Rmd @@ -37,12 +37,14 @@ library("TreeDist") # Generate a set of trees trees <- as.phylo(as.TreeNumber(BalancedTree(16)) + 0:100 - 15, 16) -# Create a 2D mapping +# Calculate distances between trees distances <- ClusteringInfoDist(trees) -mapping <- cmdscale(distances, 2) +summary(distances) + +islands <- Islands(distances, threshold = 0.1) -# Score trees according to their balance -scores <- TotalCopheneticIndex(trees) +# Create a 2D mapping +mapping <- cmdscale(distances, 2) # Normalize scores scoreMax <- TCIContext(trees[[1]])[["maximum"]] diff --git a/vignettes/tree-islands.Rmd b/vignettes/tree-islands.Rmd new file mode 100644 index 00000000..c33810dc --- /dev/null +++ b/vignettes/tree-islands.Rmd @@ -0,0 +1,30 @@ +--- +title: "Finding islands of phylogenetic trees" +author: "[Martin R. Smith](https://smithlabdurham.github.io/)" +output: + rmarkdown::html_vignette: + fig_width: 6 + fig_height: 4 +bibliography: ../inst/REFERENCES.bib +csl: https://raw.githubusercontent.com/citation-style-language/styles/master/apa-old-doi-prefix.csl +vignette: > + %\VignetteIndexEntry{Finding islands of phylogenetic trees} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +Collections of phylogenetic trees, such as those output by Bayesian or parsimony +analysis, may occupy discrete regions of tree space such that individual +clusters of trees are separated from all other trees by a certain distance. +Finding such islands, and taking their consensus, can be an effective way +of summarising conflicting signal in a phylogenetic dataset [@Silva2021]. + +```{r col-trees-by-score} +# Load required libraries +library("TreeTools", quietly = TRUE) +library("TreeDist") + +# Generate a set of trees +trees <- as.phylo(as.TreeNumber(BalancedTree(16)) + 0:100 - 15, 16) + +``` \ No newline at end of file diff --git a/vignettes/treespace.Rmd b/vignettes/treespace.Rmd index ea2400c3..a9686177 100644 --- a/vignettes/treespace.Rmd +++ b/vignettes/treespace.Rmd @@ -207,7 +207,7 @@ points(seq_along(trees), rep(1, length(trees)), pch = 16, Another thing we may wish to do is to take the consensus of each cluster: -```{r consensus, fig.align="center"} +```{r cluster-consensus, fig.align="center"} par(mfrow = c(1, 2), mar = rep(0.2, 4)) col1 <- spectrum[mean(treeNumbers[cluster == 1])] col2 <- spectrum[mean(treeNumbers[cluster == 2])] @@ -219,6 +219,45 @@ plot(consensus(trees[cluster == 2], p = 0.5), Here, we learn that the two clusters are distinguished by the position of `t7`. + +### Identifying islands + +Besides clustering, we can also define 'islands' in tree space that are +separated by a 'moat', such that all trees on one island are separated from +all trees on another by at least a certain distance [@Silva2021]. + +```{r island-id, fig.asp = 1, fig.width = 3, fig.align="center"} +par(mar = rep(0, 4)) +# set a threshold corresponding to the width of the "moat" between islands +threshold <- 1.8 +island <- Islands(distances, threshold) + +# See how many trees are on each island +table(island) + +# Let's ignore the small islands for now +largeIsle <- Islands(distances, threshold, smallest = 5) + +# Colour trees according to their island +plot(mapping, + asp = 1, # Preserve aspect ratio - do not distort distances + ann = FALSE, axes = FALSE, # Don't label axes: dimensions are meaningless + col = ifelse(is.na(largeIsle), "grey", largeIsle + 1), + pch = 16 + ) +``` + +Let's view the consensus of each large island cluster: + +```{r island-consensus, fig.align="center"} +par(mfrow = c(1, 2), mar = rep(0.2, 4)) +plot(consensus(trees[!is.na(largeIsle) & largeIsle == 1], p = 0.5), + edge.color = 2, edge.width = 2, tip.color = 2) +plot(consensus(trees[!is.na(largeIsle) & largeIsle == 2], p = 0.5), + edge.color = 3, edge.width = 2, tip.color = 3) +``` + + ### Validating a mapping Now let's evaluate whether our map of tree space is representative. From bee19a7e319fad15f192360da06098d634172256 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Fri, 28 Jun 2024 21:53:28 +0100 Subject: [PATCH 2/5] Update Islands.Rd --- man/Islands.Rd | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/man/Islands.Rd b/man/Islands.Rd index 56763a30..41c24d1b 100644 --- a/man/Islands.Rd +++ b/man/Islands.Rd @@ -4,7 +4,7 @@ \alias{Islands} \title{Find islands from distance matrix} \usage{ -Islands(D, threshold, dense = TRUE) +Islands(D, threshold, dense = TRUE, smallest = 0) } \arguments{ \item{D}{Square matrix or \code{dist} object containing Euclidean distances @@ -16,6 +16,9 @@ assigned to the same island.} \item{dense}{Logical; if \code{FALSE}, each island will be named according to the index of its lowest-indexed member; if \code{TRUE}, each island will be numbered sequentially from \code{1}, ordered by the index of the lowest-indexed member.} + +\item{smallest}{Integer; Islands comprising no more than \code{smallest} elements +will be assigned to island \code{NA}.} } \value{ \code{Islands()} returns a vector listing the island to which From cae02cd547d3f314bb2e75e776e11b805a0c1446 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Fri, 28 Jun 2024 21:54:30 +0100 Subject: [PATCH 3/5] bib entries --- inst/REFERENCES.bib | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index e5247703..fa6826a1 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -494,8 +494,8 @@ @article{Shannon1948 @article{Silva2021, title = {On Defining and Finding Islands of Trees and Mitigating Large Island Bias}, author = {Silva, Ana Serra and Wilkinson, Mark}, - date = {2021-11-01}, - journaltitle = {Systematic Biology}, + year = {2021}, + journal = {Systematic Biology}, volume = {70}, number = {6}, pages = {1282--1294}, From 16a9486ff0b42dcb37fdbe6fccc75293394aaf9f Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Fri, 28 Jun 2024 21:56:51 +0100 Subject: [PATCH 4/5] Revert --- vignettes/landscapes.Rmd | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vignettes/landscapes.Rmd b/vignettes/landscapes.Rmd index 68bea203..d3d2bd19 100644 --- a/vignettes/landscapes.Rmd +++ b/vignettes/landscapes.Rmd @@ -37,15 +37,13 @@ library("TreeDist") # Generate a set of trees trees <- as.phylo(as.TreeNumber(BalancedTree(16)) + 0:100 - 15, 16) -# Calculate distances between trees -distances <- ClusteringInfoDist(trees) -summary(distances) - -islands <- Islands(distances, threshold = 0.1) - # Create a 2D mapping +distances <- ClusteringInfoDist(trees) mapping <- cmdscale(distances, 2) +# Score trees according to their balance +scores <- TotalCopheneticIndex(trees) + # Normalize scores scoreMax <- TCIContext(trees[[1]])[["maximum"]] scoreMin <- TCIContext(trees[[1]])[["minimum"]] From 33e21964762c07669e107ebc841e863f6aadb485 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Fri, 28 Jun 2024 22:09:01 +0100 Subject: [PATCH 5/5] Output sequence --- src/spr.cpp | 8 ++++---- tests/testthat/test-tree_distance_spr.R | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/spr.cpp b/src/spr.cpp index 0a2ce23d..b5c9727c 100644 --- a/src/spr.cpp +++ b/src/spr.cpp @@ -174,8 +174,8 @@ List keep_and_reroot(const List tree1, const intx n_node = ret_edge1.nrow() / 2; if (!n_node) { List nullTree = List::create(Named("edge") = ret_edge1, - _["tip.label"] = CharacterVector(0), - _["Nnode"] = n_node); + _["Nnode"] = n_node, + _["tip.label"] = CharacterVector(0)); nullTree.attr("class") = "phylo"; nullTree.attr("order") = "preorder"; @@ -228,8 +228,8 @@ List keep_and_reduce(const List tree1, if (edge1.nrow() < 1) { List nullTree = List::create(Named("edge") = NumericMatrix(0, 2), - _["tip.label"] = CharacterVector(0), - _["Nnode"] = 0); + _["Nnode"] = 0, + _["tip.label"] = CharacterVector(0)); nullTree.attr("class") = "phylo"; nullTree.attr("order") = "preorder"; diff --git a/tests/testthat/test-tree_distance_spr.R b/tests/testthat/test-tree_distance_spr.R index 955981f4..6deb477f 100644 --- a/tests/testthat/test-tree_distance_spr.R +++ b/tests/testthat/test-tree_distance_spr.R @@ -13,7 +13,7 @@ test_that("SPR: keep_and_reroot()", { expect_equal(Preorder(reduced[[1]]), Preorder(DropTip(result[[1]], "t9"))) expect_equal(Preorder(reduced[[2]]), Preorder(DropTip(result[[2]], "t9"))) - skip_if_not_installed("TreeTools", "1.11.1.9002") + skip_if_not_installed("TreeTools", "1.11.1.9003") twoZeroes <- list(Preorder(ZeroTaxonTree()), Preorder(ZeroTaxonTree())) expect_equal(keep_and_reroot(SingleTaxonTree(), SingleTaxonTree(), FALSE), twoZeroes)