From c4c735423c7946752ae7b5390eba2c998bfe0a42 Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Fri, 18 Sep 2015 11:17:58 +0200 Subject: [PATCH 01/11] Fixed a typo --- R/distance-methods.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/distance-methods.R b/R/distance-methods.R index 8e2534c9..c373387d 100644 --- a/R/distance-methods.R +++ b/R/distance-methods.R @@ -317,7 +317,7 @@ JSD <- function(physeq, parallel=FALSE){ #' and is encouraged in the instances of large trees, many samples, or both. #' Parallelization has been implemented via the \code{\link{foreach-package}}. #' This means that parallel calls need to be preceded by 2 or more commands -#' that register the parallel ``backend''. This is acheived via your choice of +#' that register the parallel ``backend''. This is achieved via your choice of #' helper packages. One of the simplest seems to be the \emph{doParallel} package. #' #' For more information, see the following links on registering the ``backend'': From 38a3d1597ec709a031fd14556ab50125d6c21fcd Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Fri, 18 Sep 2015 11:43:44 +0200 Subject: [PATCH 02/11] Tidied up the mixture of space- and tab- indentations with formatR --- R/IO-methods.R | 1741 ++++++++++++++-------------- R/allClasses.R | 80 +- R/allData.R | 39 +- R/allPackage.R | 4 +- R/almostAllAccessors.R | 257 +++-- R/as-methods.R | 48 +- R/assignment-methods.R | 203 ++-- R/deprecated_functions.R | 204 +++- R/distance-methods.R | 573 +++++----- R/extend_DESeq2.R | 20 +- R/extend_vegan.R | 279 +++-- R/extract-methods.R | 56 +- R/merge-methods.R | 714 ++++++------ R/multtest-wrapper.R | 170 ++- R/network-methods.R | 183 +-- R/ordination-methods.R | 432 ++++--- R/otuTable-class.R | 96 +- R/phylo-class.R | 25 +- R/phyloseq-class.R | 416 ++++--- R/plot-methods.R | 2087 +++++++++++++++++----------------- R/sampleData-class.R | 88 +- R/show-methods.R | 117 +- R/taxonomyTable-class.R | 81 +- R/transform_filter-methods.R | 881 +++++++------- R/validity-methods.R | 174 ++- 25 files changed, 4490 insertions(+), 4478 deletions(-) diff --git a/R/IO-methods.R b/R/IO-methods.R index 5a34a8f3..63a299ac 100644 --- a/R/IO-methods.R +++ b/R/IO-methods.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Universal import method (wrapper) for phyloseq-package #' #' A user must still understand the additional arguments required for each @@ -15,7 +15,7 @@ #' @param pipelineName (Required). Character string. The name of the #' analysis tool / pipeline / package #' that created the OTU-cluster data or other data that you now want to import. -#' Current options are \code{c("mothur", "pyrotagger", "QIIME", "RDP")}, and +#' Current options are \code{c('mothur', 'pyrotagger', 'QIIME', 'RDP')}, and #' only the first letter is necessary. #' #' @param ... (Required). Additional named arguments providing file paths, and possible @@ -66,33 +66,32 @@ #' @export #' @examples #' ## See documentation of a specific import function -import <- function(pipelineName, ...){ - # Reduce pipelineName to just its first letter, as all are different - pipelineName <- substr(pipelineName, 1, 1) - - # Test that it is in the set - if( !(pipelineName %in% c("B", "b", "M", "m", "P", "p", "Q", "q", "R", "r")) ){ - stop("You need to select among available importer types:\n", - "\"BIOM\", \"mothur\", \"pyrotagger\", \"QIIME\", \"RDP\" \n See ?import for details") - } - - if( pipelineName %in% c("B", "b") ){ - return( import_biom(...) ) - } - if( pipelineName %in% c("M", "m") ){ - return( import_mothur(...) ) - } - if( pipelineName %in% c("P", "p") ){ - return( import_pyrotagger_tab(...) ) - } - if( pipelineName %in% c("Q", "q") ){ - return( import_qiime(...) ) - } - if( pipelineName %in% c("R", "r") ){ - return( import_RDP_cluster(...) ) - } +import <- function(pipelineName, ...) { + # Reduce pipelineName to just its first letter, as all are different + pipelineName <- substr(pipelineName, 1, 1) + + # Test that it is in the set + if (!(pipelineName %in% c("B", "b", "M", "m", "P", "p", "Q", "q", "R", "r"))) { + stop("You need to select among available importer types:\n", "\"BIOM\", \"mothur\", \"pyrotagger\", \"QIIME\", \"RDP\" \n See ?import for details") + } + + if (pipelineName %in% c("B", "b")) { + return(import_biom(...)) + } + if (pipelineName %in% c("M", "m")) { + return(import_mothur(...)) + } + if (pipelineName %in% c("P", "p")) { + return(import_pyrotagger_tab(...)) + } + if (pipelineName %in% c("Q", "q")) { + return(import_qiime(...)) + } + if (pipelineName %in% c("R", "r")) { + return(import_RDP_cluster(...)) + } } -################################################################################ +################################################################################ #' Import function to read the now legacy-format QIIME OTU table. #' #' QIIME produces several files that can be directly imported by @@ -115,7 +114,7 @@ import <- function(pipelineName, ...){ #' phylogenetic tree with a tip for each OTU, which can also be imported #' specified here or imported separately using \code{\link{read_tree}}. #' -#' See \url{"http://www.qiime.org/"} for details on using QIIME. While there are +#' See \url{'http://www.qiime.org/'} for details on using QIIME. While there are #' many complex dependencies, QIIME can be downloaded as a pre-installed #' linux virtual machine that runs ``off the shelf''. #' @@ -194,7 +193,7 @@ import <- function(pipelineName, ...){ #' @param parseFunction (Optional). An optional custom function for parsing the #' character string that contains the taxonomic assignment of each OTU. #' The default parsing function is \code{\link{parse_taxonomy_qiime}}, -#' specialized for splitting the \code{";"}-delimited strings and also +#' specialized for splitting the \code{';'}-delimited strings and also #' attempting to interpret greengenes prefixes, if any, as that is a common #' format of the taxonomy string produced by QIIME. #' @@ -233,76 +232,77 @@ import <- function(pipelineName, ...){ #' @importFrom Biostrings readDNAStringSet #' @export #' @examples -#' otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq") -#' mapfile <- system.file("extdata", "master_map.txt", package="phyloseq") -#' trefile <- system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq") +#' otufile <- system.file('extdata', 'GP_otu_table_rand_short.txt.gz', package='phyloseq') +#' mapfile <- system.file('extdata', 'master_map.txt', package='phyloseq') +#' trefile <- system.file('extdata', 'GP_tree_rand_short.newick.gz', package='phyloseq') #' import_qiime(otufile, mapfile, trefile) -import_qiime <- function(otufilename=NULL, mapfilename=NULL, - treefilename=NULL, refseqfilename=NULL, - refseqFunction=readDNAStringSet, refseqArgs=NULL, - parseFunction=parse_taxonomy_qiime, verbose=TRUE, ...){ - - # initialize the argument-list for phyloseq. Start empty. - argumentlist <- list() - - if( !is.null(mapfilename) ){ - if( verbose ){ - cat("Processing map file...", fill=TRUE) - } - QiimeMap <- import_qiime_sample_data(mapfilename) - argumentlist <- c(argumentlist, list(QiimeMap)) - } - - if( !is.null(otufilename) ){ - if( verbose ){ - cat("Processing otu/tax file...", fill=TRUE) - } - otutax <- import_qiime_otu_tax(otufilename, parseFunction, verbose=verbose) - otutab <- otu_table(otutax$otutab, TRUE) - taxtab <- tax_table(otutax$taxtab) - argumentlist <- c(argumentlist, list(otutab), list(taxtab) ) - } - - if( !is.null(treefilename) ){ - if(verbose){cat("Processing phylogenetic tree...\n", treefilename, "...\n")} - if(inherits(treefilename, "phylo")){ - # If argument is already a tree, don't read, just assign. - tree = treefilename - } else { - # If it is not a tree, assume file and attempt to import. - # NULL is silently returned if tree is not read properly. - tree <- read_tree(treefilename, ...) - } - # Add to argument list or warn - if( is.null(tree) ){ - warning("treefilename failed import. It will not be included.") - } else { - argumentlist <- c(argumentlist, list(tree) ) - } - } - - if( !is.null(refseqfilename) ){ - if( verbose ){ - cat("Processing Reference Sequences...", fill=TRUE) - } - if( inherits(refseqfilename, "XStringSet") ){ - # If argument is already a XStringSet, don't read, just assign. - refseq = refseqfilename - } else { - # call refseqFunction and read refseqfilename, - # either with or without additional args - if( !is.null(refseqArgs) ){ - refseq = do.call("refseqFunction", c(list(refseqfilename), refseqArgs)) - } else { - refseq = refseqFunction(refseqfilename) - } - } - argumentlist <- c(argumentlist, list(refseq) ) - } - - do.call("phyloseq", argumentlist) +import_qiime <- function(otufilename = NULL, mapfilename = NULL, treefilename = NULL, + refseqfilename = NULL, refseqFunction = readDNAStringSet, refseqArgs = NULL, + parseFunction = parse_taxonomy_qiime, verbose = TRUE, ...) { + + # initialize the argument-list for phyloseq. Start empty. + argumentlist <- list() + + if (!is.null(mapfilename)) { + if (verbose) { + cat("Processing map file...", fill = TRUE) + } + QiimeMap <- import_qiime_sample_data(mapfilename) + argumentlist <- c(argumentlist, list(QiimeMap)) + } + + if (!is.null(otufilename)) { + if (verbose) { + cat("Processing otu/tax file...", fill = TRUE) + } + otutax <- import_qiime_otu_tax(otufilename, parseFunction, verbose = verbose) + otutab <- otu_table(otutax$otutab, TRUE) + taxtab <- tax_table(otutax$taxtab) + argumentlist <- c(argumentlist, list(otutab), list(taxtab)) + } + + if (!is.null(treefilename)) { + if (verbose) { + cat("Processing phylogenetic tree...\n", treefilename, "...\n") + } + if (inherits(treefilename, "phylo")) { + # If argument is already a tree, don't read, just assign. + tree = treefilename + } else { + # If it is not a tree, assume file and attempt to import. NULL is silently + # returned if tree is not read properly. + tree <- read_tree(treefilename, ...) + } + # Add to argument list or warn + if (is.null(tree)) { + warning("treefilename failed import. It will not be included.") + } else { + argumentlist <- c(argumentlist, list(tree)) + } + } + + if (!is.null(refseqfilename)) { + if (verbose) { + cat("Processing Reference Sequences...", fill = TRUE) + } + if (inherits(refseqfilename, "XStringSet")) { + # If argument is already a XStringSet, don't read, just assign. + refseq = refseqfilename + } else { + # call refseqFunction and read refseqfilename, either with or without additional + # args + if (!is.null(refseqArgs)) { + refseq = do.call("refseqFunction", c(list(refseqfilename), refseqArgs)) + } else { + refseq = refseqFunction(refseqfilename) + } + } + argumentlist <- c(argumentlist, list(refseq)) + } + + do.call("phyloseq", argumentlist) } -################################################################################ +################################################################################ #' Somewhat flexible tree-import function #' #' This function is a convenience wrapper around the @@ -346,32 +346,33 @@ import_qiime <- function(otufilename=NULL, mapfilename=NULL, #' @importFrom ape read.tree #' @export #' @examples -#' read_tree(system.file("extdata", "esophagus.tree.gz", package="phyloseq")) -#' read_tree(system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq")) -read_tree <- function(treefile, errorIfNULL=FALSE, ...){ - # "phylo" object provided directly - if( class(treefile)[1] %in% c("phylo") ){ - tree <- treefile - } else { - # file path to tree file provided. - # Try Nexus first, protected, then newick if it fails - tree <- NULL - try(tree <- read.nexus(treefile, ...), TRUE) - # Try Newick if nexus didn't work. - if(is.null(tree)) try(tree <- read.tree(treefile, ...), TRUE) - } - # If neither tree-import worked (still NULL), report warning - if( errorIfNULL & is.null(tree) ){ - stop("tree file could not be read.\nPlease retry with valid tree.") - } - if( !is.null(tree) ){ - # Perform any standard phyloseq checks/fixes - # E.g. Replace any NA branch-length values in the tree with zero. - tree = fix_phylo(tree) +#' read_tree(system.file('extdata', 'esophagus.tree.gz', package='phyloseq')) +#' read_tree(system.file('extdata', 'GP_tree_rand_short.newick.gz', package='phyloseq')) +read_tree <- function(treefile, errorIfNULL = FALSE, ...) { + # 'phylo' object provided directly + if (class(treefile)[1] %in% c("phylo")) { + tree <- treefile + } else { + # file path to tree file provided. Try Nexus first, protected, then newick if it + # fails + tree <- NULL + try(tree <- read.nexus(treefile, ...), TRUE) + # Try Newick if nexus didn't work. + if (is.null(tree)) + try(tree <- read.tree(treefile, ...), TRUE) + } + # If neither tree-import worked (still NULL), report warning + if (errorIfNULL & is.null(tree)) { + stop("tree file could not be read.\nPlease retry with valid tree.") + } + if (!is.null(tree)) { + # Perform any standard phyloseq checks/fixes E.g. Replace any NA branch-length + # values in the tree with zero. + tree = fix_phylo(tree) } - return(tree) + return(tree) } -################################################################################ +################################################################################ #' Read GreenGenes tree released in annotated newick format #' #' In principal, this is a standard newick format, that can be imported @@ -410,7 +411,7 @@ read_tree <- function(treefile, errorIfNULL=FALSE, ...){ #' @examples #' # Read the May 2013, 73% similarity official tree, #' # included as extra data in phyloseq. -#' treefile = system.file("extdata", "gg13-5-73.tree.gz", package="phyloseq") +#' treefile = system.file('extdata', 'gg13-5-73.tree.gz', package='phyloseq') #' x = read_tree_greengenes(treefile) #' x #' class(x) @@ -418,29 +419,29 @@ read_tree <- function(treefile, errorIfNULL=FALSE, ...){ #' y #' class(y) #' ## Not run, causes an error: -#' # library("ape") +#' # library('ape') #' # read.tree(treefile) -read_tree_greengenes = function(treefile){ - alines = readLines(treefile, warn=FALSE) +read_tree_greengenes = function(treefile) { + alines = readLines(treefile, warn = FALSE) # Collapse to one line, in case it isn't already. - alines = paste0(alines, collapse="") - # replace all semicolons with something weird - # that isn't already a special newick character. + alines = paste0(alines, collapse = "") + # replace all semicolons with something weird that isn't already a special newick + # character. newdelim = "><-><" clines = gsub("\\;", newdelim, alines) # reinstate the final character as a semicolon clines = gsub(paste0(newdelim, "$"), ";", clines) # Convert your newick string into a phylo-class tree. - tree = read.tree("", text=clines) - # Now that it is phylo-class, reinstate semicolon - # as the delimiter in the node labels + tree = read.tree("", text = clines) + # Now that it is phylo-class, reinstate semicolon as the delimiter in the node + # labels gsub(newdelim, ";", tree$node.label) # Also get rid of those extra quotes gsub("'", "", tree$node.label) # Return the cleaned-up tree return(tree) } -################################################################################ +################################################################################ #' Import now legacy-format QIIME OTU table as a list of two matrices. #' #' Now a legacy-format, older versions of QIIME @@ -472,7 +473,7 @@ read_tree_greengenes = function(treefile){ #' @param parseFunction (Optional). An optional custom function for parsing the #' character string that contains the taxonomic assignment of each OTU. #' The default parsing function is \code{\link{parse_taxonomy_qiime}}, -#' specialized for splitting the \code{";"}-delimited strings and also +#' specialized for splitting the \code{';'}-delimited strings and also #' attempting to interpret greengenes prefixes, if any, as that is a common #' format of the taxonomy string produced by QIIME. #' @@ -513,44 +514,55 @@ read_tree_greengenes = function(treefile){ #' #' @export #' @examples -#' otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq") +#' otufile <- system.file('extdata', 'GP_otu_table_rand_short.txt.gz', package='phyloseq') #' import_qiime_otu_tax(otufile) -import_qiime_otu_tax <- function(file, parseFunction=parse_taxonomy_qiime, - verbose=TRUE, parallel=FALSE){ - if(verbose){cat("Reading file into memory prior to parsing...\n")} +import_qiime_otu_tax <- function(file, parseFunction = parse_taxonomy_qiime, verbose = TRUE, + parallel = FALSE) { + if (verbose) { + cat("Reading file into memory prior to parsing...\n") + } x = readLines(file) - if(verbose){cat("Detecting first header line...\n")} - # Check for commented lines, starting with line 1. - # The deepest commented line (largest n) is assumed to have header information. - skipLines = max(which(substr(x[1:25L], 1, 1)=="#"))-1L - if(verbose){cat("Header is on line", (skipLines + 1L), " \n")} - if(verbose){cat("Converting input file to a table...\n")} - x = fread(input=paste0(x, collapse="\n"), sep="\t", header=TRUE, skip=skipLines) - if(verbose){cat("Defining OTU table... \n")} + if (verbose) { + cat("Detecting first header line...\n") + } + # Check for commented lines, starting with line 1. The deepest commented line + # (largest n) is assumed to have header information. + skipLines = max(which(substr(x[1:25L], 1, 1) == "#")) - 1L + if (verbose) { + cat("Header is on line", (skipLines + 1L), " \n") + } + if (verbose) { + cat("Converting input file to a table...\n") + } + x = fread(input = paste0(x, collapse = "\n"), sep = "\t", header = TRUE, skip = skipLines) + if (verbose) { + cat("Defining OTU table... \n") + } taxstring = x$`Consensus Lineage` # This pops the taxonomy (Consensus Lineage) column, in-place statement - x[, `Consensus Lineage`:=NULL] + x[, `:=`(`Consensus Lineage`, NULL)] # Store the OTU names, you will pop the column OTUnames = x$`#OTU ID` # This pops the OTUID column, in-place statement - x[, `#OTU ID`:=NULL] + x[, `:=`(`#OTU ID`, NULL)] x <- as(x, "matrix") rownames(x) <- OTUnames rm(OTUnames) - if(verbose){cat("Parsing taxonomy table...\n")} - # Split into "jagged" list (vectors of different lengths) - taxlist = llply(taxstring, parseFunction, .parallel=parallel) + if (verbose) { + cat("Parsing taxonomy table...\n") + } + # Split into 'jagged' list (vectors of different lengths) + taxlist = llply(taxstring, parseFunction, .parallel = parallel) # Add OTU names to list element names names(taxlist) <- rownames(x) - # Build the tax table from the jagged list. + # Build the tax table from the jagged list. taxtab <- build_tax_table(taxlist) - # Call garbage collection one more time. Lots of unneeded stuff. - garbage.collection <- gc(FALSE) + # Call garbage collection one more time. Lots of unneeded stuff. + garbage.collection <- gc(FALSE) # Return the named list - return(list(otutab=x, taxtab=taxtab)) + return(list(otutab = x, taxtab = taxtab)) } -################################################################################ -################################################################################ +################################################################################ #' Import just \code{sample_data} file from QIIME pipeline. #' #' QIIME produces several files that can be analyzed in the phyloseq-package, @@ -590,16 +602,15 @@ import_qiime_otu_tax <- function(file, parseFunction=parse_taxonomy_qiime, #' #' @export #' @examples -#' mapfile <- system.file("extdata", "master_map.txt", package = "phyloseq") +#' mapfile <- system.file('extdata', 'master_map.txt', package = 'phyloseq') #' import_qiime_sample_data(mapfile) -import_qiime_sample_data <- function(mapfilename){ - # Process mapfile. Name rows as samples. - QiimeMap <- read.table(file=mapfilename, header=TRUE, - sep="\t", comment.char="") - rownames(QiimeMap) <- as.character(QiimeMap[,1]) - return( sample_data(QiimeMap) ) +import_qiime_sample_data <- function(mapfilename) { + # Process mapfile. Name rows as samples. + QiimeMap <- read.table(file = mapfilename, header = TRUE, sep = "\t", comment.char = "") + rownames(QiimeMap) <- as.character(QiimeMap[, 1]) + return(sample_data(QiimeMap)) } -################################################################################ +################################################################################ #' Read a UniFrac-formatted ENV file. #' #' Convenience wrapper function to read the environment-file, as formatted for @@ -608,7 +619,7 @@ import_qiime_sample_data <- function(mapfilename){ #' each row specifies (in order) the sequence name, source sample, and (optionally) #' the number of times the sequence was observed. #' -#' @usage import_env_file(envfilename, tree=NULL, sep="\t", ...) +#' @usage import_env_file(envfilename, tree=NULL, sep='\t', ...) #' #' @param envfilename (Required). A charater string of the ENV filename (relative or absolute) #' @@ -616,7 +627,7 @@ import_qiime_sample_data <- function(mapfilename){ #' the output otu_table. #' #' @param sep A character string indicating the delimiter used in the file. -#' The default is \code{"\t"}. +#' The default is \code{'\t'}. #' #' @param ... Additional parameters passed on to \code{\link{read.table}}. #' @@ -632,18 +643,19 @@ import_qiime_sample_data <- function(mapfilename){ #' @export #' @examples #' # import_env_file(myEnvFile, myTree) -import_env_file <- function(envfilename, tree=NULL, sep="\t", ...){ - tipSampleTable <- read.table(envfilename, sep=sep, ...) - # Convert to otu_table-class table (trivial table) - physeq <- envHash2otu_table(tipSampleTable) - # If tree is provided, combine it with the OTU Table - if( class(tree) == "phylo" ){ - # Create phyloseq-class with a tree and OTU Table (will perform any needed pruning) - physeq <- phyloseq(physeq, tree) - } - return(physeq) +import_env_file <- function(envfilename, tree = NULL, sep = "\t", ...) { + tipSampleTable <- read.table(envfilename, sep = sep, ...) + # Convert to otu_table-class table (trivial table) + physeq <- envHash2otu_table(tipSampleTable) + # If tree is provided, combine it with the OTU Table + if (class(tree) == "phylo") { + # Create phyloseq-class with a tree and OTU Table (will perform any needed + # pruning) + physeq <- phyloseq(physeq, tree) + } + return(physeq) } -################################################################################ +################################################################################ #' Convert a sequence-sample hash (like ENV file) into an OTU table. #' #' Parses an ENV-file into a sparse matrix of species-by-sample, where @@ -673,34 +685,33 @@ import_env_file <- function(envfilename, tree=NULL, sep="\t", ...){ #' #' @keywords internal #' @examples # -#' ## fakeSeqNameVec <- paste("seq_", 1:8, sep="") -#' ## fakeSamNameVec <- c(rep("A", 4), rep("B", 4)) +#' ## fakeSeqNameVec <- paste('seq_', 1:8, sep='') +#' ## fakeSamNameVec <- c(rep('A', 4), rep('B', 4)) #' ## fakeSeqAbunVec <- sample(1:50, 8, TRUE) #' ## test <- cbind(fakeSeqNameVec, fakeSamNameVec, fakeSeqAbunVec) #' ## testotu <- envHash2otu_table( test ) #' ## test <- cbind(fakeSeqNameVec, fakeSamNameVec) #' ## testotu <- envHash2otu_table( test ) -envHash2otu_table <- function(tipSampleTable){ - if( ncol(tipSampleTable) > 2 ){ - tst <- tipSampleTable - trivialOTU <- matrix(0, nrow=nrow(tst), ncol=length(unique(tst[,2]))) - colnames(trivialOTU) <- unique(tst[,2]) - rownames(trivialOTU) <- tst[,1] - for( i in 1:nrow(tst) ){ - trivialOTU[tst[i, 1], tst[i, 2]] <- as.integer(tst[i, 3]) - } - } else { - trivialOTU <- table(as.data.frame(tipSampleTable)) - trivialOTU <- as(trivialOTU, "matrix") - } - return( otu_table(trivialOTU, taxa_are_rows=TRUE) ) +envHash2otu_table <- function(tipSampleTable) { + if (ncol(tipSampleTable) > 2) { + tst <- tipSampleTable + trivialOTU <- matrix(0, nrow = nrow(tst), ncol = length(unique(tst[, 2]))) + colnames(trivialOTU) <- unique(tst[, 2]) + rownames(trivialOTU) <- tst[, 1] + for (i in 1:nrow(tst)) { + trivialOTU[tst[i, 1], tst[i, 2]] <- as.integer(tst[i, 3]) + } + } else { + trivialOTU <- table(as.data.frame(tipSampleTable)) + trivialOTU <- as(trivialOTU, "matrix") + } + return(otu_table(trivialOTU, taxa_are_rows = TRUE)) } -################################################################################ -################################################################################ +################################################################################ #' Import RDP cluster file and return otu_table (abundance table). #' #' The RDP cluster pipeline (specifically, the output of the complete linkage clustering step) -#' has no formal documentation for the \code{".clust"} +#' has no formal documentation for the \code{'.clust'} #' file or its apparent sequence naming convention. #' #' \code{http://pyro.cme.msu.edu/index.jsp} @@ -709,109 +720,113 @@ envHash2otu_table <- function(tipSampleTable){ #' the names of all sequences contained in input alignment. If the upstream #' barcode and aligment processing steps are also done with the RDP pipeline, #' then the sequence names follow a predictable naming convention wherein each -#' sequence is named by its sample and sequence ID, separated by a \code{"_"} as +#' sequence is named by its sample and sequence ID, separated by a \code{'_'} as #' delimiter: #' -#' \code{"sampleName_sequenceIDnumber"} +#' \code{'sampleName_sequenceIDnumber'} #' #' This import function assumes that the sequence names in the cluster file follow -#' this convention, and that the sample name does not contain any \code{"_"}. It +#' this convention, and that the sample name does not contain any \code{'_'}. It #' is unlikely to work if this is not the case. It is likely to work if you used #' the upstream steps in the RDP pipeline to process your raw (barcoded, untrimmed) #' fasta/fastq data. #' -#' This function first loops through the \code{".clust"} file and collects all -#' of the sample names that appear. It secondly loops through each OTU (\code{"cluster"}; +#' This function first loops through the \code{'.clust'} file and collects all +#' of the sample names that appear. It secondly loops through each OTU (\code{'cluster'}; #' each row of the cluster file) and sums the number of sequences (reads) from #' each sample. The resulting abundance table of OTU-by-sample is trivially #' coerced to an \code{\link{otu_table}} object, and returned. #' #' @usage import_RDP_cluster(RDP_cluster_file) #' -#' @param RDP_cluster_file A character string. The name of the \code{".clust"} +#' @param RDP_cluster_file A character string. The name of the \code{'.clust'} #' file produced by the #' the complete linkage clustering step of the RDP pipeline. #' -#' @return An \code{\link{otu_table}} object parsed from the \code{".clust"} file. +#' @return An \code{\link{otu_table}} object parsed from the \code{'.clust'} file. #' #' @references \url{http://pyro.cme.msu.edu/index.jsp} #' #' @export #' -import_RDP_cluster <- function(RDP_cluster_file){ - - # Read file and pop the header lines - RDP_raw_otu_lines_only <- readLines(RDP_cluster_file)[-(1:5)] - - # internal function: - make_verbose_sample_list <- function(RDP_raw_otu_lines_only){ - # Each OTU line has a 3 element "line header" that indicates the OTUID, the name of the file, - # and the number of sequences that are included in this cluster. - # From each line, remove the header elements - get_sample_names_from_one_line <- function(otuline){ - # first split the line on tabs "\t" - splittabs <- strsplit(otuline, "\t")[[1]] - - # next, remove the header by keeping on the 4th element. - seqIDonly <- splittabs[4] - - # Finally, split on white space - seqIDonly <- strsplit(seqIDonly, "[[:space:]]+")[[1]] - - # For each element in seqIDonly, split on the underscore delimiter - splitseqnames <- strsplit(seqIDonly, "_", fixed=TRUE) - - # Return the sample names from the first element (assumes no "_" in sample names) - return( sapply(splitseqnames, function(i){i[1]}) ) - } - return( sapply(RDP_raw_otu_lines_only, get_sample_names_from_one_line) ) - } - - ## Get the verbose sample name list, and then shrink to the - ## unique sample names in the entire dataset. - ## Need this unique list for initializing the OTU abundance matrix - RDPsamplenameslist <- make_verbose_sample_list(RDP_raw_otu_lines_only) - RDPsamplenames <- unique(unlist(RDPsamplenameslist)) - - # remove NAs - RDPsamplenames <- RDPsamplenames[!is.na(RDPsamplenames)] - - # Initialize otu abundance matrix. - otumat <- matrix(0, nrow=length(RDP_raw_otu_lines_only), ncol=length(RDPsamplenames)) - rownames(otumat) <- paste("OTUID_", 1:length(RDP_raw_otu_lines_only)) - colnames(otumat) <- RDPsamplenames - - # Now re-loop through the cluster file (by OTU) and sum the - # abundance of sequences from each sample - for( i in 1:length(RDP_raw_otu_lines_only) ){ - # i = 1 - - # first split the line on tabs "\t" - splittabs <- strsplit(RDP_raw_otu_lines_only[i], "\t")[[1]] - - # next, remove the header by keeping on the 4th element. - seqIDonly <- splittabs[4] - - # Finally, split on white space - seqIDonly <- strsplit(seqIDonly, "[[:space:]]+")[[1]] - - # For each element in seqIDonly, split on the underscore delimiter - splitseqnames <- strsplit(seqIDonly, "_", fixed=TRUE) - - # make the verbose vector - verbosesamplenamesi <- sapply(splitseqnames, function(i){i[1]}) - - # sum the reads from each sample with tapply - OTUi <- tapply(verbosesamplenamesi, factor(verbosesamplenamesi), length) - - # store results of this OTU in abundance matrix - otumat[i, names(OTUi)] <- OTUi - } - - # Return the abundance table. - return( otu_table(otumat, taxa_are_rows=TRUE) ) +import_RDP_cluster <- function(RDP_cluster_file) { + + # Read file and pop the header lines + RDP_raw_otu_lines_only <- readLines(RDP_cluster_file)[-(1:5)] + + # internal function: + make_verbose_sample_list <- function(RDP_raw_otu_lines_only) { + # Each OTU line has a 3 element 'line header' that indicates the OTUID, the name + # of the file, and the number of sequences that are included in this cluster. + # From each line, remove the header elements + get_sample_names_from_one_line <- function(otuline) { + # first split the line on tabs '\t' + splittabs <- strsplit(otuline, "\t")[[1]] + + # next, remove the header by keeping on the 4th element. + seqIDonly <- splittabs[4] + + # Finally, split on white space + seqIDonly <- strsplit(seqIDonly, "[[:space:]]+")[[1]] + + # For each element in seqIDonly, split on the underscore delimiter + splitseqnames <- strsplit(seqIDonly, "_", fixed = TRUE) + + # Return the sample names from the first element (assumes no '_' in sample names) + return(sapply(splitseqnames, function(i) { + i[1] + })) + } + return(sapply(RDP_raw_otu_lines_only, get_sample_names_from_one_line)) + } + + ## Get the verbose sample name list, and then shrink to the unique sample names in + ## the entire dataset. Need this unique list for initializing the OTU abundance + ## matrix + RDPsamplenameslist <- make_verbose_sample_list(RDP_raw_otu_lines_only) + RDPsamplenames <- unique(unlist(RDPsamplenameslist)) + + # remove NAs + RDPsamplenames <- RDPsamplenames[!is.na(RDPsamplenames)] + + # Initialize otu abundance matrix. + otumat <- matrix(0, nrow = length(RDP_raw_otu_lines_only), ncol = length(RDPsamplenames)) + rownames(otumat) <- paste("OTUID_", 1:length(RDP_raw_otu_lines_only)) + colnames(otumat) <- RDPsamplenames + + # Now re-loop through the cluster file (by OTU) and sum the abundance of + # sequences from each sample + for (i in 1:length(RDP_raw_otu_lines_only)) { + # i = 1 + + # first split the line on tabs '\t' + splittabs <- strsplit(RDP_raw_otu_lines_only[i], "\t")[[1]] + + # next, remove the header by keeping on the 4th element. + seqIDonly <- splittabs[4] + + # Finally, split on white space + seqIDonly <- strsplit(seqIDonly, "[[:space:]]+")[[1]] + + # For each element in seqIDonly, split on the underscore delimiter + splitseqnames <- strsplit(seqIDonly, "_", fixed = TRUE) + + # make the verbose vector + verbosesamplenamesi <- sapply(splitseqnames, function(i) { + i[1] + }) + + # sum the reads from each sample with tapply + OTUi <- tapply(verbosesamplenamesi, factor(verbosesamplenamesi), length) + + # store results of this OTU in abundance matrix + otumat[i, names(OTUi)] <- OTUi + } + + # Return the abundance table. + return(otu_table(otumat, taxa_are_rows = TRUE)) } -################################################################################ +################################################################################ #' Import new RDP OTU-table format #' #' Recently updated tools on RDP Pyro site make it easier to import Pyrosequencing output @@ -819,7 +834,7 @@ import_RDP_cluster <- function(RDP_cluster_file){ #' (generated from RDP Clustering tools) to create a community data matrix file #' for distance cutoff range you are interested in. The resulting output file #' is a tab-delimited file containing the number of sequences for each sample -#' for each OTU. The OTU header naming convention is \code{"OTU_"} followed by the OTU +#' for each OTU. The OTU header naming convention is \code{'OTU_'} followed by the OTU #' number in the cluster file. It pads ``0''s to make the OTU header easy to sort. #' The OTU numbers are not necessarily in order. #' @@ -840,7 +855,7 @@ import_RDP_cluster <- function(RDP_cluster_file){ #' #' @export #' @examples -#' otufile <- system.file("extdata", "rformat_dist_0.03.txt.gz", package="phyloseq") +#' otufile <- system.file('extdata', 'rformat_dist_0.03.txt.gz', package='phyloseq') #' ### the gzipped file is automatically recognized, and read using R-connections #' ex_otu <- import_RDP_otu(otufile) #' class(ex_otu) @@ -848,28 +863,24 @@ import_RDP_cluster <- function(RDP_cluster_file){ #' nsamples(ex_otu) #' sample_sums(ex_otu) #' head(t(ex_otu)) -import_RDP_otu <- function(otufile){ - otumat <- read.table(otufile, TRUE, sep="\t", row.names=1) - return(otu_table(otumat, FALSE)) +import_RDP_otu <- function(otufile) { + otumat <- read.table(otufile, TRUE, sep = "\t", row.names = 1) + return(otu_table(otumat, FALSE)) } -################################################################################ -################################################################################ -################################################################################ -################################################################################ -################################################################################ +################################################################################ #' Imports a tab-delimited version of the pyrotagger output file. #' #' PyroTagger is a web-server that takes raw, barcoded 16S rRNA amplicon sequences -#' and returns an excel spreadsheet (\code{".xls"}) with both abundance and +#' and returns an excel spreadsheet (\code{'.xls'}) with both abundance and #' taxonomy data. It also includes some confidence information related to the #' taxonomic assignment. #' #' PyroTagger is created and maintained by the Joint Genome Institute -#' at \code{"http://pyrotagger.jgi-psf.org/"} +#' at \code{'http://pyrotagger.jgi-psf.org/'} #' -#' The typical output form PyroTagger is a spreadsheet format \code{".xls"}, which poses +#' The typical output form PyroTagger is a spreadsheet format \code{'.xls'}, which poses #' additional import challenges. However, virtually all spreadsheet applications -#' support the \code{".xls"} format, and can further export this file in a +#' support the \code{'.xls'} format, and can further export this file in a #' tab-delimited format. It is recommended that you convert the xls-file without #' any modification (as tempting as it might be once you have loaded it) into a #' tab-delimited text file. Deselect any options to encapsulate fields in quotes, @@ -880,7 +891,7 @@ import_RDP_otu <- function(otufile){ #' #' A highly-functional and free spreadsheet application can be obtained as part #' of the cross-platform \code{OpenOffice} suite. It works for the above -#' required conversion. Go to \code{"http://www.openoffice.org/"}. +#' required conversion. Go to \code{'http://www.openoffice.org/'}. #' #' It is regrettable that this importer does not take the xls-file directly #' as input. However, because of the moving-target nature of spreadsheet @@ -892,7 +903,7 @@ import_RDP_otu <- function(otufile){ #' long-run. #' #' @usage import_pyrotagger_tab(pyrotagger_tab_file, -#' strict_taxonomy=FALSE, keep_potential_chimeras=FALSE) +#'\tstrict_taxonomy=FALSE, keep_potential_chimeras=FALSE) #' #' @param pyrotagger_tab_file (Required). A character string. The name of the tab-delimited #' pyrotagger output table. @@ -915,87 +926,78 @@ import_RDP_otu <- function(otufile){ #' #' @examples #' ## New_otuTaxObject <- import_pyrotagger_tab(pyrotagger_tab_file) -import_pyrotagger_tab <- function(pyrotagger_tab_file, - strict_taxonomy=FALSE, keep_potential_chimeras=FALSE){ - - x <- readLines(pyrotagger_tab_file, warn=FALSE) - # Get the header - pyro_header <- strsplit(x[1], "\t", TRUE)[[1]] - # Pop the first (header) line from the list. - x <- x[-1] - - ######################################## - ### There are "Potential chimeras" - ### listed in the typical output, separated by 2 completely blank lines - ### after the last confidently-good OTU. - ######################################## - chimera_line <- grep("Potential chimeras", x, fixed=TRUE) - if( keep_potential_chimeras ){ - # Pop just the blank lines that delimit the chimeras - # at the bottom of the table - x <- x[-((chimera_line-2):chimera_line)] - } else { - x <- x[-((chimera_line-2):length(x))] - } - - ######################################## - # The tab-split character list, z - ######################################## - z <- strsplit(x, "\t", TRUE) - names(z) <- sapply(z, function(z){z[1]}) - - # The table switches from abundance to taxonomy at the "% Identity" column - taxonomy_table_column_index <- which( pyro_header == "% identity" ) - - ######################################## - # Initialize the two matrices - # (otu_table and taxonomyTable) - ######################################## - ### Initialize abundance matrix, a - a <- matrix(0, nrow=length(x), ncol=(taxonomy_table_column_index-2)) - colnames(a) <- pyro_header[2:(taxonomy_table_column_index-1)] - rownames(a) <- names(z) - - ###### Initialize the raw pyrotagger taxonomy matrix, w - ntax_tablecols <- (max(sapply(z, length)) - taxonomy_table_column_index + 1) - w <- matrix("", nrow=length(x), ncol=ntax_tablecols) - rownames(w) <- names(z) - colnamesw <- pyro_header[-(1:(taxonomy_table_column_index-1))] - colnamesw <- colnamesw[1:which(colnamesw=="Taxonomy")] - colnamesw <- c(colnamesw, paste("col", (which(colnamesw=="Taxonomy")+1):ntax_tablecols, sep="") ) - colnames(w) <- colnamesw - - # Rename the taxonomy columns - biotaxonomy <- c("Domain", "Phylum", "Class", "Order", - "Family", "Genus", "Species", "Strain") - colnames(w)[which(colnames(w)=="Taxonomy"):length(colnames(w))][1:length(biotaxonomy)] <- biotaxonomy - - # Loop through each line and add to appropriate matrix. - for( i in rownames(a) ){ - # i <- rownames(a)[[1]] - # cut out just the abundance part, and convert to integer - y <- as.integer(z[[i]][2:(taxonomy_table_column_index-1)]) - y[is.na(y)] <- 0 - a[i, ] <- y - - # Taxonomy data is jagged - taxi <- z[[i]][-(1:(taxonomy_table_column_index-1))] - w[i, 1:length(taxi)] <- taxi - } - - # Create the component objects - OTU <- otu_table(a, taxa_are_rows=TRUE) - if( strict_taxonomy ){ - TAX <- tax_table[, biotaxonomy] - } else { - TAX <- tax_table(w) - } - - return( phyloseq(OTU, TAX) ) - +import_pyrotagger_tab <- function(pyrotagger_tab_file, strict_taxonomy = FALSE, keep_potential_chimeras = FALSE) { + + x <- readLines(pyrotagger_tab_file, warn = FALSE) + # Get the header + pyro_header <- strsplit(x[1], "\t", TRUE)[[1]] + # Pop the first (header) line from the list. + x <- x[-1] + + ######################################## There are 'Potential chimeras' listed in the typical output, separated by 2 + ######################################## completely blank lines after the last confidently-good OTU. + chimera_line <- grep("Potential chimeras", x, fixed = TRUE) + if (keep_potential_chimeras) { + # Pop just the blank lines that delimit the chimeras at the bottom of the table + x <- x[-((chimera_line - 2):chimera_line)] + } else { + x <- x[-((chimera_line - 2):length(x))] + } + + ######################################## The tab-split character list, z + z <- strsplit(x, "\t", TRUE) + names(z) <- sapply(z, function(z) { + z[1] + }) + + # The table switches from abundance to taxonomy at the '% Identity' column + taxonomy_table_column_index <- which(pyro_header == "% identity") + + ######################################## Initialize the two matrices (otu_table and taxonomyTable) Initialize abundance + ######################################## matrix, a + a <- matrix(0, nrow = length(x), ncol = (taxonomy_table_column_index - 2)) + colnames(a) <- pyro_header[2:(taxonomy_table_column_index - 1)] + rownames(a) <- names(z) + + ###### Initialize the raw pyrotagger taxonomy matrix, w + ntax_tablecols <- (max(sapply(z, length)) - taxonomy_table_column_index + 1) + w <- matrix("", nrow = length(x), ncol = ntax_tablecols) + rownames(w) <- names(z) + colnamesw <- pyro_header[-(1:(taxonomy_table_column_index - 1))] + colnamesw <- colnamesw[1:which(colnamesw == "Taxonomy")] + colnamesw <- c(colnamesw, paste("col", (which(colnamesw == "Taxonomy") + 1):ntax_tablecols, + sep = "")) + colnames(w) <- colnamesw + + # Rename the taxonomy columns + biotaxonomy <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", + "Strain") + colnames(w)[which(colnames(w) == "Taxonomy"):length(colnames(w))][1:length(biotaxonomy)] <- biotaxonomy + + # Loop through each line and add to appropriate matrix. + for (i in rownames(a)) { + # i <- rownames(a)[[1]] cut out just the abundance part, and convert to integer + y <- as.integer(z[[i]][2:(taxonomy_table_column_index - 1)]) + y[is.na(y)] <- 0 + a[i, ] <- y + + # Taxonomy data is jagged + taxi <- z[[i]][-(1:(taxonomy_table_column_index - 1))] + w[i, 1:length(taxi)] <- taxi + } + + # Create the component objects + OTU <- otu_table(a, taxa_are_rows = TRUE) + if (strict_taxonomy) { + TAX <- tax_table[, biotaxonomy] + } else { + TAX <- tax_table(w) + } + + return(phyloseq(OTU, TAX)) + } -################################################################################ -################################################################################ +################################################################################ #' Show cutoff values available in a mothur file. #' #' This is a helper function to report back to the user the different cutoff @@ -1016,17 +1018,17 @@ import_pyrotagger_tab <- function(pyrotagger_tab_file, #' #' @seealso \code{\link{import_mothur}} #' -show_mothur_cutoffs <- function(mothur_list_file){ - unique(scan(mothur_list_file, "character", comment.char="\t", quiet=TRUE)) +show_mothur_cutoffs <- function(mothur_list_file) { + unique(scan(mothur_list_file, "character", comment.char = "\t", quiet = TRUE)) } -################################################################################ +################################################################################ #' Import mothur list file and return as list object in R. #' #' This is a user-available module of a more comprehensive function for importing #' OTU clustering/abundance data using the \emph{mothur} package. The list object #' returned by this function is not immediately useable by other \emph{phyloseq} #' functions, and must be first parsed in conjunction with a separate \emph{mothur} -#' \code{"group"} file. This function is made accessible to \emph{phyloseq} users +#' \code{'group'} file. This function is made accessible to \emph{phyloseq} users #' for troubleshooting and inspection, but the \code{link{import_mothur()}} function #' is suggested if the goal is to import the OTU clustering results from \emph{mothur} #' into \emph{phyloseq}. @@ -1035,7 +1037,7 @@ show_mothur_cutoffs <- function(mothur_list_file){ #' #' @param mothur_list_file The list file name and/or location as produced by \emph{mothur}. #' -#' @param cutoff A character string indicating the cutoff value, (or \code{"unique"}), +#' @param cutoff A character string indicating the cutoff value, (or \code{'unique'}), #' that matches one of the cutoff-values used to produce the OTU clustering #' results contained within the list-file created by \emph{mothur}. The default #' is to take the largest value among the cutoff values contained in the list @@ -1054,59 +1056,61 @@ show_mothur_cutoffs <- function(mothur_list_file){ #' @seealso \code{\link{show_mothur_cutoffs}}, \code{\link{import_mothur}} #' @keywords internal #' -import_mothur_otulist <- function(mothur_list_file, cutoff=NULL){ - # mothur_list_file = system.file("extdata", "esophagus.fn.list.gz", package="phyloseq") - # cutoff = 0.04 +import_mothur_otulist <- function(mothur_list_file, cutoff = NULL) { + # mothur_list_file = system.file('extdata', 'esophagus.fn.list.gz', + # package='phyloseq') cutoff = 0.04 cutoffs = show_mothur_cutoffs(mothur_list_file) cutoff = select_mothur_cutoff(cutoff, cutoffs) - # Read only the line corresponding to that cutoff + # Read only the line corresponding to that cutoff inputline = which(cutoffs == cutoff) - rawlines = scan(mothur_list_file, "character", sep="\t", skip=(inputline-1), nlines=1, na.strings="", quiet=TRUE) + rawlines = scan(mothur_list_file, "character", sep = "\t", skip = (inputline - + 1), nlines = 1, na.strings = "", quiet = TRUE) rawlines = rawlines[!is.na(rawlines)] - # The first two elements are the cutoff and the number of OTUs. skip, and read to first comma for OTUnames - OTUnames = scan(text=rawlines, what="character", comment.char=",", quiet=TRUE)[3:as.integer(rawlines[2])] + # The first two elements are the cutoff and the number of OTUs. skip, and read to + # first comma for OTUnames + OTUnames = scan(text = rawlines, what = "character", comment.char = ",", quiet = TRUE)[3:as.integer(rawlines[2])] # split each element on commas - OTUs <- strsplit(rawlines[3:as.integer(rawlines[2])], ",", fixed=TRUE) - # Name each OTU (currently as the first seq name in each cluster), and return the list + OTUs <- strsplit(rawlines[3:as.integer(rawlines[2])], ",", fixed = TRUE) + # Name each OTU (currently as the first seq name in each cluster), and return the + # list names(OTUs) <- OTUnames # return as-is return(OTUs) } -################################################################################ -# Need to select a cutoff if none was provided by user. -# Take the largest non-"unique" cutoff possible, -# if "unique" is the only cutoff included in the list file, use that. -# Multiple cutoffs are provided in both `.shared` and `.list` files. -# This function consolidates the heuristic for selecting/checking a specified cutoff. +################################################################################ Need to select a cutoff if none was provided by user. Take the largest +################################################################################ non-'unique' cutoff possible, if 'unique' is the only cutoff included in the +################################################################################ list file, use that. Multiple cutoffs are provided in both `.shared` and +################################################################################ `.list` files. This function consolidates the heuristic for selecting/checking +################################################################################ a specified cutoff. #' @keywords internal -select_mothur_cutoff = function(cutoff, cutoffs){ - if( is.null(cutoff) ){ +select_mothur_cutoff = function(cutoff, cutoffs) { + if (is.null(cutoff)) { # cutoff was NULL, need to select one. - if( length(cutoffs) > 1 ){ - # Select the largest value, avoiding the "unique" option. + if (length(cutoffs) > 1) { + # Select the largest value, avoiding the 'unique' option. selectCutoffs <- as(cutoffs[cutoffs != "unique"], "numeric") cutoff <- as.character(max(selectCutoffs)) } else { - # There is only one cutoff value, so use it. - # Don't have to specify a cutoff, in this case + # There is only one cutoff value, so use it. Don't have to specify a cutoff, in + # this case cutoff <- cutoffs } } else { # Provided by user, non-null. Coerce to character for indexing cutoff <- as.character(cutoff) # Check that it is in set of available cutoffs. - if( !cutoff %in% cutoffs ){ + if (!cutoff %in% cutoffs) { stop("The cutoff value you provided is not among those available. Try show_mothur_cutoffs()") } } } -################################################################################ +################################################################################ #' Parse mothur group file into a simple hash table. #' #' The data.frame object #' returned by this function is not immediately useable by other \emph{phyloseq} #' functions, and must be first parsed in conjunction with a separate \emph{mothur} -#' \code{"list"} file. This function is made accessible to \emph{phyloseq} users +#' \code{'list'} file. This function is made accessible to \emph{phyloseq} users #' for troubleshooting and inspection, but the \code{link{import_mothur()}} function #' is suggested if the goal is to import the OTU clustering results from \emph{mothur} #' into \emph{phyloseq}. You will need both a group file and a list file for that end. @@ -1124,10 +1128,11 @@ select_mothur_cutoff = function(cutoff, cutoffs){ #' @seealso \code{\link{import_mothur}} #' @keywords internal #' -import_mothur_groups <- function(mothur_group_file){ - read.table(mothur_group_file, sep="\t", as.is=TRUE, stringsAsFactors=FALSE, colClasses="character", row.names=1) +import_mothur_groups <- function(mothur_group_file) { + read.table(mothur_group_file, sep = "\t", as.is = TRUE, stringsAsFactors = FALSE, + colClasses = "character", row.names = 1) } -################################################################################ +################################################################################ #' Import mothur list and group files and return an otu_table #' #' @usage import_mothur_otu_table(mothur_list_file, mothur_group_file, cutoff=NULL) @@ -1140,7 +1145,7 @@ import_mothur_groups <- function(mothur_group_file){ #' species/taxa abundance table (\code{otu_table}). See #' \code{http://www.mothur.org/wiki/Make.group} #' -#' @param cutoff A character string indicating the cutoff value, (or \code{"unique"}), +#' @param cutoff A character string indicating the cutoff value, (or \code{'unique'}), #' that matches one of the cutoff-values used to produce the OTU clustering #' results contained within the list-file created by \emph{mothur} (and specified #' by the \code{mothur_list_file} argument). @@ -1159,32 +1164,34 @@ import_mothur_groups <- function(mothur_group_file){ #' @keywords internal #' @importFrom plyr ldply #' @importFrom plyr ddply -import_mothur_otu_table <- function(mothur_list_file, mothur_group_file, cutoff=NULL){ - otulist <- import_mothur_otulist(mothur_list_file, cutoff) - mothur_groups <- import_mothur_groups(mothur_group_file) - # Initialize abundance matrix with zeros for sparse assignment +import_mothur_otu_table <- function(mothur_list_file, mothur_group_file, cutoff = NULL) { + otulist <- import_mothur_otulist(mothur_list_file, cutoff) + mothur_groups <- import_mothur_groups(mothur_group_file) + # Initialize abundance matrix with zeros for sparse assignment samplenames = unique(mothur_groups[, 1]) - mothur_otu_table <- matrix(0, nrow=length(otulist), ncol=length(samplenames)) - colnames(mothur_otu_table) <- samplenames - rownames(mothur_otu_table) <- names(otulist) - - # Write a sparse versino of the abundance table - df = ldply(otulist, function(x){data.frame(read=x, stringsAsFactors=FALSE)}) - colnames(df)[1] <- "OTU" - df = data.frame(df, sample=mothur_groups[df[, "read"], 1], stringsAsFactors=FALSE) - adf = ddply(df, c("OTU", "sample"), function(x){ - # x = subset(df, OTU=="59_3_17" & sample=="C") - data.frame(x[1, c("OTU", "sample"), drop=FALSE], abundance=nrow(x)) - }) - - # Vectorized for speed using matrix indexing. - # See help("Extract") for details about matrix indexing. Diff than 2-vec index. - mothur_otu_table[as(adf[, c("OTU", "sample")], "matrix")] <- adf[, "abundance"] - - # Finally, return the otu_table as a phyloseq otu_table object. - return(otu_table(mothur_otu_table, taxa_are_rows=TRUE)) + mothur_otu_table <- matrix(0, nrow = length(otulist), ncol = length(samplenames)) + colnames(mothur_otu_table) <- samplenames + rownames(mothur_otu_table) <- names(otulist) + + # Write a sparse versino of the abundance table + df = ldply(otulist, function(x) { + data.frame(read = x, stringsAsFactors = FALSE) + }) + colnames(df)[1] <- "OTU" + df = data.frame(df, sample = mothur_groups[df[, "read"], 1], stringsAsFactors = FALSE) + adf = ddply(df, c("OTU", "sample"), function(x) { + # x = subset(df, OTU=='59_3_17' & sample=='C') + data.frame(x[1, c("OTU", "sample"), drop = FALSE], abundance = nrow(x)) + }) + + # Vectorized for speed using matrix indexing. See help('Extract') for details + # about matrix indexing. Diff than 2-vec index. + mothur_otu_table[as(adf[, c("OTU", "sample")], "matrix")] <- adf[, "abundance"] + + # Finally, return the otu_table as a phyloseq otu_table object. + return(otu_table(mothur_otu_table, taxa_are_rows = TRUE)) } -################################################################################ +################################################################################ #' Import mothur shared file and return an otu_table #' #' @param mothur_shared_file (Required). A @@ -1195,18 +1202,19 @@ import_mothur_otu_table <- function(mothur_list_file, mothur_group_file, cutoff= #' #' @seealso \code{\link{import_mothur}} #' @keywords internal -import_mothur_shared = function(mothur_shared_file, cutoff=NULL){ - #mothur_shared_file = "~/github/phyloseq/inst/extdata/esophagus.fn.shared.gz" +import_mothur_shared = function(mothur_shared_file, cutoff = NULL) { + # mothur_shared_file = '~/github/phyloseq/inst/extdata/esophagus.fn.shared.gz' # Check that cutoff is in cutoffs, or select a cutoff if none given. cutoffs = show_mothur_cutoffs(mothur_shared_file) cutoffs = cutoffs[!cutoffs %in% "label"] cutoff = select_mothur_cutoff(cutoff, cutoffs) x = readLines(mothur_shared_file) - rawtab = read.table(text=x[grep(paste0("^", cutoff), x)], header=FALSE, row.names=2, stringsAsFactors=FALSE)[, -(1:2)] - colnames(rawtab) <- strsplit(x[1], "\t")[[1]][4:(ncol(rawtab)+3)] - return(otu_table(t(as.matrix(rawtab)), taxa_are_rows=TRUE)) + rawtab = read.table(text = x[grep(paste0("^", cutoff), x)], header = FALSE, row.names = 2, + stringsAsFactors = FALSE)[, -(1:2)] + colnames(rawtab) <- strsplit(x[1], "\t")[[1]][4:(ncol(rawtab) + 3)] + return(otu_table(t(as.matrix(rawtab)), taxa_are_rows = TRUE)) } -################################################################################ +################################################################################ #' Import mothur constaxonomy file and return a taxonomyTable #' #' @param mothur_constaxonomy_file (Required). A @@ -1229,19 +1237,20 @@ import_mothur_shared = function(mothur_shared_file, cutoff=NULL){ #' \code{\link{phyloseq}} #' #' @keywords internal -import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction=parse_taxonomy_default){ +import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction = parse_taxonomy_default) { read.table(mothur_constaxonomy_file) - rawtab = read.table(mothur_constaxonomy_file, header=TRUE, row.names=1, stringsAsFactors=FALSE)[, "Taxonomy", drop=FALSE] - if( identical(parseFunction, parse_taxonomy_default) ){ - # Proceed with default parsing stuff. - # Remove the confidence strings inside the parentheses, if present + rawtab = read.table(mothur_constaxonomy_file, header = TRUE, row.names = 1, stringsAsFactors = FALSE)[, + "Taxonomy", drop = FALSE] + if (identical(parseFunction, parse_taxonomy_default)) { + # Proceed with default parsing stuff. Remove the confidence strings inside the + # parentheses, if present rawtab[, "Taxonomy"] = gsub("\\([[:digit:]]+\\)", "", rawtab[, "Taxonomy"]) # Remove the quotation marks, if present rawtab[, "Taxonomy"] = gsub("\"", "", rawtab[, "Taxonomy"]) # Remove trailing semicolon rawtab[, "Taxonomy"] = gsub(";$", "", rawtab[, "Taxonomy"]) # Split on semicolon - taxlist = strsplit(rawtab[, "Taxonomy"], ";", fixed=TRUE) + taxlist = strsplit(rawtab[, "Taxonomy"], ";", fixed = TRUE) taxlist = lapply(taxlist, parseFunction) } else { taxlist = lapply(rawtab[, "Taxonomy"], parseFunction) @@ -1249,7 +1258,7 @@ import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction=pa names(taxlist) <- rownames(rawtab) return(build_tax_table(taxlist)) } -################################################################################ +################################################################################ #' General function for importing mothur data files into phyloseq. #' #' Technically all parameters are optional, @@ -1277,9 +1286,9 @@ import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction=pa #' @param mothur_tree_file (Optional). #' A tree file, presumably produced by \emph{mothur}, #' and readable by \code{\link{read_tree}}. -#' The file probably has extension \code{".tree"}. +#' The file probably has extension \code{'.tree'}. #' -#' @param cutoff (Optional). A character string indicating the cutoff value, (or \code{"unique"}), +#' @param cutoff (Optional). A character string indicating the cutoff value, (or \code{'unique'}), #' that matches one of the cutoff-values used to produce the OTU clustering #' results contained within the list-file created by \emph{mothur} (and specified #' by the \code{mothur_list_file} argument). The default @@ -1341,54 +1350,53 @@ import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction=pa #' @examples #' # # The following example assumes you have downloaded the esophagus example #' # # dataset from the mothur wiki: -#' # # "http://www.mothur.org/wiki/Esophageal_community_analysis" -#' # # "http://www.mothur.org/w/images/5/55/Esophagus.zip" +#' # # 'http://www.mothur.org/wiki/Esophageal_community_analysis' +#' # # 'http://www.mothur.org/w/images/5/55/Esophagus.zip' #' # # The path on your machine may (probably will) vary -#' # mothur_list_file <- "~/Downloads/mothur/Esophagus/esophagus.an.list" -#' # mothur_group_file <- "~/Downloads/mothur/Esophagus/esophagus.good.groups" -#' # mothur_tree_file <- "~/Downloads/mothur/Esophagus/esophagus.tree" +#' # mothur_list_file <- '~/Downloads/mothur/Esophagus/esophagus.an.list' +#' # mothur_group_file <- '~/Downloads/mothur/Esophagus/esophagus.good.groups' +#' # mothur_tree_file <- '~/Downloads/mothur/Esophagus/esophagus.tree' #' # # # Actual examples follow: #' # show_mothur_cutoffs(mothur_list_file) #' # test1 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file) -#' # test2 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file, cutoff="0.02") +#' # test2 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file, cutoff='0.02') #' # # Returns just a tree #' # import_mothur(mothur_list_file, mothur_tree_file=mothur_tree_file) #' # # Returns just an otu_table #' # import_mothur(mothur_list_file, mothur_group_file=mothur_group_file) #' # # Returns an error #' # import_mothur(mothur_list_file) -#' # # Should return an "OMG, you must provide the list file" error +#' # # Should return an 'OMG, you must provide the list file' error #' # import_mothur() -import_mothur <- function(mothur_list_file=NULL, mothur_group_file=NULL, - mothur_tree_file=NULL, cutoff=NULL, - mothur_shared_file=NULL, mothur_constaxonomy_file=NULL, parseFunction=parse_taxonomy_default){ - +import_mothur <- function(mothur_list_file = NULL, mothur_group_file = NULL, mothur_tree_file = NULL, + cutoff = NULL, mothur_shared_file = NULL, mothur_constaxonomy_file = NULL, parseFunction = parse_taxonomy_default) { + pslist = vector("list") - if( !is.null(mothur_group_file) & !is.null(mothur_list_file) ){ - # If list & group files provided, you can make an OTU table. - groupOTU = import_mothur_otu_table(mothur_list_file, mothur_group_file, cutoff) - pslist = c(pslist, list(groupOTU)) - } - - if( !is.null(mothur_tree_file) ){ - tree <- read_tree(mothur_tree_file) - pslist = c(pslist, list(tree)) - } + if (!is.null(mothur_group_file) & !is.null(mothur_list_file)) { + # If list & group files provided, you can make an OTU table. + groupOTU = import_mothur_otu_table(mothur_list_file, mothur_group_file, cutoff) + pslist = c(pslist, list(groupOTU)) + } + + if (!is.null(mothur_tree_file)) { + tree <- read_tree(mothur_tree_file) + pslist = c(pslist, list(tree)) + } - if( !is.null(mothur_shared_file) ){ + if (!is.null(mothur_shared_file)) { OTUshared <- import_mothur_shared(mothur_shared_file) pslist = c(pslist, list(OTUshared)) } - if( !is.null(mothur_constaxonomy_file) ){ + if (!is.null(mothur_constaxonomy_file)) { tax <- import_mothur_constaxonomy(mothur_constaxonomy_file, parseFunction) pslist = c(pslist, list(tax)) - } + } return(do.call("phyloseq", pslist)) } -################################################################################ +################################################################################ #' Import mothur-formatted distance file #' #' The mothur application will produce a file containing the pairwise distances @@ -1408,37 +1416,43 @@ import_mothur <- function(mothur_list_file=NULL, mothur_group_file=NULL, #' #' @examples #' # # Take a look at the dataset shown here as an example: -#' # # "http://www.mothur.org/wiki/Esophageal_community_analysis" -#' # # find the file ending with extension ".dist", download to your system +#' # # 'http://www.mothur.org/wiki/Esophageal_community_analysis' +#' # # find the file ending with extension '.dist', download to your system #' # # The location of your file may vary -#' # mothur_dist_file <- "~/Downloads/mothur/Esophagus/esophagus.dist" +#' # mothur_dist_file <- '~/Downloads/mothur/Esophagus/esophagus.dist' #' # myNewDistObject <- import_mothur_dist(mothur_dist_file) -import_mothur_dist <- function(mothur_dist_file){ - # Read the raw distance matrix file produced by mothur: - raw_dist_lines <- readLines(mothur_dist_file) - - # split each line on white space, and begin modifying into dist-matrix format - dist_char <- strsplit(raw_dist_lines, "[[:space:]]+") - dist_char <- dist_char[-1] - # add name to each list element - names(dist_char) <- sapply(dist_char, function(i){i[1]}) - # pop out the names from each vector - dist_char <- sapply(dist_char, function(i){i[-1]}) - # convert to numeric vectors - dist_char <- sapply(dist_char, function(i){as(i, "numeric")}) - - # Initialize and fill the matrix - distm <- matrix(0, nrow=length(dist_char), ncol=length(dist_char)) - rownames(distm) <- names(dist_char); colnames(distm) <- names(dist_char) - for( i in names(dist_char)[-1] ){ - distm[i, 1:length(dist_char[[i]])] <- dist_char[[i]] - } - diag(distm) <- 1 - distd <- as.dist(distm) - return(distd) +import_mothur_dist <- function(mothur_dist_file) { + # Read the raw distance matrix file produced by mothur: + raw_dist_lines <- readLines(mothur_dist_file) + + # split each line on white space, and begin modifying into dist-matrix format + dist_char <- strsplit(raw_dist_lines, "[[:space:]]+") + dist_char <- dist_char[-1] + # add name to each list element + names(dist_char) <- sapply(dist_char, function(i) { + i[1] + }) + # pop out the names from each vector + dist_char <- sapply(dist_char, function(i) { + i[-1] + }) + # convert to numeric vectors + dist_char <- sapply(dist_char, function(i) { + as(i, "numeric") + }) + + # Initialize and fill the matrix + distm <- matrix(0, nrow = length(dist_char), ncol = length(dist_char)) + rownames(distm) <- names(dist_char) + colnames(distm) <- names(dist_char) + for (i in names(dist_char)[-1]) { + distm[i, 1:length(dist_char[[i]])] <- dist_char[[i]] + } + diag(distm) <- 1 + distd <- as.dist(distm) + return(distd) } -################################################################################ -################################################################################ +################################################################################ #' Export a distance object as \code{.names} and \code{.dist} files for mothur #' #' The purpose of this function is to allow a user to easily export a distance object @@ -1448,7 +1462,7 @@ import_mothur_dist <- function(mothur_dist_file){ #' #' @usage export_mothur_dist(x, out=NULL, makeTrivialNamesFile=NULL) #' -#' @param x (Required). A \code{"dist"} object, or a symmetric matrix. +#' @param x (Required). A \code{'dist'} object, or a symmetric matrix. #' #' @param out (Optional). The desired output filename for the \code{.dist} file, OR #' left \code{NULL}, the default, in which case the mothur-formated distance table @@ -1471,47 +1485,54 @@ import_mothur_dist <- function(mothur_dist_file){ #' data(esophagus) #' myDistObject <- as.dist(ape::cophenetic.phylo(phy_tree(esophagus))) #' export_mothur_dist(myDistObject) -export_mothur_dist <- function(x, out=NULL, makeTrivialNamesFile=NULL){ - if( class(x)== "matrix" ){ x <- as.dist(x) } - if( class(x)!= "dist" ){ stop("x must be a dist object, or symm matrix") } - - # While x is a dist-object, get the length of unique pairs - # to initialize the dist table. - distdf <- matrix("", nrow=length(x), ncol=3) - - # Now convert x to matrix for looping, indexing. - x <- as(x, "matrix") - colnames(distdf) <- c("i", "j", "d") - # distdf row counter - z <- 1 - - # The big loop. - for( i in 2:nrow(x) ){ # i <- 2 - thisvec <- x[i, 1:(i-1)] - for( j in 1:length(thisvec) ){ # j <- 1 - distdf[z, "i"] <- rownames(x)[i] - distdf[z, "j"] <- colnames(x)[j] - distdf[z, "d"] <- thisvec[j] - z <- z + 1 - } - } - - # mothur requires a .names file, in case you removed identical sequences - # from within mothur and need to keep track and add them back. - if( !is.null(makeTrivialNamesFile) ){ - namestab <- matrix(rownames(x), nrow=length(rownames(x)), ncol=2) - write.table(namestab, file=makeTrivialNamesFile, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) - } - - # If is.null(out)==TRUE, then return two-column table. - # If it's a character, write.table-it - if( is.null(out) ){ - return(distdf) - } else { - write.table(distdf, file=out, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) - } +export_mothur_dist <- function(x, out = NULL, makeTrivialNamesFile = NULL) { + if (class(x) == "matrix") { + x <- as.dist(x) + } + if (class(x) != "dist") { + stop("x must be a dist object, or symm matrix") + } + + # While x is a dist-object, get the length of unique pairs to initialize the dist + # table. + distdf <- matrix("", nrow = length(x), ncol = 3) + + # Now convert x to matrix for looping, indexing. + x <- as(x, "matrix") + colnames(distdf) <- c("i", "j", "d") + # distdf row counter + z <- 1 + + # The big loop. i <- 2 + for (i in 2:nrow(x)) { + thisvec <- x[i, 1:(i - 1)] + for (j in 1:length(thisvec)) { + # j <- 1 + distdf[z, "i"] <- rownames(x)[i] + distdf[z, "j"] <- colnames(x)[j] + distdf[z, "d"] <- thisvec[j] + z <- z + 1 + } + } + + # mothur requires a .names file, in case you removed identical sequences from + # within mothur and need to keep track and add them back. + if (!is.null(makeTrivialNamesFile)) { + namestab <- matrix(rownames(x), nrow = length(rownames(x)), ncol = 2) + write.table(namestab, file = makeTrivialNamesFile, quote = FALSE, sep = "\t", + row.names = FALSE, col.names = FALSE) + } + + # If is.null(out)==TRUE, then return two-column table. If it's a character, + # write.table-it + if (is.null(out)) { + return(distdf) + } else { + write.table(distdf, file = out, quote = FALSE, sep = "\t", row.names = FALSE, + col.names = FALSE) + } } -################################################################################ +################################################################################ #' Export environment (ENV) file for UniFrac Server. #' #' Creates the environment table that is needed for the original UniFrac @@ -1525,7 +1546,7 @@ export_mothur_dist <- function(x, out=NULL, makeTrivialNamesFile=NULL){ #' #' @param file (Optional). The file path for export. If not-provided, the #' expectation is that you will want to set \code{return} to \code{TRUE}, -#' and manipulate the ENV table on your own. Default is \code{""}, skipping +#' and manipulate the ENV table on your own. Default is \code{''}, skipping #' the ENV file from being written to a file. #' #' @param writeTree (Optional). Write the phylogenetic tree as well as the @@ -1540,59 +1561,54 @@ export_mothur_dist <- function(x, out=NULL, makeTrivialNamesFile=NULL){ #' @examples #' # # Load example data #' # data(esophagus) -#' # export_env_file(esophagus, "~/Desktop/esophagus.txt") -export_env_file <- function(physeq, file="", writeTree=TRUE, return=FALSE){ - # data(esophagus) - # physeq <- esophagus - - # Create otu_table matrix and force orientation - OTU <- as(otu_table(physeq), "matrix") - if( !taxa_are_rows(physeq) ){ OTU <- t(OTU) } - - # initialize sequence/sample names - seqs <- taxa_names(physeq) - samples <- sample_names(physeq) - - # initialize output table as matrix - ENV <- matrix("", nrow=sum(OTU >= 1), ncol=3) - - # i counts the row of the output , ENV - i=1 - while( i < nrow(ENV) ){ - for( j in seqs){ - for( k in which(OTU[j, ]>0) ){ - ENV[i, 1] <- j - ENV[i, 2] <- samples[k] - ENV[i, 3] <- OTU[j, k] - i <- i + 1 - } - } - } - # If a file path is provided, write the table to that file - if(file != ""){ - write.table(ENV, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) - } - - # If needed, also write the associated tree-file. - if( writeTree ){ - fileTree <- paste(file, ".nex", sep="") - write.nexus(phy_tree(physeq), file=fileTree, original.data=FALSE) - } - - # If return argument is TRUE, return the environment table - if(return){ return(ENV) } +#' # export_env_file(esophagus, '~/Desktop/esophagus.txt') +export_env_file <- function(physeq, file = "", writeTree = TRUE, return = FALSE) { + # data(esophagus) physeq <- esophagus + + # Create otu_table matrix and force orientation + OTU <- as(otu_table(physeq), "matrix") + if (!taxa_are_rows(physeq)) { + OTU <- t(OTU) + } + + # initialize sequence/sample names + seqs <- taxa_names(physeq) + samples <- sample_names(physeq) + + # initialize output table as matrix + ENV <- matrix("", nrow = sum(OTU >= 1), ncol = 3) + + # i counts the row of the output , ENV + i = 1 + while (i < nrow(ENV)) { + for (j in seqs) { + for (k in which(OTU[j, ] > 0)) { + ENV[i, 1] <- j + ENV[i, 2] <- samples[k] + ENV[i, 3] <- OTU[j, k] + i <- i + 1 + } + } + } + # If a file path is provided, write the table to that file + if (file != "") { + write.table(ENV, file = file, quote = FALSE, sep = "\t", row.names = FALSE, + col.names = FALSE) + } + + # If needed, also write the associated tree-file. + if (writeTree) { + fileTree <- paste(file, ".nex", sep = "") + write.nexus(phy_tree(physeq), file = fileTree, original.data = FALSE) + } + + # If return argument is TRUE, return the environment table + if (return) { + return(ENV) + } } -################################################################################ -# UniFrac ENV files have the form: -# -# SEQ1 ENV1 1 -# SEQ1 ENV2 2 -# SEQ2 ENV1 15 -# SEQ3 ENV1 2 -# SEQ4 ENV2 8 -# SEQ5 ENV1 4 -# http://128.138.212.43/unifrac/help.psp#env_file -################################################################################ +################################################################################ UniFrac ENV files have the form: SEQ1 ENV1 1 SEQ1 ENV2 2 SEQ2 ENV1 15 SEQ3 ENV1 +################################################################################ 2 SEQ4 ENV2 8 SEQ5 ENV1 4 http://128.138.212.43/unifrac/help.psp#env_file #' Import phyloseq data from biom-format file #' #' New versions of QIIME produce a more-comprehensive and formally-defined @@ -1752,104 +1768,95 @@ export_env_file <- function(physeq, file="", writeTree=TRUE, return=FALSE){ #' @export #' @examples #' # An included example of a rich dense biom file -#' rich_dense_biom <- system.file("extdata", "rich_dense_otu_table.biom", package="phyloseq") +#' rich_dense_biom <- system.file('extdata', 'rich_dense_otu_table.biom', package='phyloseq') #' import_biom(rich_dense_biom, parseFunction=parse_taxonomy_greengenes) #' # An included example of a sparse dense biom file -#' rich_sparse_biom <- system.file("extdata", "rich_sparse_otu_table.biom", package="phyloseq") +#' rich_sparse_biom <- system.file('extdata', 'rich_sparse_otu_table.biom', package='phyloseq') #' import_biom(rich_sparse_biom, parseFunction=parse_taxonomy_greengenes) #' # # # Example code for importing large file with parallel backend -#' # library("doParallel") +#' # library('doParallel') #' # registerDoParallel(cores=6) -#' # import_biom("my/file/path/file.biom", parseFunction=parse_taxonomy_greengenes, parallel=TRUE) -import_biom <- function(BIOMfilename, - treefilename=NULL, refseqfilename=NULL, refseqFunction=readDNAStringSet, refseqArgs=NULL, - parseFunction=parse_taxonomy_default, parallel=FALSE, version=1.0, ...){ - - # initialize the argument-list for phyloseq. Start empty. - argumentlist <- list() - - # Read the data - x = read_biom(biom_file=BIOMfilename) - - ######################################## - # OTU table: - ######################################## - otutab = otu_table(as(biom_data(x), "matrix"), taxa_are_rows=TRUE) - argumentlist <- c(argumentlist, list(otutab)) - - ######################################## - # Taxonomy Table - ######################################## - # Need to check if taxonomy information is empty (minimal BIOM file) - if( all( sapply(sapply(x$rows, function(i){i$metadata}), is.null) ) ){ - taxtab <- NULL +#' # import_biom('my/file/path/file.biom', parseFunction=parse_taxonomy_greengenes, parallel=TRUE) +import_biom <- function(BIOMfilename, treefilename = NULL, refseqfilename = NULL, + refseqFunction = readDNAStringSet, refseqArgs = NULL, parseFunction = parse_taxonomy_default, + parallel = FALSE, version = 1, ...) { + + # initialize the argument-list for phyloseq. Start empty. + argumentlist <- list() + + # Read the data + x = read_biom(biom_file = BIOMfilename) + + ######################################## OTU table: + otutab = otu_table(as(biom_data(x), "matrix"), taxa_are_rows = TRUE) + argumentlist <- c(argumentlist, list(otutab)) + + ######################################## Taxonomy Table Need to check if taxonomy information is empty (minimal BIOM + ######################################## file) + if (all(sapply(sapply(x$rows, function(i) { + i$metadata + }), is.null))) { + taxtab <- NULL } else { # parse once each character vector, save as a list - taxlist = lapply(x$rows, function(i){ + taxlist = lapply(x$rows, function(i) { parseFunction(i$metadata$taxonomy) }) - names(taxlist) = sapply(x$rows, function(i){i$id}) + names(taxlist) = sapply(x$rows, function(i) { + i$id + }) taxtab = build_tax_table(taxlist) - } - argumentlist <- c(argumentlist, list(taxtab)) - - ######################################## - # Sample Data ("columns" in QIIME/BIOM) - ######################################## - # If there is no metadata (all NULL), then set sam_data <- NULL - if( is.null(sample_metadata(x)) ){ - samdata <- NULL - } else { - samdata = sample_data(sample_metadata(x)) - } - argumentlist <- c(argumentlist, list(samdata)) - - ######################################## - # Tree data - ######################################## - if( !is.null(treefilename) ){ - if( inherits(treefilename, "phylo") ){ - # If argument is already a tree, don't read, just assign. - tree = treefilename - } else { - # NULL is silently returned if tree is not read properly. - tree <- read_tree(treefilename, ...) - } - # Add to argument list or warn - if( is.null(tree) ){ - warning("treefilename failed import. It not included.") - } else { - argumentlist <- c(argumentlist, list(tree) ) - } - } - - ######################################## - # Reference Sequence data - ######################################## - if( !is.null(refseqfilename) ){ - if( inherits(refseqfilename, "XStringSet") ){ - # If argument is already a XStringSet, don't read, just assign. - refseq = refseqfilename - } else { - # call refseqFunction and read refseqfilename, either with or without additional args - if( !is.null(refseqArgs) ){ - refseq = do.call("refseqFunction", c(list(refseqfilename), refseqArgs)) - } else { - refseq = refseqFunction(refseqfilename) - } - } - argumentlist <- c(argumentlist, list(refseq) ) - } - - ######################################## - # Put together into a phyloseq object - ######################################## - return( do.call("phyloseq", argumentlist) ) - + } + argumentlist <- c(argumentlist, list(taxtab)) + + ######################################## Sample Data ('columns' in QIIME/BIOM) If there is no metadata (all NULL), then + ######################################## set sam_data <- NULL + if (is.null(sample_metadata(x))) { + samdata <- NULL + } else { + samdata = sample_data(sample_metadata(x)) + } + argumentlist <- c(argumentlist, list(samdata)) + + ######################################## Tree data + if (!is.null(treefilename)) { + if (inherits(treefilename, "phylo")) { + # If argument is already a tree, don't read, just assign. + tree = treefilename + } else { + # NULL is silently returned if tree is not read properly. + tree <- read_tree(treefilename, ...) + } + # Add to argument list or warn + if (is.null(tree)) { + warning("treefilename failed import. It not included.") + } else { + argumentlist <- c(argumentlist, list(tree)) + } + } + + ######################################## Reference Sequence data + if (!is.null(refseqfilename)) { + if (inherits(refseqfilename, "XStringSet")) { + # If argument is already a XStringSet, don't read, just assign. + refseq = refseqfilename + } else { + # call refseqFunction and read refseqfilename, either with or without additional + # args + if (!is.null(refseqArgs)) { + refseq = do.call("refseqFunction", c(list(refseqfilename), refseqArgs)) + } else { + refseq = refseqFunction(refseqfilename) + } + } + argumentlist <- c(argumentlist, list(refseq)) + } + + ######################################## Put together into a phyloseq object + return(do.call("phyloseq", argumentlist)) + } -################################################################################ -# Need to export these parsing functions as examples... -################################################################################ +################################################################################ Need to export these parsing functions as examples... #' Parse elements of a taxonomy vector #' #' These are provided as both example and default functions for @@ -1874,7 +1881,7 @@ import_biom <- function(BIOMfilename, #' \code{parse_taxonomy_greengenes} function clips the first 3 characters that #' identify the rank, and uses these to name the corresponding element according #' to the appropriate taxonomic rank name used by greengenes -#' (e.g. \code{"p__"} at the beginning of an element means that element is +#' (e.g. \code{'p__'} at the beginning of an element means that element is #' the name of the phylum to which this OTU belongs). #' Most importantly, the expectations for these functions described above #' make them compatible to use during data import, @@ -1891,8 +1898,8 @@ import_biom <- function(BIOMfilename, #' #' @return A character vector in which each element is a different #' taxonomic rank of the same OTU, and each element name is the name of -#' the rank level. For example, an element might be \code{"Firmicutes"} -#' and named \code{"phylum"}. +#' the rank level. For example, an element might be \code{'Firmicutes'} +#' and named \code{'phylum'}. #' These parsed, named versions of the taxonomic vector should #' reflect embedded information, naming conventions, #' desired length limits, etc; or in the case of \code{\link{parse_taxonomy_default}}, @@ -1906,60 +1913,58 @@ import_biom <- function(BIOMfilename, #' \code{\link{import_qiime}} #' #' @examples -#' taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae") +#' taxvec1 = c('Root', 'k__Bacteria', 'p__Firmicutes', 'c__Bacilli', 'o__Bacillales', 'f__Staphylococcaceae') #' parse_taxonomy_default(taxvec1) #' parse_taxonomy_greengenes(taxvec1) -#' taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae") +#' taxvec2 = c('Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae') #' parse_taxonomy_qiime(taxvec2) -parse_taxonomy_default = function(char.vec){ - # Remove any leading empty space - char.vec = gsub("^[[:space:]]{1,}", "", char.vec) - # Remove any trailing space - char.vec = gsub("[[:space:]]{1,}$", "", char.vec) - if( length(char.vec) > 0 ){ - # Add dummy element (rank) name - names(char.vec) = paste("Rank", 1:length(char.vec), sep="") - } else { - warning("Empty taxonomy vector encountered.") - } - return(char.vec) +parse_taxonomy_default = function(char.vec) { + # Remove any leading empty space + char.vec = gsub("^[[:space:]]{1,}", "", char.vec) + # Remove any trailing space + char.vec = gsub("[[:space:]]{1,}$", "", char.vec) + if (length(char.vec) > 0) { + # Add dummy element (rank) name + names(char.vec) = paste("Rank", 1:length(char.vec), sep = "") + } else { + warning("Empty taxonomy vector encountered.") + } + return(char.vec) } #' @rdname parseTaxonomy-functions #' @aliases parse_taxonomy_default #' @export -parse_taxonomy_greengenes <- function(char.vec){ - # Use default to assign names to elements in case problem with greengenes prefix - char.vec = parse_taxonomy_default(char.vec) - # Define the meaning of each prefix according to GreenGenes taxonomy - Tranks = c(k="Kingdom", p="Phylum", c="Class", o="Order", f="Family", g="Genus", s="Species") - # Check for prefix using regexp, warn if there were none. trim indices, ti - ti = grep("[[:alpha:]]{1}\\_\\_", char.vec) - if( length(ti) == 0L ){ - warning( - "No greengenes prefixes were found. \n", - "Consider using parse_taxonomy_default() instead if true for all OTUs. \n", - "Dummy ranks may be included among taxonomic ranks now." - ) - # Will want to return without further modifying char.vec - taxvec = char.vec - # Replace names of taxvec according to prefix, if any present... - } else { - # Remove prefix using sub-"" regexp, call result taxvec - taxvec = gsub("[[:alpha:]]{1}\\_\\_", "", char.vec) - # Define the ranks that will be replaced - repranks = Tranks[substr(char.vec[ti], 1, 1)] - # Replace, being sure to avoid prefixes not present in Tranks - names(taxvec)[ti[!is.na(repranks)]] = repranks[!is.na(repranks)] - } - return(taxvec) +parse_taxonomy_greengenes <- function(char.vec) { + # Use default to assign names to elements in case problem with greengenes prefix + char.vec = parse_taxonomy_default(char.vec) + # Define the meaning of each prefix according to GreenGenes taxonomy + Tranks = c(k = "Kingdom", p = "Phylum", c = "Class", o = "Order", f = "Family", + g = "Genus", s = "Species") + # Check for prefix using regexp, warn if there were none. trim indices, ti + ti = grep("[[:alpha:]]{1}\\_\\_", char.vec) + if (length(ti) == 0L) { + warning("No greengenes prefixes were found. \n", "Consider using parse_taxonomy_default() instead if true for all OTUs. \n", + "Dummy ranks may be included among taxonomic ranks now.") + # Will want to return without further modifying char.vec + taxvec = char.vec + # Replace names of taxvec according to prefix, if any present... + } else { + # Remove prefix using sub-'' regexp, call result taxvec + taxvec = gsub("[[:alpha:]]{1}\\_\\_", "", char.vec) + # Define the ranks that will be replaced + repranks = Tranks[substr(char.vec[ti], 1, 1)] + # Replace, being sure to avoid prefixes not present in Tranks + names(taxvec)[ti[!is.na(repranks)]] = repranks[!is.na(repranks)] + } + return(taxvec) } #' @rdname parseTaxonomy-functions #' @aliases parse_taxonomy_default #' @export -parse_taxonomy_qiime <- function(char.vec){ - parse_taxonomy_greengenes(strsplit(char.vec, ";", TRUE)[[1]]) +parse_taxonomy_qiime <- function(char.vec) { + parse_taxonomy_greengenes(strsplit(char.vec, ";", TRUE)[[1]]) } -################################################################################ +################################################################################ #' Build a \code{\link{tax_table}} from a named possibly-jagged list #' #' @param taxlist (Required). A list in which each element is a vector of @@ -1983,39 +1988,38 @@ parse_taxonomy_qiime <- function(char.vec){ #' @export #' #' @examples -#' taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae") +#' taxvec1 = c('Root', 'k__Bacteria', 'p__Firmicutes', 'c__Bacilli', 'o__Bacillales', 'f__Staphylococcaceae') #' parse_taxonomy_default(taxvec1) #' parse_taxonomy_greengenes(taxvec1) -#' taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae") +#' taxvec2 = c('Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae') #' parse_taxonomy_qiime(taxvec2) #' taxlist1 = list(OTU1=parse_taxonomy_greengenes(taxvec1), OTU2=parse_taxonomy_qiime(taxvec2)) #' taxlist2 = list(OTU1=parse_taxonomy_default(taxvec1), OTU2=parse_taxonomy_qiime(taxvec2)) #' build_tax_table(taxlist1) #' build_tax_table(taxlist2) -build_tax_table = function(taxlist){ - # Determine column headers (rank names) of taxonomy table - columns = unique(unlist(lapply(taxlist, names))) - # Initialize taxonomic character matrix - taxmat <- matrix(NA_character_, nrow=length(taxlist), ncol=length(columns)) - colnames(taxmat) = columns - # Fill in the matrix by row. - for( i in 1:length(taxlist) ){ - # Protect against empty taxonomy - if( length(taxlist[[i]]) > 0 ){ - # The extra column name check solves issues with raggedness, and disorder. - taxmat[i, names(taxlist[[i]])] <- taxlist[[i]] - } - } - # Convert functionally empty elements, "", to NA - taxmat[taxmat==""] <- NA_character_ - # Now coerce to matrix, name the rows as "id" (the taxa name), coerce to taxonomyTable - taxmat <- as(taxmat, "matrix") - rownames(taxmat) = names(taxlist) - return( tax_table(taxmat) ) +build_tax_table = function(taxlist) { + # Determine column headers (rank names) of taxonomy table + columns = unique(unlist(lapply(taxlist, names))) + # Initialize taxonomic character matrix + taxmat <- matrix(NA_character_, nrow = length(taxlist), ncol = length(columns)) + colnames(taxmat) = columns + # Fill in the matrix by row. + for (i in 1:length(taxlist)) { + # Protect against empty taxonomy + if (length(taxlist[[i]]) > 0) { + # The extra column name check solves issues with raggedness, and disorder. + taxmat[i, names(taxlist[[i]])] <- taxlist[[i]] + } + } + # Convert functionally empty elements, '', to NA + taxmat[taxmat == ""] <- NA_character_ + # Now coerce to matrix, name the rows as 'id' (the taxa name), coerce to + # taxonomyTable + taxmat <- as(taxmat, "matrix") + rownames(taxmat) = names(taxlist) + return(tax_table(taxmat)) } -################################################################################ -################################################################################ -################################################################################ +################################################################################ #' Download and import directly from microbio.me/qiime #' #' This function is for accessing microbiome datasets from the @@ -2048,8 +2052,8 @@ build_tax_table = function(taxlist){ #' this function will complete the remainder of the ftp URL hosted at #' \href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}. #' For example, instead of the full URL string, -#' \code{"ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_494_split_library_seqs_and_mapping.zip"}, -#' you could simply provide \code{494} or \code{"494"} +#' \code{'ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_494_split_library_seqs_and_mapping.zip'}, +#' you could simply provide \code{494} or \code{'494'} #' as the first (`zipftp`) argument. #' #' @param ext (Optional). A \code{\link{character}} string of the expected @@ -2098,12 +2102,12 @@ build_tax_table = function(taxlist){ #' # This should return TRUE on your system if you have internet turned on #' # and a standard R installation. Indicates whether this is likely to #' # work on your system for a URL or local file, respectively. -#' capabilities("http/ftp"); capabilities("fifo") +#' capabilities('http/ftp'); capabilities('fifo') #' # A working example with a local example file included in phyloseq -#' zipfile = "study_816_split_library_seqs_and_mapping.zip" -#' zipfile = system.file("extdata", zipfile, package="phyloseq") -#' tarfile = "study_816_split_library_seqs_and_mapping.tar.gz" -#' tarfile = system.file("extdata", tarfile, package="phyloseq") +#' zipfile = 'study_816_split_library_seqs_and_mapping.zip' +#' zipfile = system.file('extdata', zipfile, package='phyloseq') +#' tarfile = 'study_816_split_library_seqs_and_mapping.tar.gz' +#' tarfile = system.file('extdata', tarfile, package='phyloseq') #' tarps = microbio_me_qiime(tarfile) #' zipps = microbio_me_qiime(zipfile) #' identical(tarps, zipps) @@ -2111,109 +2115,108 @@ build_tax_table = function(taxlist){ #' plot_heatmap(tarps) #' # A real example #' # # Smokers dataset -#' # smokezip = "ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_524_split_library_seqs_and_mapping.zip" +#' # smokezip = 'ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_524_split_library_seqs_and_mapping.zip' #' # smokers1 = microbio_me_qiime(smokezip) #' # # Alternatively, just use the study number #' # smokers2 = microbio_me_qiime(524) #' # identical(smokers1, smokers2) -microbio_me_qiime = function(zipftp, ext=".zip", parsef=parse_taxonomy_greengenes, ...){ - # Define naming convention - front = "ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_" - if( !is.na(as.integer(zipftp)) ){ - # If study number instead of string, - # create the ftp URL using ext and convention - back = paste0("_split_library_seqs_and_mapping", ext) - zipftp = paste0(front, zipftp, back) - } else { - # Determine file extension from the file path itself - ext = substring(zipftp, regexpr("\\.([[:alnum:]]+)$", zipftp)[1]) - back = paste0("_split_library_seqs_and_mapping", ext) - } - # Check if zipftp is clearly an externally located file, ftp, http, etc. - externprefixes = c("http://", "https://", "ftp://") - prefix = regexpr("^([[:alnum:]]+)\\://", zipftp) - if( substr(zipftp, 1, attr(prefix, "match.length")[1]) %in% externprefixes ){ - # If external, then create temporary file and download - zipfile = tempfile() - download.file(zipftp, zipfile, ...) - } else { - # Else it is a local zipfile - zipfile = zipftp - } - # Use the apparent file naming convention for microbio.me/qiime - # as the de facto guide for this API. In particular, - # the expectation o fthe study name (already used above) - studyname = gsub("\\_split\\_.+$", "", basename(zipftp)) - # The output of tempdir() is always the same in the same R session - # To avoid conflict with multiple microbio.me/qiime unpacks - # in the same session, pre-pend the study name and datestamp +microbio_me_qiime = function(zipftp, ext = ".zip", parsef = parse_taxonomy_greengenes, + ...) { + # Define naming convention + front = "ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_" + if (!is.na(as.integer(zipftp))) { + # If study number instead of string, create the ftp URL using ext and convention + back = paste0("_split_library_seqs_and_mapping", ext) + zipftp = paste0(front, zipftp, back) + } else { + # Determine file extension from the file path itself + ext = substring(zipftp, regexpr("\\.([[:alnum:]]+)$", zipftp)[1]) + back = paste0("_split_library_seqs_and_mapping", ext) + } + # Check if zipftp is clearly an externally located file, ftp, http, etc. + externprefixes = c("http://", "https://", "ftp://") + prefix = regexpr("^([[:alnum:]]+)\\://", zipftp) + if (substr(zipftp, 1, attr(prefix, "match.length")[1]) %in% externprefixes) { + # If external, then create temporary file and download + zipfile = tempfile() + download.file(zipftp, zipfile, ...) + } else { + # Else it is a local zipfile + zipfile = zipftp + } + # Use the apparent file naming convention for microbio.me/qiime as the de facto + # guide for this API. In particular, the expectation o fthe study name (already + # used above) + studyname = gsub("\\_split\\_.+$", "", basename(zipftp)) + # The output of tempdir() is always the same in the same R session To avoid + # conflict with multiple microbio.me/qiime unpacks in the same session, pre-pend + # the study name and datestamp unpackdir = paste0(studyname, "_", gsub("[[:blank:][:punct:]]", "", date())) # Add the temp path - unpackdir = file.path(tempdir(), unpackdir) + unpackdir = file.path(tempdir(), unpackdir) # Create the unpack directory if needed (most likely). - if( !file.exists(unpackdir) ){dir.create(unpackdir)} - # Unpack to the temporary directory using unzip or untar - if( ext == ".zip" ){ - unzip(zipfile, exdir=unpackdir, overwrite=TRUE) - } else if( ext %in% c("tar.gz", ".tgz", ".gz", ".gzip", ".bzip2", ".xz") ){ - # untar the tarfile to the new temp dir - untar(zipfile, exdir=unpackdir) - } else { - # The compression format was not recognized. Provide informative error msg. - msg = paste("Could not determine the compression type.", - "Expected extensions are (mostly):", - ".zip, .tgz, .tar.gz", sep="\n") - stop(msg) - } - # Define a list of imported objects that might grow - # if the right file types are present and imported correctly. - imported_objects = vector("list") - # Search recursively in the unpacked directory for the .biom file - # and parse if it is. - # There should be only one. Throw warning if more than one, take the first. - biomfile = list.files(unpackdir, "\\.biom", full.names=TRUE, recursive=TRUE) - if( length(biomfile) > 1 ){ + if (!file.exists(unpackdir)) { + dir.create(unpackdir) + } + # Unpack to the temporary directory using unzip or untar + if (ext == ".zip") { + unzip(zipfile, exdir = unpackdir, overwrite = TRUE) + } else if (ext %in% c("tar.gz", ".tgz", ".gz", ".gzip", ".bzip2", ".xz")) { + # untar the tarfile to the new temp dir + untar(zipfile, exdir = unpackdir) + } else { + # The compression format was not recognized. Provide informative error msg. + msg = paste("Could not determine the compression type.", "Expected extensions are (mostly):", + ".zip, .tgz, .tar.gz", sep = "\n") + stop(msg) + } + # Define a list of imported objects that might grow if the right file types are + # present and imported correctly. + imported_objects = vector("list") + # Search recursively in the unpacked directory for the .biom file and parse if it + # is. There should be only one. Throw warning if more than one, take the first. + biomfile = list.files(unpackdir, "\\.biom", full.names = TRUE, recursive = TRUE) + if (length(biomfile) > 1) { warning("more than one .biom file found in compressed archive. Importing first only.") biomfile = biomfile[1] - } else if( length(biomfile) == 1 ){ - cat("Found biom-format file, now parsing it... \n") - biom = import_biom(biomfile, parseFunction=parsef) - cat("Done parsing biom... \n") - imported_objects = c(imported_objects, list(biom)) - } - # Check if sample_data (qiime mapping) file present, and parse if it is. - sdfile = list.files(unpackdir, "\\_mapping\\_file\\.txt", full.names=TRUE, recursive=TRUE) - if( length(sdfile) > 1 ){ - warning("more than one mapping file found in compressed archive. Importing first only.") - sdfile = sdfile[1] - } else if( length(sdfile)==1 ){ - cat("Importing Sample Metdadata from mapping file...", fill=TRUE) - sample_metadata = import_qiime_sample_data(sdfile) - imported_objects = c(imported_objects, list(sample_metadata)) - } - # Check success, notify user, and return. - if( length(imported_objects) > 1 ){ - # If there are more than one imported objects, merge them and return - cat("Merging the imported objects... \n") - physeq = do.call("merge_phyloseq", imported_objects) - if( inherits(physeq, "phyloseq") ){ - cat("Successfully merged, phyloseq-class created. \n Returning... \n") - } - return(physeq) - } else if( length(imported_objects) == 1 ){ - cat("Note: only on object in the zip file was imported. \n") - cat("It was ", class(imported_objects[[1]]), " class. \n") - return(imported_objects[[1]]) - } else { - cat("PLEASE NOTE: No objects were imported. \n", - "You chould check the zip file, \n", - "as well as the naming conventions in the zipfile \n", - "to make sure that they match microbio.me/qiime. \n", - "Instead returning NULL... \n") - return(NULL) - } + } else if (length(biomfile) == 1) { + cat("Found biom-format file, now parsing it... \n") + biom = import_biom(biomfile, parseFunction = parsef) + cat("Done parsing biom... \n") + imported_objects = c(imported_objects, list(biom)) + } + # Check if sample_data (qiime mapping) file present, and parse if it is. + sdfile = list.files(unpackdir, "\\_mapping\\_file\\.txt", full.names = TRUE, + recursive = TRUE) + if (length(sdfile) > 1) { + warning("more than one mapping file found in compressed archive. Importing first only.") + sdfile = sdfile[1] + } else if (length(sdfile) == 1) { + cat("Importing Sample Metdadata from mapping file...", fill = TRUE) + sample_metadata = import_qiime_sample_data(sdfile) + imported_objects = c(imported_objects, list(sample_metadata)) + } + # Check success, notify user, and return. + if (length(imported_objects) > 1) { + # If there are more than one imported objects, merge them and return + cat("Merging the imported objects... \n") + physeq = do.call("merge_phyloseq", imported_objects) + if (inherits(physeq, "phyloseq")) { + cat("Successfully merged, phyloseq-class created. \n Returning... \n") + } + return(physeq) + } else if (length(imported_objects) == 1) { + cat("Note: only on object in the zip file was imported. \n") + cat("It was ", class(imported_objects[[1]]), " class. \n") + return(imported_objects[[1]]) + } else { + cat("PLEASE NOTE: No objects were imported. \n", "You chould check the zip file, \n", + "as well as the naming conventions in the zipfile \n", "to make sure that they match microbio.me/qiime. \n", + "Instead returning NULL... \n") + return(NULL) + } } -################################################################################ +################################################################################ #' Import usearch table format (\code{.uc}) to OTU table #' #' UPARSE is an algorithm for OTU-clustering implemented within usearch. @@ -2239,7 +2242,7 @@ microbio_me_qiime = function(zipftp, ext=".zip", parsef=parse_taxonomy_greengene #' assumed that the 9th and 10th columns of the \code{.uc} table #' hold the read-label and OTU ID, respectively; #' and it is also assumed that the delimiter between sample-name and read -#' in the read-name entries is a single \code{"_"}. +#' in the read-name entries is a single \code{'_'}. #' #' @param ucfile (Required). A file location character string #' or \code{\link{connection}} @@ -2260,7 +2263,7 @@ microbio_me_qiime = function(zipftp, ext=".zip", parsef=parse_taxonomy_greengene #' This should be the delimiter that separates the sample ID #' from the original ID in the demultiplexed read ID of your sequence file. #' The default is plain underscore, which in this \code{\link{regex}} context -#' is \code{"_"}. +#' is \code{'_'}. #' #' @param verbose (Optional). A \code{\link{logical}}. #' Default is \code{TRUE}. @@ -2277,36 +2280,34 @@ microbio_me_qiime = function(zipftp, ext=".zip", parsef=parse_taxonomy_greengene #' \code{\link{import_qiime}} #' #' @examples -#' usearchfile <- system.file("extdata", "usearch.uc", package="phyloseq") +#' usearchfile <- system.file('extdata', 'usearch.uc', package='phyloseq') #' import_usearch_uc(usearchfile) -import_usearch_uc <- function(ucfile, colRead=9, colOTU=10, - readDelimiter="_", verbose=TRUE){ - if(verbose){cat("Reading `ucfile` into memory and parsing into table \n")} - # fread is one of the fastest and most-efficient importers for R. - # It creates a data.table object, suitable for large size objects - x = fread(ucfile, sep="\t", header=FALSE, na.strings=c("*", '*', "NA","N/A",""), - select=c(colRead, colOTU), colClasses="character", showProgress=TRUE) +import_usearch_uc <- function(ucfile, colRead = 9, colOTU = 10, readDelimiter = "_", + verbose = TRUE) { + if (verbose) { + cat("Reading `ucfile` into memory and parsing into table \n") + } + # fread is one of the fastest and most-efficient importers for R. It creates a + # data.table object, suitable for large size objects + x = fread(ucfile, sep = "\t", header = FALSE, na.strings = c("*", "*", "NA", + "N/A", ""), select = c(colRead, colOTU), colClasses = "character", showProgress = TRUE) setnames(x, c("read", "OTU")) NrawEntries = nrow(x) - if(verbose){ + if (verbose) { cat("Initially read", NrawEntries, "entries. \n") cat("... Now removing unassigned OTUs (* or NA)... \n") } x = x[!is.na(OTU), ] - if(verbose){ + if (verbose) { cat("Removed", NrawEntries - nrow(x), "entries that had no OTU assignment. \n") cat("A total of", nrow(x), "will be assigned to the OTU table.\n") } # Process sequence label to be sample label only - x[, sample:=gsub(paste0(readDelimiter, ".+$"), "", read)] + x[, `:=`(sample, gsub(paste0(readDelimiter, ".+$"), "", read))] # Convert long (melted) table into a sample-by-OTU OTU table, and return OTU <- as(table(x$sample, x$OTU), "matrix") - # system.time({setkey(x, OTU, sample) - # OTU2 <- dcast.data.table(x, sample ~ OTU, fun.aggregate=length, fill=0L) - # }) - return(otu_table(OTU, taxa_are_rows=FALSE)) + # system.time({setkey(x, OTU, sample) OTU2 <- dcast.data.table(x, sample ~ OTU, + # fun.aggregate=length, fill=0L) }) + return(otu_table(OTU, taxa_are_rows = FALSE)) } -################################################################################ -################################################################################ -################################################################################ -################################################################################ \ No newline at end of file +################################################################################ diff --git a/R/allClasses.R b/R/allClasses.R index 5eae1974..c4d3f82d 100644 --- a/R/allClasses.R +++ b/R/allClasses.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' The S4 class for storing taxa-abundance information. #' #' Because orientation of these tables can vary by method, the orientation is @@ -10,7 +10,7 @@ #' #' \describe{ #' \item{taxa_are_rows}{ -#' A single logical specifying the orientation of the abundance table. +#'\t\tA single logical specifying the orientation of the abundance table. #' } #' #' \item{.Data}{This slot is inherited from the \code{\link{matrix}} class.} @@ -18,8 +18,8 @@ #' @name otu_table-class #' @rdname otu_table-class #' @exportClass otu_table -setClass("otu_table", representation(taxa_are_rows="logical"), contains = "matrix") -################################################################################ +setClass("otu_table", representation(taxa_are_rows = "logical"), contains = "matrix") +################################################################################ #' The S4 for storing sample variables. #' #' Row indices represent samples, while column indices represent experimental @@ -30,7 +30,7 @@ setClass("otu_table", representation(taxa_are_rows="logical"), contains = "matri #' \item{.Data}{data-frame data, inherited from the data.frame class.} #' #' \item{row.names}{ -#' Also inherited from the data.frame class; +#'\t Also inherited from the data.frame class; #' it should contain the sample names. #' } #' @@ -41,8 +41,8 @@ setClass("otu_table", representation(taxa_are_rows="logical"), contains = "matri #' @name sample_data-class #' @rdname sample_data-class #' @exportClass sample_data -setClass("sample_data", contains="data.frame") -################################################################################ +setClass("sample_data", contains = "data.frame") +################################################################################ #' An S4 class that holds taxonomic classification data as a character #' matrix. #' @@ -56,8 +56,7 @@ setClass("sample_data", contains="data.frame") #' @rdname taxonomyTable-class #' @exportClass taxonomyTable setClass("taxonomyTable", contains = "matrix") -#metaMDS -################################################################################ +# metaMDS #' S3 class placeholder definition (list) for metaMDS #' #' The ape package does export a version of its \code{\link[vegan]{metaMDS}}-class, @@ -76,10 +75,7 @@ setClass("taxonomyTable", contains = "matrix") #' #' @keywords internal metaMDS <- structure(list(), class = "metaMDS") -### -# Remove if this ever works -# @importClassesFrom vegan metaMDS -################################################################################ +### Remove if this ever works @importClassesFrom vegan metaMDS #' S3 class placeholder definition (list) for decorana #' #' The ape package does export a version of its \code{\link[vegan]{decorana}}-class, @@ -98,10 +94,7 @@ metaMDS <- structure(list(), class = "metaMDS") #' #' @keywords internal decorana <- structure(list(), class = "decorana") -### -# Remove if this ever works -# @importClassesFrom vegan decorana -################################################################################ +### Remove if this ever works @importClassesFrom vegan decorana #' S3 class placeholder definition (list) for dpcoa #' #' The ade4 package does not export a version of its \code{\link[ade4]{dpcoa}}-class, @@ -128,13 +121,8 @@ decorana <- structure(list(), class = "decorana") #' #' @keywords internal dpcoa <- structure(list(), class = "dpcoa") -################################################################################ -## # @keywords internal -## print.dpcoa <- ade4:::print.dpcoa -################################################################################ -# If this ever works -# @importClassesFrom ade4 dpcoa -################################################################################ +################################################################################ # @keywords internal print.dpcoa <- ade4:::print.dpcoa If this ever works +################################################################################ @importClassesFrom ade4 dpcoa #' S3 class for ape-calculated MDS results #' #' Nothing to import, because ape doesn't (yet) export this S3 class. @@ -146,9 +134,7 @@ dpcoa <- structure(list(), class = "dpcoa") #' #' @keywords internal pcoa <- structure(list(), class = "pcoa") -# @importMethodsFrom ape print -# phyloseq-specific definition of "phylo" class, -################################################################################ +# @importMethodsFrom ape print phyloseq-specific definition of 'phylo' class, #' S3 class placeholder definition (list) for phylogenetic trees. #' #' The ape package does not export a version of its \code{\link[ape]{phylo}}-class, @@ -176,10 +162,7 @@ pcoa <- structure(list(), class = "pcoa") #' #' @keywords internal phylo <- structure(list(), class = "phylo") -################################################################################ -# If this ever works -# @importClassesFrom ape phylo -################################################################################ +################################################################################ If this ever works @importClassesFrom ape phylo #' An S4 placeholder of the main phylogenetic tree class from the ape package. #' #' See the \code{\link[ape]{ape}} package for details about this type of @@ -192,7 +175,7 @@ phylo <- structure(list(), class = "phylo") #' @rdname phylo-class #' @exportClass phylo setOldClass("phylo") -################################################################################ +################################################################################ #' An S4 placeholder for the \code{\link[stats]{dist}} class. #' #' See \code{\link[stats]{dist}} for details @@ -204,12 +187,10 @@ setOldClass("phylo") #' @rdname dist-class #' @exportClass dist setOldClass("dist") -################################################################################ -# Use setClassUnion to define the unholy NULL-data union as a virtual class. -# This is a way of dealing with the expected scenarios in which one or more of -# the component data classes is not available, in which case NULL will be used -# instead. -################################################################################ +################################################################################ Use setClassUnion to define the unholy NULL-data union as a virtual class. +################################################################################ This is a way of dealing with the expected scenarios in which one or more of +################################################################################ the component data classes is not available, in which case NULL will be used +################################################################################ instead. #' @keywords internal setClassUnion("otu_tableOrNULL", c("otu_table", "NULL")) #' @keywords internal @@ -234,15 +215,15 @@ setClassUnion("phyloOrNULL", c("phylo", "NULL")) #' @importClassesFrom Biostrings XStringSet #' @keywords internal setClassUnion("XStringSetOrNULL", c("XStringSet", "NULL")) -################################################################################ +################################################################################ #' The main experiment-level class for phyloseq data #' #' Contains all currently-supported component data classes: #' \code{\link{otu_table-class}}, #' \code{\link{sample_data-class}}, -#' \code{\link{taxonomyTable-class}} (\code{"tax_table"} slot), -#' \code{\link[ape]{phylo}}-class (\code{"phy_tree"} slot), -#' and the \code{\link[Biostrings]{XStringSet-class}} (\code{"refseq"} slot). +#' \code{\link{taxonomyTable-class}} (\code{'tax_table'} slot), +#' \code{\link[ape]{phylo}}-class (\code{'phy_tree'} slot), +#' and the \code{\link[Biostrings]{XStringSet-class}} (\code{'refseq'} slot). #' There are several advantages #' to storing your phylogenetic sequencing experiment as an instance of the #' phyloseq class, not the least of which is that it is easy to return to the @@ -283,13 +264,8 @@ setClassUnion("XStringSetOrNULL", c("XStringSet", "NULL")) #' @name phyloseq-class #' @rdname phyloseq-class #' @exportClass phyloseq -setClass(Class="phyloseq", - representation=representation( - otu_table="otu_tableOrNULL", - tax_table="taxonomyTableOrNULL", - sam_data="sample_dataOrNULL", - phy_tree="phyloOrNULL", - refseq = "XStringSetOrNULL"), - prototype=prototype(otu_table=NULL, tax_table=NULL, sam_data=NULL, phy_tree=NULL, refseq=NULL) -) -################################################################################ +setClass(Class = "phyloseq", representation = representation(otu_table = "otu_tableOrNULL", + tax_table = "taxonomyTableOrNULL", sam_data = "sample_dataOrNULL", phy_tree = "phyloOrNULL", + refseq = "XStringSetOrNULL"), prototype = prototype(otu_table = NULL, tax_table = NULL, + sam_data = NULL, phy_tree = NULL, refseq = NULL)) +################################################################################ diff --git a/R/allData.R b/R/allData.R index d8bd2d35..317583ad 100644 --- a/R/allData.R +++ b/R/allData.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' (Data) Small example dataset from a human esophageal community (2004) #' #' Includes just 3 samples, 1 each from 3 subjects. Although the research article mentions 4 subjects, @@ -34,15 +34,15 @@ #' data(esophagus) #' UniFrac(esophagus, weighted=TRUE) #' # How to re-create the esophagus dataset using import_mothur function -#' mothlist <- system.file("extdata", "esophagus.fn.list.gz", package="phyloseq") -#' mothgroup <- system.file("extdata", "esophagus.good.groups.gz", package="phyloseq") -#' mothtree <- system.file("extdata", "esophagus.tree.gz", package="phyloseq") +#' mothlist <- system.file('extdata', 'esophagus.fn.list.gz', package='phyloseq') +#' mothgroup <- system.file('extdata', 'esophagus.good.groups.gz', package='phyloseq') +#' mothtree <- system.file('extdata', 'esophagus.tree.gz', package='phyloseq') #' show_mothur_cutoffs(mothlist) -#' cutoff <- "0.10" -#' esophman <- import_mothur(mothlist, mothgroup, mothtree, cutoff) -################################################################################ +#' cutoff <- '0.10' +#' esophman <- import_mothur(mothlist, mothgroup, mothtree, cutoff)\t +################################################################################ NA -################################################################################ +################################################################################ #' (Data) Enterotypes of the human gut microbiome (2011) #' #' Published in Nature in early 2011, this work compared (among other things), @@ -81,11 +81,11 @@ NA #' @keywords data #' @examples #' data(enterotype) -#' ig <- make_network(enterotype, "samples", max.dist=0.3) -#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) -################################################################################ +#' ig <- make_network(enterotype, 'samples', max.dist=0.3) +#' plot_network(ig, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) +################################################################################ NA -################################################################################ +################################################################################ #' (Data) Reproducibility of soil microbiome data (2011) #' #' Published in early 2011, @@ -159,12 +159,11 @@ NA #' # No convincing difference in species richness between warmed/unwarmed soils. #' ################################################################################ #' # Graphically compare richness between the different treatments. -#' man.col <- c(WC="red", WU="brown", UC="blue", UU="darkgreen") -#' plot_richness(soilrep, x="Treatment", color="Treatment", measures=c("Observed", "Chao1", "Shannon")) -################################################################################ +#' man.col <- c(WC='red', WU='brown', UC='blue', UU='darkgreen') +#' plot_richness(soilrep, x='Treatment', color='Treatment', measures=c('Observed', 'Chao1', 'Shannon')) +################################################################################ NA -################################################################################ -################################################################################ +################################################################################ #' (Data) Global patterns of 16S rRNA diversity at a depth of millions of sequences per sample (2011) #' #' Published in PNAS in early 2011. This work compared the microbial @@ -207,7 +206,7 @@ NA #' #' @examples #' data(GlobalPatterns) -#' plot_richness(GlobalPatterns, x="SampleType", measures=c("Observed", "Chao1", "Shannon")) -################################################################################ +#' plot_richness(GlobalPatterns, x='SampleType', measures=c('Observed', 'Chao1', 'Shannon')) +################################################################################ NA -################################################################################ +################################################################################ diff --git a/R/allPackage.R b/R/allPackage.R index 9945db72..9de2e027 100644 --- a/R/allPackage.R +++ b/R/allPackage.R @@ -1,4 +1,4 @@ -############################################### +############################################### #' Handling and analysis of high-throughput phylogenetic sequence data. #' #' There are already several ecology and phylogenetic packages available in R, @@ -21,4 +21,4 @@ #' @docType package #' @keywords package NA -############################################### +############################################### diff --git a/R/almostAllAccessors.R b/R/almostAllAccessors.R index 76c97090..456adbdd 100644 --- a/R/almostAllAccessors.R +++ b/R/almostAllAccessors.R @@ -1,7 +1,4 @@ -################################################################################ -### Accessor / subset methods. -################################################################################ -################################################################################ +################################################################################ Accessor / subset methods. #' Retrieve reference sequences (\code{\link[Biostrings]{XStringSet}}-class) from object. #' #' This is the suggested method @@ -38,18 +35,20 @@ #' @examples #' data(GlobalPatterns) #' refseq(GlobalPatterns, FALSE) -setGeneric("refseq", function(physeq, errorIfNULL=TRUE) standardGeneric("refseq")) +setGeneric("refseq", function(physeq, errorIfNULL = TRUE) standardGeneric("refseq")) #' @rdname refseq-methods #' @aliases refseq,ANY-method -setMethod("refseq", "ANY", function(physeq, errorIfNULL=TRUE){ - access(physeq, "refseq", errorIfNULL) +setMethod("refseq", "ANY", function(physeq, errorIfNULL = TRUE) { + access(physeq, "refseq", errorIfNULL) }) -# Return as-is if already a "XStringSet" object +# Return as-is if already a 'XStringSet' object #' @importClassesFrom Biostrings XStringSet #' @rdname refseq-methods #' @aliases refseq,XStringSet-method -setMethod("refseq", "XStringSet", function(physeq){ return(physeq) }) -################################################################################ +setMethod("refseq", "XStringSet", function(physeq) { + return(physeq) +}) +################################################################################ #' Retrieve phylogenetic tree (\code{\link[ape]{phylo}}-class) from object. #' #' This is the suggested method @@ -94,17 +93,19 @@ setMethod("refseq", "XStringSet", function(physeq){ return(physeq) }) #' @examples #' data(GlobalPatterns) #' phy_tree(GlobalPatterns) -setGeneric("phy_tree", function(physeq, errorIfNULL=TRUE) standardGeneric("phy_tree")) +setGeneric("phy_tree", function(physeq, errorIfNULL = TRUE) standardGeneric("phy_tree")) #' @rdname phy_tree-methods #' @aliases phy_tree,ANY-method -setMethod("phy_tree", "ANY", function(physeq, errorIfNULL=TRUE){ - access(physeq, "phy_tree", errorIfNULL) +setMethod("phy_tree", "ANY", function(physeq, errorIfNULL = TRUE) { + access(physeq, "phy_tree", errorIfNULL) }) -# Return as-is if already a "phylo" object +# Return as-is if already a 'phylo' object #' @rdname phy_tree-methods #' @aliases phy_tree,phylo-method -setMethod("phy_tree", "phylo", function(physeq){ return(physeq) }) -################################################################################ +setMethod("phy_tree", "phylo", function(physeq) { + return(physeq) +}) +################################################################################ #' Access taxa_are_rows slot from otu_table objects. #' #' @usage taxa_are_rows(physeq) @@ -121,16 +122,20 @@ setMethod("phy_tree", "phylo", function(physeq){ return(physeq) }) setGeneric("taxa_are_rows", function(physeq) standardGeneric("taxa_are_rows")) #' @rdname taxa_are_rows-methods #' @aliases taxa_are_rows,ANY-method -setMethod("taxa_are_rows", "ANY", function(physeq){NULL}) +setMethod("taxa_are_rows", "ANY", function(physeq) { + NULL +}) #' @rdname taxa_are_rows-methods #' @aliases taxa_are_rows,otu_table-method -setMethod("taxa_are_rows", "otu_table", function(physeq){physeq@taxa_are_rows}) +setMethod("taxa_are_rows", "otu_table", function(physeq) { + physeq@taxa_are_rows +}) #' @rdname taxa_are_rows-methods #' @aliases taxa_are_rows,phyloseq-method -setMethod("taxa_are_rows", "phyloseq", function(physeq){ - taxa_are_rows(otu_table(physeq)) +setMethod("taxa_are_rows", "phyloseq", function(physeq) { + taxa_are_rows(otu_table(physeq)) }) -################################################################################ +################################################################################ #' Get the number of taxa/species. #' #' @usage ntaxa(physeq) @@ -148,44 +153,46 @@ setMethod("taxa_are_rows", "phyloseq", function(physeq){ #' @export #' #' @examples -#' data("esophagus") +#' data('esophagus') #' ntaxa(esophagus) #' phy_tree(esophagus) #' ntaxa(phy_tree(esophagus)) setGeneric("ntaxa", function(physeq) standardGeneric("ntaxa")) #' @rdname ntaxa-methods #' @aliases ntaxa,ANY-method -setMethod("ntaxa", "ANY", function(physeq){ return(NULL) }) +setMethod("ntaxa", "ANY", function(physeq) { + return(NULL) +}) #' @rdname ntaxa-methods #' @aliases ntaxa,phyloseq-method -setMethod("ntaxa", "phyloseq", function(physeq){ - ntaxa(otu_table(physeq)) +setMethod("ntaxa", "phyloseq", function(physeq) { + ntaxa(otu_table(physeq)) }) #' @rdname ntaxa-methods #' @aliases ntaxa,otu_table-method -setMethod("ntaxa", "otu_table", function(physeq){ - if( taxa_are_rows(physeq) ){ - return( nrow(physeq) ) - } else { - return( ncol(physeq) ) - } +setMethod("ntaxa", "otu_table", function(physeq) { + if (taxa_are_rows(physeq)) { + return(nrow(physeq)) + } else { + return(ncol(physeq)) + } }) #' @rdname ntaxa-methods #' @aliases ntaxa,taxonomyTable-method -setMethod("ntaxa", "taxonomyTable", function(physeq){ - nrow(physeq) +setMethod("ntaxa", "taxonomyTable", function(physeq) { + nrow(physeq) }) #' @rdname ntaxa-methods #' @aliases ntaxa,phylo-method -setMethod("ntaxa", "phylo", function(physeq){ - length(physeq$tip.label) +setMethod("ntaxa", "phylo", function(physeq) { + length(physeq$tip.label) }) #' @rdname ntaxa-methods #' @aliases ntaxa,XStringSet-method -setMethod("ntaxa", "XStringSet", function(physeq){ - length(physeq) +setMethod("ntaxa", "XStringSet", function(physeq) { + length(physeq) }) -################################################################################ +################################################################################ #' Get species / taxa names. #' #' @usage taxa_names(physeq) @@ -203,44 +210,46 @@ setMethod("ntaxa", "XStringSet", function(physeq){ #' @export #' #' @examples # -#' data("esophagus") +#' data('esophagus') #' tree <- phy_tree(esophagus) #' OTU1 <- otu_table(esophagus) #' taxa_names(tree) #' taxa_names(OTU1) #' physeq1 <- phyloseq(OTU1, tree) #' taxa_names(physeq1) -setGeneric("taxa_names", function(physeq) standardGeneric("taxa_names")) +setGeneric("taxa_names", function(physeq) standardGeneric("taxa_names")) #' @rdname taxa_names-methods #' @aliases taxa_names,ANY-method -setMethod("taxa_names", "ANY", function(physeq){ return(NULL) }) +setMethod("taxa_names", "ANY", function(physeq) { + return(NULL) +}) #' @rdname taxa_names-methods #' @aliases taxa_names,phyloseq-method -setMethod("taxa_names", "phyloseq", function(physeq){ - taxa_names(otu_table(physeq)) +setMethod("taxa_names", "phyloseq", function(physeq) { + taxa_names(otu_table(physeq)) }) #' @rdname taxa_names-methods #' @aliases taxa_names,otu_table-method -setMethod("taxa_names", "otu_table", function(physeq){ - if( taxa_are_rows(physeq) ){ - return( rownames(physeq) ) - } else { - return( colnames(physeq) ) - } +setMethod("taxa_names", "otu_table", function(physeq) { + if (taxa_are_rows(physeq)) { + return(rownames(physeq)) + } else { + return(colnames(physeq)) + } }) #' @rdname taxa_names-methods #' @aliases taxa_names,taxonomyTable-method -setMethod("taxa_names", "taxonomyTable", function(physeq) rownames(physeq) ) +setMethod("taxa_names", "taxonomyTable", function(physeq) rownames(physeq)) #' @rdname taxa_names-methods #' @aliases taxa_names,sample_data-method -setMethod("taxa_names", "sample_data", function(physeq) NULL ) +setMethod("taxa_names", "sample_data", function(physeq) NULL) #' @rdname taxa_names-methods #' @aliases taxa_names,phylo-method -setMethod("taxa_names", "phylo", function(physeq) physeq$tip.label ) +setMethod("taxa_names", "phylo", function(physeq) physeq$tip.label) #' @rdname taxa_names-methods #' @aliases taxa_names,XStringSet-method -setMethod("taxa_names", "XStringSet", function(physeq) names(physeq) ) -################################################################################ +setMethod("taxa_names", "XStringSet", function(physeq) names(physeq)) +################################################################################ #' Get the number of samples. #' #' @usage nsamples(physeq) @@ -258,7 +267,7 @@ setMethod("taxa_names", "XStringSet", function(physeq) names(physeq) ) #' @export #' #' @examples # -#' data("esophagus") +#' data('esophagus') #' tree <- phy_tree(esophagus) #' OTU1 <- otu_table(esophagus) #' nsamples(OTU1) @@ -267,26 +276,28 @@ setMethod("taxa_names", "XStringSet", function(physeq) names(physeq) ) setGeneric("nsamples", function(physeq) standardGeneric("nsamples")) #' @rdname nsamples-methods #' @aliases nsamples,ANY-method -setMethod("nsamples", "ANY", function(physeq){ return(NULL) }) +setMethod("nsamples", "ANY", function(physeq) { + return(NULL) +}) #' @rdname nsamples-methods #' @aliases nsamples,phyloseq-method -setMethod("nsamples", "phyloseq", function(physeq){ - # dispatch to core, required component, otu_table - nsamples(otu_table(physeq)) +setMethod("nsamples", "phyloseq", function(physeq) { + # dispatch to core, required component, otu_table + nsamples(otu_table(physeq)) }) #' @rdname nsamples-methods #' @aliases nsamples,otu_table-method -setMethod("nsamples", "otu_table", function(physeq){ - if( taxa_are_rows(physeq) ){ - return( ncol(physeq) ) - } else { - return( nrow(physeq) ) - } +setMethod("nsamples", "otu_table", function(physeq) { + if (taxa_are_rows(physeq)) { + return(ncol(physeq)) + } else { + return(nrow(physeq)) + } }) #' @rdname nsamples-methods #' @aliases nsamples,sample_data-method -setMethod("nsamples", "sample_data", function(physeq) nrow(physeq) ) -################################################################################ +setMethod("nsamples", "sample_data", function(physeq) nrow(physeq)) +################################################################################ #' Get sample names. #' #' @usage sample_names(physeq) @@ -308,31 +319,33 @@ setMethod("nsamples", "sample_data", function(physeq) nrow(physeq) ) #' data(esophagus) #' sample_names(esophagus) setGeneric("sample_names", function(physeq) standardGeneric("sample_names")) -# Unless otherwise specified, this should return a value of NULL -# That way, objects that do not explicitly describe samples all -# behave in the same (returning NULL) way. +# Unless otherwise specified, this should return a value of NULL That way, +# objects that do not explicitly describe samples all behave in the same +# (returning NULL) way. #' @rdname sample_names-methods #' @aliases sample_names,ANY-method -setMethod("sample_names", "ANY", function(physeq){ return(NULL) }) +setMethod("sample_names", "ANY", function(physeq) { + return(NULL) +}) #' @rdname sample_names-methods #' @aliases sample_names,phyloseq-method -setMethod("sample_names", "phyloseq", function(physeq){ - # dispatch to core, required component, otu_table - sample_names(otu_table(physeq)) +setMethod("sample_names", "phyloseq", function(physeq) { + # dispatch to core, required component, otu_table + sample_names(otu_table(physeq)) }) #' @rdname sample_names-methods #' @aliases sample_names,sample_data-method -setMethod("sample_names", "sample_data", function(physeq) rownames(physeq) ) +setMethod("sample_names", "sample_data", function(physeq) rownames(physeq)) #' @rdname sample_names-methods #' @aliases sample_names,otu_table-method -setMethod("sample_names", "otu_table", function(physeq){ - if( taxa_are_rows(physeq) ){ - return( colnames(physeq) ) - } else { - return( rownames(physeq) ) - } +setMethod("sample_names", "otu_table", function(physeq) { + if (taxa_are_rows(physeq)) { + return(colnames(physeq)) + } else { + return(rownames(physeq)) + } }) -################################################################################ +################################################################################ #' Returns all abundance values for species \code{i}. #' #' This is a simple accessor function for investigating @@ -358,25 +371,25 @@ setMethod("sample_names", "otu_table", function(physeq){ #' @examples #' data(esophagus) #' taxa_names(esophagus) -#' get_sample(esophagus, "59_5_19") +#' get_sample(esophagus, '59_5_19') setGeneric("get_sample", function(physeq, i) standardGeneric("get_sample")) -################################################################################ +################################################################################ #' @aliases get_sample,otu_table-method #' @rdname get_sample-methods -setMethod("get_sample", "otu_table", function(physeq, i){ - if( taxa_are_rows(physeq) ){ - as(physeq, "matrix")[i, ] - } else { - as(physeq, "matrix")[, i] - } +setMethod("get_sample", "otu_table", function(physeq, i) { + if (taxa_are_rows(physeq)) { + as(physeq, "matrix")[i, ] + } else { + as(physeq, "matrix")[, i] + } }) -################################################################################ +################################################################################ #' @aliases get_sample,phyloseq-method #' @rdname get_sample-methods -setMethod("get_sample", "phyloseq", function(physeq, i){ - get_sample(otu_table(physeq), i) +setMethod("get_sample", "phyloseq", function(physeq, i) { + get_sample(otu_table(physeq), i) }) -################################################################################ +################################################################################ #' Returns all abundance values of sample \code{i}. #' #' This is a simple accessor function for investigating @@ -403,23 +416,23 @@ setMethod("get_sample", "phyloseq", function(physeq, i){ #' @examples #' data(esophagus) #' sample_names(esophagus) -#' get_taxa(esophagus, "B") +#' get_taxa(esophagus, 'B') setGeneric("get_taxa", function(physeq, i) standardGeneric("get_taxa")) #' @aliases get_taxa,otu_table-method #' @rdname get_taxa-methods -setMethod("get_taxa", "otu_table", function(physeq, i){ - if( taxa_are_rows(physeq) ){ - as(physeq, "matrix")[, i] - } else { - as(physeq, "matrix")[i, ] - } +setMethod("get_taxa", "otu_table", function(physeq, i) { + if (taxa_are_rows(physeq)) { + as(physeq, "matrix")[, i] + } else { + as(physeq, "matrix")[i, ] + } }) #' @aliases get_taxa,phyloseq-method #' @rdname get_taxa-methods -setMethod("get_taxa", "phyloseq", function(physeq, i){ - get_taxa(otu_table(physeq), i) +setMethod("get_taxa", "phyloseq", function(physeq, i) { + get_taxa(otu_table(physeq), i) }) -################################################################################ +################################################################################ #' Retrieve the names of the taxonomic ranks #' #' This is a simple accessor function to make it more convenient to determine @@ -446,10 +459,10 @@ setMethod("get_taxa", "phyloseq", function(physeq, i){ #' @examples #' data(enterotype) #' rank_names(enterotype) -rank_names <- function(physeq, errorIfNULL=TRUE){ - colnames(tax_table(physeq, errorIfNULL)) +rank_names <- function(physeq, errorIfNULL = TRUE) { + colnames(tax_table(physeq, errorIfNULL)) } -################################################################################ +################################################################################ #' Get a unique vector of the observed taxa at a particular taxonomic rank #' #' This is a simple accessor function to make it more convenient to determine @@ -481,11 +494,11 @@ rank_names <- function(physeq, errorIfNULL=TRUE){ #' data(enterotype) #' get_taxa_unique(enterotype) #' data(GlobalPatterns) -#' get_taxa_unique(GlobalPatterns, "Family") -get_taxa_unique <- function(physeq, taxonomic.rank=rank_names(physeq)[1], errorIfNULL=TRUE){ - unique(as(tax_table(physeq, errorIfNULL)[, taxonomic.rank], "character")) +#' get_taxa_unique(GlobalPatterns, 'Family') +get_taxa_unique <- function(physeq, taxonomic.rank = rank_names(physeq)[1], errorIfNULL = TRUE) { + unique(as(tax_table(physeq, errorIfNULL)[, taxonomic.rank], "character")) } -################################################################################ +################################################################################ #' Get the sample variables present in sample_data #' #' This is a simple accessor function to make it more convenient to determine @@ -512,10 +525,10 @@ get_taxa_unique <- function(physeq, taxonomic.rank=rank_names(physeq)[1], errorI #' @examples #' data(enterotype) #' sample_variables(enterotype) -sample_variables <- function(physeq, errorIfNULL=TRUE){ - colnames(sample_data(physeq, errorIfNULL)) +sample_variables <- function(physeq, errorIfNULL = TRUE) { + colnames(sample_data(physeq, errorIfNULL)) } -################################################################################ +################################################################################ #' Get the values for a particular variable in sample_data #' #' This is a simple accessor function for streamlining access @@ -543,12 +556,12 @@ sample_variables <- function(physeq, errorIfNULL=TRUE){ #' # Load the GlobalPatterns dataset into the workspace environment #' data(GlobalPatterns) #' # Look at the different values for SampleType -#' get_variable(GlobalPatterns, "SampleType") -get_variable <- function(physeq, varName){ - if( is.null(sample_data(physeq, FALSE)) ){ - stop("Your phyloseq data object does not have a sample-data component\n", - "Try ?sample_data for more details.") - } - return( as(sample_data(physeq), "data.frame")[, varName] ) +#' get_variable(GlobalPatterns, 'SampleType') +get_variable <- function(physeq, varName) { + if (is.null(sample_data(physeq, FALSE))) { + stop("Your phyloseq data object does not have a sample-data component\n", + "Try ?sample_data for more details.") + } + return(as(sample_data(physeq), "data.frame")[, varName]) } -################################################################################ +################################################################################ diff --git a/R/as-methods.R b/R/as-methods.R index cb89511a..9d490553 100644 --- a/R/as-methods.R +++ b/R/as-methods.R @@ -1,34 +1,32 @@ -################################################################################ -# coercion methods -################################################################################ -setAs("phyloseq", "matrix", function(from){ - from@.Data +################################################################################ coercion methods +setAs("phyloseq", "matrix", function(from) { + from@.Data }) -setAs("phyloseq", "otu_table", function(from){ - otu_table(from) +setAs("phyloseq", "otu_table", function(from) { + otu_table(from) }) -setAs("phyloseq", "otu_table", function(from){ - otu_table(from) +setAs("phyloseq", "otu_table", function(from) { + otu_table(from) }) -################################################################################ -setAs("data.frame", "sample_data", function(from){ - new("sample_data", from) +################################################################################ +setAs("data.frame", "sample_data", function(from) { + new("sample_data", from) }) -setAs("sample_data", "data.frame", function(from){ - data.frame(from) +setAs("sample_data", "data.frame", function(from) { + data.frame(from) }) -setAs("phyloseq", "sample_data", function(from){ - sample_data(from) +setAs("phyloseq", "sample_data", function(from) { + sample_data(from) }) -################################################################################ -setAs("taxonomyTable", "matrix", function(from){ - from@.Data +################################################################################ +setAs("taxonomyTable", "matrix", function(from) { + from@.Data }) -setAs("phyloseq", "taxonomyTable", function(from){ - tax_table(from) +setAs("phyloseq", "taxonomyTable", function(from) { + tax_table(from) }) -################################################################################ -setAs("phyloseq", "phylo", function(from){ - phy_tree(from) +################################################################################ +setAs("phyloseq", "phylo", function(from) { + phy_tree(from) }) -################################################################################ +################################################################################ diff --git a/R/assignment-methods.R b/R/assignment-methods.R index aa82c99e..d435935c 100644 --- a/R/assignment-methods.R +++ b/R/assignment-methods.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Assign a new OTU Table to \code{x} #' #' @usage otu_table(x) <- value @@ -31,18 +31,20 @@ setGeneric("otu_table<-", function(x, value) standardGeneric("otu_table<-")) #' @rdname assign-otu_table #' @aliases otu_table<-,phyloseq,otu_table-method -setMethod("otu_table<-", c("phyloseq", "otu_table"), function(x, value){ - phyloseq(value, x@sam_data, x@tax_table, x@phy_tree, x@refseq) +setMethod("otu_table<-", c("phyloseq", "otu_table"), function(x, value) { + phyloseq(value, x@sam_data, x@tax_table, x@phy_tree, x@refseq) }) #' @rdname assign-otu_table #' @aliases otu_table<-,otu_table,otu_table-method -setMethod("otu_table<-", c("otu_table", "otu_table"), function(x, value){ value }) +setMethod("otu_table<-", c("otu_table", "otu_table"), function(x, value) { + value +}) #' @rdname assign-otu_table #' @aliases otu_table<-,phyloseq,phyloseq-method -setMethod("otu_table<-", c("phyloseq", "phyloseq"), function(x, value){ - phyloseq(otu_table(value), x@sam_data, x@tax_table, x@phy_tree, x@refseq) +setMethod("otu_table<-", c("phyloseq", "phyloseq"), function(x, value) { + phyloseq(otu_table(value), x@sam_data, x@tax_table, x@phy_tree, x@refseq) }) -################################################################################ +################################################################################ #' Manually change taxa_are_rows through assignment. #' #' The taxa_are_rows slot is a logical indicating the orientation of the @@ -65,22 +67,22 @@ setMethod("otu_table<-", c("phyloseq", "phyloseq"), function(x, value){ #' data(esophagus) #' taxa_are_rows(esophagus) #' taxa_are_rows(otu_table(esophagus)) -setGeneric("taxa_are_rows<-", function(x, value){ - standardGeneric("taxa_are_rows<-") +setGeneric("taxa_are_rows<-", function(x, value) { + standardGeneric("taxa_are_rows<-") }) #' @rdname assign-taxa_are_rows #' @aliases taxa_are_rows<-,otu_table,logical-method -setMethod("taxa_are_rows<-", c("otu_table", "logical"), function(x, value){ - x@taxa_are_rows <- value[1] - return(x) +setMethod("taxa_are_rows<-", c("otu_table", "logical"), function(x, value) { + x@taxa_are_rows <- value[1] + return(x) }) #' @rdname assign-taxa_are_rows #' @aliases taxa_are_rows<-,phyloseq,logical-method -setMethod("taxa_are_rows<-", c("phyloseq", "logical"), function(x, value){ - taxa_are_rows(otu_table(x)) <- value - return(x) +setMethod("taxa_are_rows<-", c("phyloseq", "logical"), function(x, value) { + taxa_are_rows(otu_table(x)) <- value + return(x) }) -################################################################################ +################################################################################ #' Assign (new) sample_data to \code{x} #' #' This replaces the current \code{sample_data} component of \code{x} with @@ -121,13 +123,13 @@ setMethod("taxa_are_rows<-", c("phyloseq", "logical"), function(x, value){ #' head(sample_data(soilrep)) #' sample_data(soilrep)$Time <- as.integer(substr(sample_data(soilrep)$Sample, 1, 1)) #' head(sample_data(soilrep)) -"sample_data<-" <- function(x, value){ - if( !inherits(value, "sample_data") ){ - value <- sample_data(value) - } - phyloseq(x@otu_table, value, x@tax_table, x@phy_tree, x@refseq) +"sample_data<-" <- function(x, value) { + if (!inherits(value, "sample_data")) { + value <- sample_data(value) + } + phyloseq(x@otu_table, value, x@tax_table, x@phy_tree, x@refseq) } -################################################################################ +################################################################################ #' Assign a (new) Taxonomy Table to \code{x} #' #' @usage tax_table(x) <- value @@ -157,31 +159,31 @@ setMethod("taxa_are_rows<-", c("phyloseq", "logical"), function(x, value){ #' # tax_table(ex2c) <- ex2b #' # identical(ex2a, ex2c) #' # ex2c <- phyloseq(otu_table(ex2b), sample_data(ex2b), phy_tree(ex2b)) -#' # tax_table(ex2c) <- as(tax_table(ex2b), "matrix") +#' # tax_table(ex2c) <- as(tax_table(ex2b), 'matrix') #' # identical(ex2a, ex2c) setGeneric("tax_table<-", function(x, value) standardGeneric("tax_table<-")) #' @rdname assign-tax_table #' @aliases tax_table<-,phyloseq,taxonomyTable-method -setMethod("tax_table<-", c("phyloseq", "taxonomyTable"), function(x, value){ - phyloseq(x@otu_table, x@sam_data, value, x@phy_tree, x@refseq) +setMethod("tax_table<-", c("phyloseq", "taxonomyTable"), function(x, value) { + phyloseq(x@otu_table, x@sam_data, value, x@phy_tree, x@refseq) }) #' @rdname assign-tax_table #' @aliases tax_table<-,phyloseq,ANY-method -setMethod("tax_table<-", c("phyloseq", "ANY"), function(x, value){ - phyloseq(x@otu_table, x@sam_data, tax_table(value, FALSE), x@phy_tree, x@refseq) +setMethod("tax_table<-", c("phyloseq", "ANY"), function(x, value) { + phyloseq(x@otu_table, x@sam_data, tax_table(value, FALSE), x@phy_tree, x@refseq) }) #' @rdname assign-tax_table #' @aliases tax_table<-,taxonomyTable,taxonomyTable-method -setMethod("tax_table<-", c("taxonomyTable", "taxonomyTable"), function(x, value){ - # Asign as-is. - value +setMethod("tax_table<-", c("taxonomyTable", "taxonomyTable"), function(x, value) { + # Asign as-is. + value }) #' @rdname assign-tax_table #' @aliases tax_table<-,taxonomyTable,ANY-method -setMethod("tax_table<-", c("taxonomyTable", "ANY"), function(x, value){ - tax_table(value, FALSE) +setMethod("tax_table<-", c("taxonomyTable", "ANY"), function(x, value) { + tax_table(value, FALSE) }) -################################################################################ +################################################################################ #' Assign a (new) phylogenetic tree to \code{x} #' #' @usage phy_tree(x) <- value @@ -193,7 +195,7 @@ setMethod("tax_table<-", c("taxonomyTable", "ANY"), function(x, value){ #' @rdname assign-phy_tree #' @aliases assign-phy_tree phy_tree<- #' @examples # -#' data("esophagus") +#' data('esophagus') #' # An example of pruning to just the first 20 taxa in esophagus #' ex2a <- prune_taxa(taxa_names(esophagus)[1:20], esophagus) #' # The following 3 lines produces an ex2b that is equal to ex2a @@ -203,21 +205,21 @@ setMethod("tax_table<-", c("taxonomyTable", "ANY"), function(x, value){ setGeneric("phy_tree<-", function(x, value) standardGeneric("phy_tree<-")) #' @rdname assign-phy_tree #' @aliases phy_tree<-,phyloseq,phylo-method -setMethod("phy_tree<-", c("phyloseq", "phylo"), function(x, value){ - phyloseq(x@otu_table, x@sam_data, x@tax_table, value, x@refseq) +setMethod("phy_tree<-", c("phyloseq", "phylo"), function(x, value) { + phyloseq(x@otu_table, x@sam_data, x@tax_table, value, x@refseq) }) #' @rdname assign-phy_tree #' @aliases phy_tree<-,phyloseq,phyloseq-method -setMethod("phy_tree<-", c("phyloseq", "phyloseq"), function(x, value){ - phyloseq(x@otu_table, x@sam_data, x@tax_table, phy_tree(value), x@refseq) +setMethod("phy_tree<-", c("phyloseq", "phyloseq"), function(x, value) { + phyloseq(x@otu_table, x@sam_data, x@tax_table, phy_tree(value), x@refseq) }) -################################################################################ +################################################################################ #' Replace OTU identifier names #' #' @usage taxa_names(x) <- value #' #' @param x (Required). An object defined by the \code{\link{phyloseq-package}} -#' that describes OTUs in some way. +#' \tthat describes OTUs in some way. #' @param value (Required). A character vector #' to replace the current \code{\link{taxa_names}}. #' @@ -227,43 +229,44 @@ setMethod("phy_tree<-", c("phyloseq", "phyloseq"), function(x, value){ #' @aliases assign-taxa_names taxa_names<- #' #' @examples -#' data("esophagus") +#' data('esophagus') #' taxa_names(esophagus) -#' # plot_tree(esophagus, label.tips="taxa_names", ladderize="left") -#' taxa_names(esophagus) <- paste("OTU-", taxa_names(esophagus), sep="") +#' # plot_tree(esophagus, label.tips='taxa_names', ladderize='left') +#' taxa_names(esophagus) <- paste('OTU-', taxa_names(esophagus), sep='') #' taxa_names(esophagus) -#' # plot_tree(esophagus, label.tips="taxa_names", ladderize="left") +#' # plot_tree(esophagus, label.tips='taxa_names', ladderize='left') #' ## non-characters are first coerced to characters. #' taxa_names(esophagus) <- 1:ntaxa(esophagus) #' taxa_names(esophagus) -#' # plot_tree(esophagus, label.tips="taxa_names", ladderize="left") +#' # plot_tree(esophagus, label.tips='taxa_names', ladderize='left') #' ## Cannot assign non-unique or differently-lengthed name vectors. Error. #' # taxa_names(esophagus) <- sample(c(TRUE, FALSE), ntaxa(esophagus), TRUE) #' # taxa_names(esophagus) <- sample(taxa_names(esophagus), ntaxa(esophagus)-5, FALSE) -setGeneric("taxa_names<-", function(x, value){ - if( anyDuplicated(value) ){ - stop("taxa_names<-: You are attempting to assign duplicated taxa_names") - } - standardGeneric("taxa_names<-") +setGeneric("taxa_names<-", function(x, value) { + if (anyDuplicated(value)) { + stop("taxa_names<-: You are attempting to assign duplicated taxa_names") + } + standardGeneric("taxa_names<-") }) -# Attempt to coerce value to a character vector. Remaining methods will require it. +# Attempt to coerce value to a character vector. Remaining methods will require +# it. #' @rdname assign-taxa_names #' @aliases taxa_names<-,ANY,ANY-method -setMethod("taxa_names<-", c("ANY", "ANY"), function(x, value){ +setMethod("taxa_names<-", c("ANY", "ANY"), function(x, value) { taxa_names(x) <- as(value, "character") return(x) }) -# value is now character, but no specific method for first argumet -# return x unchanged. +# value is now character, but no specific method for first argumet return x +# unchanged. #' @rdname assign-taxa_names #' @aliases taxa_names<-,ANY,character-method -setMethod("taxa_names<-", c("ANY", "character"), function(x, value){ +setMethod("taxa_names<-", c("ANY", "character"), function(x, value) { return(x) }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,otu_table,character-method -setMethod("taxa_names<-", c("otu_table", "character"), function(x, value){ - if( taxa_are_rows(x) ){ +setMethod("taxa_names<-", c("otu_table", "character"), function(x, value) { + if (taxa_are_rows(x)) { rownames(x) <- value } else { colnames(x) <- value @@ -272,40 +275,39 @@ setMethod("taxa_names<-", c("otu_table", "character"), function(x, value){ }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,taxonomyTable,character-method -setMethod("taxa_names<-", c("taxonomyTable", "character"), function(x, value){ +setMethod("taxa_names<-", c("taxonomyTable", "character"), function(x, value) { rownames(x) <- value return(x) }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,phylo,character-method -setMethod("taxa_names<-", c("phylo", "character"), function(x, value){ +setMethod("taxa_names<-", c("phylo", "character"), function(x, value) { x$tip.label <- value return(x) }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,XStringSet,character-method -setMethod("taxa_names<-", c("XStringSet", "character"), function(x, value){ +setMethod("taxa_names<-", c("XStringSet", "character"), function(x, value) { names(x) <- value return(x) }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,phyloseq,character-method -setMethod("taxa_names<-", c("phyloseq", "character"), function(x, value){ +setMethod("taxa_names<-", c("phyloseq", "character"), function(x, value) { # dispatch on components taxa_names(x@otu_table) <- value - taxa_names(x@phy_tree) <- value + taxa_names(x@phy_tree) <- value taxa_names(x@tax_table) <- value - taxa_names(x@refseq) <- value + taxa_names(x@refseq) <- value return(x) }) -################################################################################ -################################################################################ +################################################################################ #' Replace OTU identifier names #' #' @usage sample_names(x) <- value #' #' @param x (Required). An object defined by the \code{\link{phyloseq-package}} -#' that describes OTUs in some way. +#' \tthat describes OTUs in some way. #' @param value (Required). A character vector #' to replace the current \code{\link{sample_names}}. #' @@ -315,61 +317,62 @@ setMethod("taxa_names<-", c("phyloseq", "character"), function(x, value){ #' @aliases assign-sample_names sample_names<- #' #' @examples -#' data("esophagus") +#' data('esophagus') #' sample_names(esophagus) -#' # plot_tree(esophagus, color="sample_names", ladderize="left") -#' sample_names(esophagus) <- paste("Sa-", sample_names(esophagus), sep="") +#' # plot_tree(esophagus, color='sample_names', ladderize='left') +#' sample_names(esophagus) <- paste('Sa-', sample_names(esophagus), sep='') #' sample_names(esophagus) -#' # plot_tree(esophagus, color="sample_names", ladderize="left") +#' # plot_tree(esophagus, color='sample_names', ladderize='left') #' ## non-characters are first coerced to characters. #' sample_names(esophagus) <- 1:nsamples(esophagus) #' sample_names(esophagus) -#' # plot_tree(esophagus, color="sample_names", ladderize="left") +#' # plot_tree(esophagus, color='sample_names', ladderize='left') #' ## Cannot assign non-unique or differently-lengthed name vectors. Error. #' # sample_names(esophagus) <- sample(c(TRUE, FALSE), nsamples(esophagus), TRUE) #' # sample_names(esophagus) <- sample(sample_names(esophagus), nsamples(esophagus)-1, FALSE) -setGeneric("sample_names<-", function(x, value){ - if( anyDuplicated(value) ){ - stop("sample_names<-: You are attempting to assign duplicated sample_names") - } - standardGeneric("sample_names<-") +setGeneric("sample_names<-", function(x, value) { + if (anyDuplicated(value)) { + stop("sample_names<-: You are attempting to assign duplicated sample_names") + } + standardGeneric("sample_names<-") }) -# Attempt to coerce value to a character vector. Remaining methods will require it. +# Attempt to coerce value to a character vector. Remaining methods will require +# it. #' @rdname assign-sample_names #' @aliases sample_names<-,ANY,ANY-method -setMethod("sample_names<-", c("ANY", "ANY"), function(x, value){ - sample_names(x) <- as(value, "character") - return(x) +setMethod("sample_names<-", c("ANY", "ANY"), function(x, value) { + sample_names(x) <- as(value, "character") + return(x) }) -# value is now character, but no specific method for first argumet -# return x unchanged. +# value is now character, but no specific method for first argumet return x +# unchanged. #' @rdname assign-sample_names #' @aliases sample_names<-,ANY,character-method -setMethod("sample_names<-", c("ANY", "character"), function(x, value){ - return(x) +setMethod("sample_names<-", c("ANY", "character"), function(x, value) { + return(x) }) #' @rdname assign-sample_names #' @aliases sample_names<-,otu_table,character-method -setMethod("sample_names<-", c("otu_table", "character"), function(x, value){ - if( taxa_are_rows(x) ){ - colnames(x) <- value - } else { - rownames(x) <- value - } - return(x) +setMethod("sample_names<-", c("otu_table", "character"), function(x, value) { + if (taxa_are_rows(x)) { + colnames(x) <- value + } else { + rownames(x) <- value + } + return(x) }) #' @rdname assign-sample_names #' @aliases sample_names<-,sample_data,character-method -setMethod("sample_names<-", c("sample_data", "character"), function(x, value){ - rownames(x) <- value - return(x) +setMethod("sample_names<-", c("sample_data", "character"), function(x, value) { + rownames(x) <- value + return(x) }) #' @rdname assign-sample_names #' @aliases sample_names<-,phyloseq,character-method -setMethod("sample_names<-", c("phyloseq", "character"), function(x, value){ - # dispatch on components - sample_names(x@otu_table) <- value - sample_names(x@sam_data) <- value - return(x) +setMethod("sample_names<-", c("phyloseq", "character"), function(x, value) { + # dispatch on components + sample_names(x@otu_table) <- value + sample_names(x@sam_data) <- value + return(x) }) -################################################################################ \ No newline at end of file +################################################################################ diff --git a/R/deprecated_functions.R b/R/deprecated_functions.R index e4c62eb7..94f81132 100644 --- a/R/deprecated_functions.R +++ b/R/deprecated_functions.R @@ -1,7 +1,7 @@ -################################################################################ +################################################################################ #' Depcrecated functions in the phyloseq package. #' -#' These will be migrated to \code{"defunct"} status in the next release, +#' These will be migrated to \code{'defunct'} status in the next release, #' and removed completely in the release after that. #' These functions are provided for compatibility with older version of #' the phyloseq package. They may eventually be completely @@ -62,69 +62,173 @@ #' \code{taxTab<-} \tab now a synonym for \code{\link{tax_table<-}}\cr #' } #' -deprecated_phyloseq_function <- function(x, value, ...){return(NULL)} -plot_taxa_bar <- function(...){.Deprecated("plot_bar", package="phyloseq");return(plot_bar(...))} -taxaplot <- function(...){.Deprecated("plot_bar", package="phyloseq");return(plot_bar(...))} -taxtab <- function(...){.Deprecated("tax_table", package="phyloseq");return(tax_table(...))} -taxTab <- function(...){.Deprecated("tax_table", package="phyloseq");return(tax_table(...))} -sampleData <- function(...){.Deprecated("sample_data", package="phyloseq");return(sample_data(...))} -samData <- function(...){.Deprecated("sample_data", package="phyloseq");return(sample_data(...))} -sam_data <- function(...){.Deprecated("sample_data", package="phyloseq");return(sample_data(...))} -speciesSums <- function(...){.Deprecated("taxa_sums", package="phyloseq");return(taxa_sums(...))} -sampleSums <- function(...){.Deprecated("sample_sums", package="phyloseq");return(sample_sums(...))} -nspecies <- function(...){.Deprecated("ntaxa", package="phyloseq");return(ntaxa(...))} -species.names <- function(...){.Deprecated("taxa_names", package="phyloseq");return(taxa_names(...))} -sampleNames <- function(...){.Deprecated("sample_names", package="phyloseq");return(sample_names(...))} -sample.names <- function(...){.Deprecated("sample_names", package="phyloseq");return(sample_names(...))} -getSamples <- function(...){.Deprecated("get_sample", package="phyloseq");return(get_sample(...))} -getSpecies <- function(...){.Deprecated("get_taxa", package="phyloseq");return(get_taxa(...))} -rank.names <- function(...){.Deprecated("rank_names", package="phyloseq");return(rank_names(...))} -getTaxa <- function(...){.Deprecated("get_taxa_unique", package="phyloseq");return(get_taxa_unique(...))} -sample.variables <- function(...){.Deprecated("sample_variables", package="phyloseq");return(sample_variables(...))} -getVariable <- function(...){.Deprecated("get_variable", package="phyloseq");return(get_variable(...))} -merge_species <- function(...){.Deprecated("merge_taxa", package="phyloseq");return(merge_taxa(...))} -otuTable <- function(...){.Deprecated("otu_table", package="phyloseq");return(otu_table(...))} -speciesarerows <- function(...){.Deprecated("taxa_are_rows", package="phyloseq");return(taxa_are_rows(...))} -speciesAreRows <- function(...){.Deprecated("taxa_are_rows", package="phyloseq");return(taxa_are_rows(...))} -plot_richness_estimates <- function(...){.Deprecated("plot_richness", package="phyloseq");return(plot_richness(...))} -import_qiime_sampleData <- function(...){.Deprecated("import_qiime_sample_data", package="phyloseq");return(import_qiime_sample_data(...))} -filterfunSample <- function(...){.Deprecated("filterfun_sample", package="phyloseq");return(filterfun_sample(...))} -genefilterSample <- function(...){.Deprecated("genefilter_sample", package="phyloseq");return(genefilter_sample(...))} -prune_species <- function(...){.Deprecated("prune_taxa", package="phyloseq");return(prune_taxa(...))} -subset_species <- function(...){.Deprecated("subset_taxa", package="phyloseq");return(subset_taxa(...))} -tipglom <- function(...){.Deprecated("tip_glom", package="phyloseq");return(tip_glom(...))} -taxglom <- function(...){.Deprecated("tax_glom", package="phyloseq");return(tax_glom(...))} -tre <- function(...){.Deprecated("phy_tree", package="phyloseq");return(phy_tree(...))} -show_mothur_list_cutoffs <- function(...){.Deprecated("show_mothur_cutoffs", package="phyloseq");return(show_mothur_cutoffs(...))} -originalUniFrac <- function(...){.Deprecated("fastUniFrac", package="phyloseq");return(fastUniFrac(...))} -"sam_data<-" <- function(x, value){ - .Deprecated("sample_data<-", package="phyloseq") +deprecated_phyloseq_function <- function(x, value, ...) { + return(NULL) +} +plot_taxa_bar <- function(...) { + .Deprecated("plot_bar", package = "phyloseq") + return(plot_bar(...)) +} +taxaplot <- function(...) { + .Deprecated("plot_bar", package = "phyloseq") + return(plot_bar(...)) +} +taxtab <- function(...) { + .Deprecated("tax_table", package = "phyloseq") + return(tax_table(...)) +} +taxTab <- function(...) { + .Deprecated("tax_table", package = "phyloseq") + return(tax_table(...)) +} +sampleData <- function(...) { + .Deprecated("sample_data", package = "phyloseq") + return(sample_data(...)) +} +samData <- function(...) { + .Deprecated("sample_data", package = "phyloseq") + return(sample_data(...)) +} +sam_data <- function(...) { + .Deprecated("sample_data", package = "phyloseq") + return(sample_data(...)) +} +speciesSums <- function(...) { + .Deprecated("taxa_sums", package = "phyloseq") + return(taxa_sums(...)) +} +sampleSums <- function(...) { + .Deprecated("sample_sums", package = "phyloseq") + return(sample_sums(...)) +} +nspecies <- function(...) { + .Deprecated("ntaxa", package = "phyloseq") + return(ntaxa(...)) +} +species.names <- function(...) { + .Deprecated("taxa_names", package = "phyloseq") + return(taxa_names(...)) +} +sampleNames <- function(...) { + .Deprecated("sample_names", package = "phyloseq") + return(sample_names(...)) +} +sample.names <- function(...) { + .Deprecated("sample_names", package = "phyloseq") + return(sample_names(...)) +} +getSamples <- function(...) { + .Deprecated("get_sample", package = "phyloseq") + return(get_sample(...)) +} +getSpecies <- function(...) { + .Deprecated("get_taxa", package = "phyloseq") + return(get_taxa(...)) +} +rank.names <- function(...) { + .Deprecated("rank_names", package = "phyloseq") + return(rank_names(...)) +} +getTaxa <- function(...) { + .Deprecated("get_taxa_unique", package = "phyloseq") + return(get_taxa_unique(...)) +} +sample.variables <- function(...) { + .Deprecated("sample_variables", package = "phyloseq") + return(sample_variables(...)) +} +getVariable <- function(...) { + .Deprecated("get_variable", package = "phyloseq") + return(get_variable(...)) +} +merge_species <- function(...) { + .Deprecated("merge_taxa", package = "phyloseq") + return(merge_taxa(...)) +} +otuTable <- function(...) { + .Deprecated("otu_table", package = "phyloseq") + return(otu_table(...)) +} +speciesarerows <- function(...) { + .Deprecated("taxa_are_rows", package = "phyloseq") + return(taxa_are_rows(...)) +} +speciesAreRows <- function(...) { + .Deprecated("taxa_are_rows", package = "phyloseq") + return(taxa_are_rows(...)) +} +plot_richness_estimates <- function(...) { + .Deprecated("plot_richness", package = "phyloseq") + return(plot_richness(...)) +} +import_qiime_sampleData <- function(...) { + .Deprecated("import_qiime_sample_data", package = "phyloseq") + return(import_qiime_sample_data(...)) +} +filterfunSample <- function(...) { + .Deprecated("filterfun_sample", package = "phyloseq") + return(filterfun_sample(...)) +} +genefilterSample <- function(...) { + .Deprecated("genefilter_sample", package = "phyloseq") + return(genefilter_sample(...)) +} +prune_species <- function(...) { + .Deprecated("prune_taxa", package = "phyloseq") + return(prune_taxa(...)) +} +subset_species <- function(...) { + .Deprecated("subset_taxa", package = "phyloseq") + return(subset_taxa(...)) +} +tipglom <- function(...) { + .Deprecated("tip_glom", package = "phyloseq") + return(tip_glom(...)) +} +taxglom <- function(...) { + .Deprecated("tax_glom", package = "phyloseq") + return(tax_glom(...)) +} +tre <- function(...) { + .Deprecated("phy_tree", package = "phyloseq") + return(phy_tree(...)) +} +show_mothur_list_cutoffs <- function(...) { + .Deprecated("show_mothur_cutoffs", package = "phyloseq") + return(show_mothur_cutoffs(...)) +} +originalUniFrac <- function(...) { + .Deprecated("fastUniFrac", package = "phyloseq") + return(fastUniFrac(...)) +} +"sam_data<-" <- function(x, value) { + .Deprecated("sample_data<-", package = "phyloseq") sample_data(x) <- value return(x) } -"sampleData<-" <- function(x, value){ - .Deprecated("sample_data<-", package="phyloseq") +"sampleData<-" <- function(x, value) { + .Deprecated("sample_data<-", package = "phyloseq") sample_data(x) <- value return(x) } -"tre<-" <- function(x, value){ - .Deprecated("phy_tree<-", package="phyloseq") +"tre<-" <- function(x, value) { + .Deprecated("phy_tree<-", package = "phyloseq") phy_tree(x) <- value return(x) } -"speciesAreRows<-" <- function(x, value){ - .Deprecated("taxa_are_rows<-", package="phyloseq") +"speciesAreRows<-" <- function(x, value) { + .Deprecated("taxa_are_rows<-", package = "phyloseq") taxa_are_rows(x) <- value return(x) } -"otuTable<-" <- function(x, value){ - .Deprecated("otu_table<-", package="phyloseq") +"otuTable<-" <- function(x, value) { + .Deprecated("otu_table<-", package = "phyloseq") otu_table(x) <- value return(x) } -"taxTab<-" <- function(x, value){ - .Deprecated("tax_table<-", package="phyloseq") +"taxTab<-" <- function(x, value) { + .Deprecated("tax_table<-", package = "phyloseq") tax_table(x) <- value return(x) } -################################################################################ +################################################################################ diff --git a/R/distance-methods.R b/R/distance-methods.R index c373387d..a25aa0bc 100644 --- a/R/distance-methods.R +++ b/R/distance-methods.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' General distance / dissimilarity index calculator #' #' Takes a \code{\link{phyloseq-class}} object and method option, and returns @@ -8,7 +8,7 @@ #' user-provided arbitrary methods via an interface to #' \code{\link{designdist}}. For the complete list of currently #' supported options/arguments to the \code{method} parameter, -#' type \code{distance("list")} at the command-line. +#' type \code{distance('list')} at the command-line. #' Only #' sample-wise distances are currently supported (the \code{type} argument), #' but eventually species-wise (OTU-wise) @@ -31,11 +31,11 @@ #' \code{\link{phyloseq-class}} that contains both an \code{otu_table} #' and a phylogenetic tree (\code{phylo}). #' -#' @param method (Optional). A character string. Default is \code{"unifrac"}. +#' @param method (Optional). A character string. Default is \code{'unifrac'}. #' Provide one of the 45 currently supported options. #' To see a list of supported options, enter the following into the command line: #' -#' \code{distance("list")} +#' \code{distance('list')} #' #' For further details and additional arguments, #' see the documentation for the supprting functions, linked below @@ -45,14 +45,14 @@ #' by the \code{\link{phyloseq-package}}, and accessed by the following #' \code{method} options: #' -#' \code{"unifrac"}, for (unweighted) UniFrac distance, \code{\link{UniFrac}}; +#' \code{'unifrac'}, for (unweighted) UniFrac distance, \code{\link{UniFrac}}; #' -#' \code{"wunifrac"}, for weighted-UniFrac distance, \code{\link{UniFrac}}; +#' \code{'wunifrac'}, for weighted-UniFrac distance, \code{\link{UniFrac}}; #' -#' \code{"dpcoa"}, sample-wise distance from Double Principle +#' \code{'dpcoa'}, sample-wise distance from Double Principle #' Coordinate Analysis, \code{\link{DPCoA}}; #' -#' \code{"jsd"}, for Jensen-Shannon Divergence, \code{\link{JSD}}; +#' \code{'jsd'}, for Jensen-Shannon Divergence, \code{\link{JSD}}; #' #' and it is recommended that you see their documentation #' for details, references, background and examples for use. @@ -63,7 +63,7 @@ #' #' @param type (Optional). A character string. The type of pairwise comparisons #' being calculated: sample-wise or taxa-wise. The default is -#' \code{c("samples")}. +#' \code{c('samples')}. #' #' @param ... Additional arguments passed on to the appropriate distance #' function, determined by the \code{method} argument. @@ -88,126 +88,124 @@ #' @examples #' data(esophagus) #' distance(esophagus) # Unweighted UniFrac -#' distance(esophagus, "wunifrac") # weighted UniFrac -#' distance(esophagus, "jaccard") # vegdist jaccard -#' distance(esophagus, "gower") # vegdist option "gower" -#' distance(esophagus, "g") # designdist method option "g" -#' distance(esophagus, "minkowski") # invokes a method from the base dist() function. -#' distance(esophagus, "(A+B-2*J)/(A+B)") # designdist custom distance -#' distance("help") -#' distance("list") -#' help("distance") -distance <- function(physeq, method="unifrac", type="samples", ...){ +#' distance(esophagus, 'wunifrac') # weighted UniFrac +#' distance(esophagus, 'jaccard') # vegdist jaccard +#' distance(esophagus, 'gower') # vegdist option 'gower' +#' distance(esophagus, 'g') # designdist method option 'g' +#' distance(esophagus, 'minkowski') # invokes a method from the base dist() function. +#' distance(esophagus, '(A+B-2*J)/(A+B)') # designdist custom distance +#' distance('help') +#' distance('list') +#' help('distance') +distance <- function(physeq, method = "unifrac", type = "samples", ...) { # Only one method at a time. - if(length(method) > 1){ - stop("`distance` only accepts one method at a time. ", - "You provided ", length(method), " methods. ") + if (length(method) > 1) { + stop("`distance` only accepts one method at a time. ", "You provided ", length(method), + " methods. ") } - # # Can't do partial matching for all options, - # # because too many similar options. - # # Do partial matching for wunifrac/unifrac. - # # Determine if method argument matches any options exactly. - # # If not, call designdist - vegdist_methods <- c("manhattan", "euclidean", "canberra", "bray", - "kulczynski", "jaccard", "gower", "altGower", "morisita", "horn", - "mountford", "raup" , "binomial", "chao", "cao") - # Standard distance methods - dist_methods <- c("maximum", "binary", "minkowski") - # Only keep the ones that are NOT already in vegdist_methods - dist_methods <- dist_methods[!dist_methods %in% intersect(vegdist_methods, dist_methods)] - # The methods supported by vegan::betadiver function. - betadiver_methods <- c("w", "-1", "c", "wb", "r", "I", "e", "t", "me", "j", - "sor", "m", "-2", "co", "cc", "g", "-3", "l", "19", "hk", "rlb", - "sim", "gl", "z") - method.list <- list( - UniFrac = c("unifrac", "wunifrac"), - DPCoA = "dpcoa", - JSD = "jsd", - vegdist = vegdist_methods, - betadiver = betadiver_methods, - dist = dist_methods, - designdist = "ANY" - ) + # # Can't do partial matching for all options, # because too many similar + # options. # Do partial matching for wunifrac/unifrac. # Determine if method + # argument matches any options exactly. # If not, call designdist + vegdist_methods <- c("manhattan", "euclidean", "canberra", "bray", "kulczynski", + "jaccard", "gower", "altGower", "morisita", "horn", "mountford", "raup", + "binomial", "chao", "cao") + # Standard distance methods + dist_methods <- c("maximum", "binary", "minkowski") + # Only keep the ones that are NOT already in vegdist_methods + dist_methods <- dist_methods[!dist_methods %in% intersect(vegdist_methods, dist_methods)] + # The methods supported by vegan::betadiver function. + betadiver_methods <- c("w", "-1", "c", "wb", "r", "I", "e", "t", "me", "j", "sor", + "m", "-2", "co", "cc", "g", "-3", "l", "19", "hk", "rlb", "sim", "gl", "z") + method.list <- list(UniFrac = c("unifrac", "wunifrac"), DPCoA = "dpcoa", JSD = "jsd", + vegdist = vegdist_methods, betadiver = betadiver_methods, dist = dist_methods, + designdist = "ANY") # User support, and method options definition. - if(class(physeq) == "character"){ - if( physeq=="help" ){ - cat("Available arguments to methods:\n") - print(method.list) - cat("Please be exact, partial-matching not supported.\n") - cat("Can alternatively provide a custom distance.\n") - cat("See:\n help(\"distance\") \n") - return() - } - if( physeq=="list" ){ - return(c(method.list)) - } - } + if (class(physeq) == "character") { + if (physeq == "help") { + cat("Available arguments to methods:\n") + print(method.list) + cat("Please be exact, partial-matching not supported.\n") + cat("Can alternatively provide a custom distance.\n") + cat("See:\n help(\"distance\") \n") + return() + } + if (physeq == "list") { + return(c(method.list)) + } + } # Regular Expression detect/convert unifrac/weighted-UniFrac args method <- gsub("^(u.*)*unifrac$", "unifrac", method, ignore.case = TRUE) method <- gsub("^w.*unifrac$", "wunifrac", method, ignore.case = TRUE) - # Return distance, or define the function call to build/pass call - if( method == "unifrac" ){ return(UniFrac(physeq, ...)) } - if( method == "wunifrac" ){ return(UniFrac(physeq, weighted=TRUE, ...)) } - if( method == "jsd" ){ return(JSD(physeq, ...)) } - if( method == "dpcoa" ){ - # Remove diagnol entries from "dist" object returned in `RaoDis` slot. - return(as.dist(DPCoA(physeq, ...)$RaoDis, diag=FALSE)) - } else if( method %in% vegdist_methods ){ - dfun <- "vegdist" - } else if( method %in% betadiver_methods ){ - dfun <- "betadiver" - } else if( method %in% dist_methods ){ - dfun <- "dist" - } else { - dfun <- "designdist" - } - # get the extra arguments to pass to functions (this can be empty) - extrargs <- list(...) - # # non-phyloseq methods are assumed to be based on otu_table-only (for now) - # # If necessary (non phyloseq funs), enforce orientation, build function. - OTU <- otu_table(physeq) - # disambiguate type argument... Must be "species" for vegan integration... - # The following should all work: "OTUs", "OTU", "otus", "Taxas", "site" + # Return distance, or define the function call to build/pass call + if (method == "unifrac") { + return(UniFrac(physeq, ...)) + } + if (method == "wunifrac") { + return(UniFrac(physeq, weighted = TRUE, ...)) + } + if (method == "jsd") { + return(JSD(physeq, ...)) + } + if (method == "dpcoa") { + # Remove diagnol entries from 'dist' object returned in `RaoDis` slot. + return(as.dist(DPCoA(physeq, ...)$RaoDis, diag = FALSE)) + } else if (method %in% vegdist_methods) { + dfun <- "vegdist" + } else if (method %in% betadiver_methods) { + dfun <- "betadiver" + } else if (method %in% dist_methods) { + dfun <- "dist" + } else { + dfun <- "designdist" + } + # get the extra arguments to pass to functions (this can be empty) + extrargs <- list(...) + # # non-phyloseq methods are assumed to be based on otu_table-only (for now) # If + # necessary (non phyloseq funs), enforce orientation, build function. + OTU <- otu_table(physeq) + # disambiguate type argument... Must be 'species' for vegan integration... The + # following should all work: 'OTUs', 'OTU', 'otus', 'Taxas', 'site' type <- gsub("(OTU(s)?)|(taxa(s)?)|(Species)", "species", type, ignore.case = TRUE) - # The following should all work: "SaMplE", "Samples", "site", "sites" + # The following should all work: 'SaMplE', 'Samples', 'site', 'sites' type <- gsub("(Sample(s)?)|(site(s)?)", "samples", type, ignore.case = TRUE) - # Test type, and enforce orientation accordingly - if( type == "species"){ - # For species-distance, species need to be rows (vegan-style) - if( !taxa_are_rows(OTU) ){OTU <- t(OTU)} - } else if( type == "samples" ){ - # For sample-distance, samples need to be rows (vegan-style) - if( taxa_are_rows(OTU) ){OTU <- t(OTU)} - } else { - stop("type argument must be one of \n (1) samples \n or \n (2) species") - } - OTU <- as(OTU, "matrix") - fun.args <- c(list(OTU, method=method), extrargs) - return( do.call(dfun, fun.args) ) -} -################################################################################ -################################################################################ -# Shannon-Jensen Divergence, in R. -################################################################################ + # Test type, and enforce orientation accordingly + if (type == "species") { + # For species-distance, species need to be rows (vegan-style) + if (!taxa_are_rows(OTU)) { + OTU <- t(OTU) + } + } else if (type == "samples") { + # For sample-distance, samples need to be rows (vegan-style) + if (taxa_are_rows(OTU)) { + OTU <- t(OTU) + } + } else { + stop("type argument must be one of \n (1) samples \n or \n (2) species") + } + OTU <- as(OTU, "matrix") + fun.args <- c(list(OTU, method = method), extrargs) + return(do.call(dfun, fun.args)) +} +################################################################################ Shannon-Jensen Divergence, in R. #' @keywords internal -phyloseq_JSD_pair <- function(x, y){ - ###Function to compute Shannon-Jensen Divergence - ###x and y are the frequencies for the same p categories - u <- x/sum(x) - v <- y/sum(y) - m <- (u+v)/2 - if (all(u*v>0)){ - d <- (u*log(u/m)+v*log(v/m))/2 - } else { - P1 <- u*log(u/m) - P2 <- v*log(v/m) - P1[is.nan(P1)] <- 0 - P2[is.nan(P2)] <- 0 - d <- (P1+P2)/2 - } - return(sum(d)) +phyloseq_JSD_pair <- function(x, y) { + ### Function to compute Shannon-Jensen Divergence x and y are the frequencies for + ### the same p categories + u <- x/sum(x) + v <- y/sum(y) + m <- (u + v)/2 + if (all(u * v > 0)) { + d <- (u * log(u/m) + v * log(v/m))/2 + } else { + P1 <- u * log(u/m) + P2 <- v * log(v/m) + P1[is.nan(P1)] <- 0 + P2[is.nan(P2)] <- 0 + d <- (P1 + P2)/2 + } + return(sum(d)) } -################################################################################ +################################################################################ #' Calculate the Jensen-Shannon Divergence (distance) #' #' This is a phyloseq-specific implementation of the Jensen-Shannon Divergence @@ -215,7 +213,7 @@ phyloseq_JSD_pair <- function(x, y){ #' The expectation is that you have many samples (say. more than two) and you #' want a distance matrix on which will perform further analysis. \code{JSD} is #' intended to be ``wrapped'' by the more general \code{\link{distance}} -#' function in phyloseq, and it can be invoked using \code{"jsd"} as the +#' function in phyloseq, and it can be invoked using \code{'jsd'} as the #' argument to the \code{method} parameter of \code{\link{distance}}. #' #' One of the motivations for providing JSD in phyloseq was its recent use in @@ -260,42 +258,44 @@ phyloseq_JSD_pair <- function(x, y){ #' # registerDoParallel(cores=6) #' # data(enterotype) #' # # ent.jsd <- JSD(enterotype, TRUE) # internal only -#' # ent.jsd <- distance(enterotype, "jsd", parallel=TRUE) -#' # ent.PCoA <- ordinate(enterotype, "PCoA", ent.jsd) # Perform principle coordinate analysis -#' # p <- plot_ordination(enterotype, ent.PCoA, color="Enterotype", shape="SeqTech") +#' # ent.jsd <- distance(enterotype, 'jsd', parallel=TRUE) +#' # ent.PCoA <- ordinate(enterotype, 'PCoA', ent.jsd) # Perform principle coordinate analysis +#' # p <- plot_ordination(enterotype, ent.PCoA, color='Enterotype', shape='SeqTech') #' # (p <- p + geom_point(size=5, alpha=0.5)) -JSD <- function(physeq, parallel=FALSE){ - OTU <- otu_table(physeq) - ### Some parallel-foreach housekeeping. - # If user specifies not-parallel run (the default), register the sequential "back-end" - if( !parallel ){ registerDoSEQ() } - # create N x 2 matrix of all pairwise combinations of samples. - spn <- combn(sample_names(physeq), 2, simplify=FALSE) - # initialize DistMat with NAs - DistMat <- matrix(NA, nsamples(physeq), nsamples(physeq)) - # define the rows/cols of DistMat with the sample names (rownames) - rownames(DistMat) <- sample_names(physeq) - colnames(DistMat) <- sample_names(physeq) - ## Format coercion - # Coerce to the vegan orientation, with species as columns - if(taxa_are_rows(physeq)){ OTU <- t(OTU) } - # Coerce OTU to matrix for calculations. - OTU <- as(OTU, "matrix") - # optionally-parallel implementation with foreach - distlist <- foreach( i = spn, .packages="phyloseq") %dopar% { - A <- i[1] - B <- i[2] - return( phyloseq_JSD_pair(OTU[A, ], OTU[B, ]) ) - } - # return(distlist) - # This is in serial, but it is quick. - distlist2distmat <- function(i, spn, DL){ - DistMat[ spn[[i]][2], spn[[i]][1] ] <<- DL[[i]] - } - junk <- sapply(1:length(spn), distlist2distmat, spn, distlist) - return(as.dist(DistMat)) +JSD <- function(physeq, parallel = FALSE) { + OTU <- otu_table(physeq) + ### Some parallel-foreach housekeeping. If user specifies not-parallel run (the + ### default), register the sequential 'back-end' + if (!parallel) { + registerDoSEQ() + } + # create N x 2 matrix of all pairwise combinations of samples. + spn <- combn(sample_names(physeq), 2, simplify = FALSE) + # initialize DistMat with NAs + DistMat <- matrix(NA, nsamples(physeq), nsamples(physeq)) + # define the rows/cols of DistMat with the sample names (rownames) + rownames(DistMat) <- sample_names(physeq) + colnames(DistMat) <- sample_names(physeq) + ## Format coercion Coerce to the vegan orientation, with species as columns + if (taxa_are_rows(physeq)) { + OTU <- t(OTU) + } + # Coerce OTU to matrix for calculations. + OTU <- as(OTU, "matrix") + # optionally-parallel implementation with foreach + distlist <- foreach(i = spn, .packages = "phyloseq") %dopar% { + A <- i[1] + B <- i[2] + return(phyloseq_JSD_pair(OTU[A, ], OTU[B, ])) + } + # return(distlist) This is in serial, but it is quick. + distlist2distmat <- function(i, spn, DL) { + DistMat[spn[[i]][2], spn[[i]][1]] <<- DL[[i]] + } + junk <- sapply(1:length(spn), distlist2distmat, spn, distlist) + return(as.dist(DistMat)) } -############################################################################## +############################################################################## #' Calculate weighted or unweighted (Fast) UniFrac distance for all sample pairs. #' #' This function calculates the (Fast) UniFrac distance for all sample-pairs @@ -429,7 +429,7 @@ JSD <- function(physeq, parallel=FALSE){ #' ################################################################################ #' # Perform UniFrac on esophagus data #' ################################################################################ -#' data("esophagus") +#' data('esophagus') #' (y <- UniFrac(esophagus, TRUE)) #' UniFrac(esophagus, TRUE, FALSE) #' UniFrac(esophagus, FALSE) @@ -461,158 +461,165 @@ JSD <- function(physeq, parallel=FALSE){ #' # registerDoParallel(cores=3) #' # UniFrac(esophagus, TRUE) #' ################################################################################ -setGeneric("UniFrac", function(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE, fast=TRUE){ - standardGeneric("UniFrac") +setGeneric("UniFrac", function(physeq, weighted = FALSE, normalized = TRUE, parallel = FALSE, + fast = TRUE) { + standardGeneric("UniFrac") }) -################################################################################ +################################################################################ #' @aliases UniFrac,phyloseq-method #' @rdname UniFrac-methods #' @importFrom ape is.rooted #' @importFrom ape root -setMethod("UniFrac", "phyloseq", function(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE, fast=TRUE){ - if(is.null(phy_tree(physeq)$edge.length)){ +setMethod("UniFrac", "phyloseq", function(physeq, weighted = FALSE, normalized = TRUE, + parallel = FALSE, fast = TRUE) { + if (is.null(phy_tree(physeq)$edge.length)) { stop("Tree has no branch lengths. See tree$edge.length. Cannot compute UniFrac without branch lengths") - } + } # Check if tree is rooted, set random root with warning if it is not. - if( !is.rooted(phy_tree(physeq)) ){ + if (!is.rooted(phy_tree(physeq))) { randoroot = sample(taxa_names(physeq), 1) warning("Randomly assigning root as -- ", randoroot, " -- in the phylogenetic tree in the data you provided.") - phy_tree(physeq) <- root(phy=phy_tree(physeq), outgroup=randoroot, resolve.root=TRUE, interactive=FALSE) - if( !is.rooted(phy_tree(physeq)) ){ + phy_tree(physeq) <- root(phy = phy_tree(physeq), outgroup = randoroot, resolve.root = TRUE, + interactive = FALSE) + if (!is.rooted(phy_tree(physeq))) { stop("Problem automatically rooting tree. Make sure your tree is rooted before attempting UniFrac calculation. See ?ape::root") } - } - if( fast ){ - fastUniFrac(physeq, weighted, normalized, parallel) - } else { + } + if (fast) { + fastUniFrac(physeq, weighted, normalized, parallel) + } else { warning("Option `fast=FALSE` is deprecated. Only 'fast' UniFrac is supported in phyloseq.") - fastUniFrac(physeq, weighted, normalized, parallel) - } + fastUniFrac(physeq, weighted, normalized, parallel) + } }) -################################################################################ -# Fast UniFrac for R. -# Adapted from The ISME Journal (2010) 4, 17-27; doi:10.1038/ismej.2009.97; -# http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html -################################################################################ +################################################################################ Fast UniFrac for R. Adapted from The ISME Journal (2010) 4, 17-27; +################################################################################ doi:10.1038/ismej.2009.97; +################################################################################ http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html #' @importFrom ape prop.part #' @importFrom ape reorder.phylo #' @keywords internal #' @import foreach -fastUniFrac <- function(physeq, weighted=FALSE, normalized=TRUE, parallel=FALSE){ - # Access the needed components. Note, will error if missing in physeq. - OTU <- otu_table(physeq) - tree <- phy_tree(physeq) - # Some important checks. - if( is.null(tree$edge.length) ) { - stop("Tree has no branch lengths, cannot compute UniFrac") - } - if( !is.rooted(tree) ) { - stop("Rooted phylogeny required for UniFrac calculation") - } - ### Some parallel-foreach housekeeping. - # If user specifies not-parallel run (the default), register the sequential "back-end" - if( !parallel ){ registerDoSEQ() } - # create N x 2 matrix of all pairwise combinations of samples. - spn <- combn(sample_names(physeq), 2, simplify=FALSE) - # Make sure OTU is in species-are-rows orientation - if( !taxa_are_rows(physeq) ){OTU <- t(OTU)} +fastUniFrac <- function(physeq, weighted = FALSE, normalized = TRUE, parallel = FALSE) { + # Access the needed components. Note, will error if missing in physeq. + OTU <- otu_table(physeq) + tree <- phy_tree(physeq) + # Some important checks. + if (is.null(tree$edge.length)) { + stop("Tree has no branch lengths, cannot compute UniFrac") + } + if (!is.rooted(tree)) { + stop("Rooted phylogeny required for UniFrac calculation") + } + ### Some parallel-foreach housekeeping. If user specifies not-parallel run (the + ### default), register the sequential 'back-end' + if (!parallel) { + registerDoSEQ() + } + # create N x 2 matrix of all pairwise combinations of samples. + spn <- combn(sample_names(physeq), 2, simplify = FALSE) + # Make sure OTU is in species-are-rows orientation + if (!taxa_are_rows(physeq)) { + OTU <- t(OTU) + } # Convert to standard matrix - OTU <- as(OTU, "matrix") - # Enforce that tree and otu_table indices are the same order, - # by re-ordering OTU, if needed - if( !all(rownames(OTU) == taxa_names(tree)) ){ - OTU <- OTU[taxa_names(tree), ] - } - ######################################## - # Build the requisite matrices as defined - # in the Fast UniFrac article. - ######################################## - ## This only needs to happen once in a call to UniFrac. - ## Notice that A and B do not appear in this section. - # Begin by building the edge descendants matrix (edge-by-sample) - # `edge_array` - # - # Create a list of descendants, starting from the first internal node (root) - descList <- prop.part(tree, check.labels = FALSE) - # Add the terminal edge descendants (tips). By definition, can only have one descendant - descList <- c(as.list(1:length(tree$tip.label)), descList) - # Convert `descList` to `edge_array` that matches the order of things in `tree$edge` - edge_array <- matrix(0, nrow=nrow(tree$edge), ncol=nsamples(physeq), - dimnames=list(NULL, sample_names=sample_names(physeq))) - for(i in 1:nrow(tree$edge)){ + OTU <- as(OTU, "matrix") + # Enforce that tree and otu_table indices are the same order, by re-ordering OTU, + # if needed + if (!all(rownames(OTU) == taxa_names(tree))) { + OTU <- OTU[taxa_names(tree), ] + } + ######################################## Build the requisite matrices as defined in the Fast UniFrac article. This only + ######################################## needs to happen once in a call to UniFrac. Notice that A and B do not appear + ######################################## in this section. Begin by building the edge descendants matrix + ######################################## (edge-by-sample) `edge_array` Create a list of descendants, starting from the + ######################################## first internal node (root) + descList <- prop.part(tree, check.labels = FALSE) + # Add the terminal edge descendants (tips). By definition, can only have one + # descendant + descList <- c(as.list(1:length(tree$tip.label)), descList) + # Convert `descList` to `edge_array` that matches the order of things in + # `tree$edge` + edge_array <- matrix(0, nrow = nrow(tree$edge), ncol = nsamples(physeq), dimnames = list(NULL, + sample_names = sample_names(physeq))) + for (i in 1:nrow(tree$edge)) { # For each entry in the tree$edge table, sum the descendants for each sample # `tree$edge[i, 2]` is the node ID. - edge_array[i, ] <- colSums(OTU[descList[[tree$edge[i, 2]]], , drop=FALSE], na.rm = TRUE) - } + edge_array[i, ] <- colSums(OTU[descList[[tree$edge[i, 2]]], , drop = FALSE], + na.rm = TRUE) + } # Remove unneeded variables. `descList` in particular could be large-ish. rm(descList) - # If unweighted-UniFrac, coerce to a presence-absence contingency, occ - if(!weighted){ - # For unweighted UniFrac, convert the edge_array to an occurrence (presence/absence binary) array - edge_occ <- (edge_array > 0) - 0 - } - if( weighted & normalized ){ - # This is only relevant to weighted-UniFrac. - # For denominator in the normalized distance, we need the age of each tip. - # 'z' is the tree in postorder order used in calls to .C - # Descending order of left-hand side of edge (the ancestor to the node) - z = reorder.phylo(tree, order="postorder") - # Call phyloseq-internal function that in-turn calls ape's internal - # horizontal position function, in C, using the re-ordered phylo object, `z` - tipAges = ape_node_depth_edge_length(Ntip = length(tree$tip.label), - Nnode = tree$Nnode, - edge = z$edge, - Nedge = nrow(tree$edge)[1], - edge.length = z$edge.length) - # Keep only the tips, and add the tip labels in case `z` order differs from `tree` - tipAges <- tipAges[1:length(tree$tip.label)] - names(tipAges) <- z$tip.label + # If unweighted-UniFrac, coerce to a presence-absence contingency, occ + if (!weighted) { + # For unweighted UniFrac, convert the edge_array to an occurrence + # (presence/absence binary) array + edge_occ <- (edge_array > 0) - 0 + } + if (weighted & normalized) { + # This is only relevant to weighted-UniFrac. For denominator in the normalized + # distance, we need the age of each tip. 'z' is the tree in postorder order used + # in calls to .C Descending order of left-hand side of edge (the ancestor to the + # node) + z = reorder.phylo(tree, order = "postorder") + # Call phyloseq-internal function that in-turn calls ape's internal horizontal + # position function, in C, using the re-ordered phylo object, `z` + tipAges = ape_node_depth_edge_length(Ntip = length(tree$tip.label), Nnode = tree$Nnode, + edge = z$edge, Nedge = nrow(tree$edge)[1], edge.length = z$edge.length) + # Keep only the tips, and add the tip labels in case `z` order differs from + # `tree` + tipAges <- tipAges[1:length(tree$tip.label)] + names(tipAges) <- z$tip.label # Explicitly re-order tipAges to match OTU - tipAges <- tipAges[rownames(OTU)] - } - ######################################## - # optionally-parallel implementation with foreach - ######################################## - samplesums = sample_sums(physeq) - distlist <- foreach( i = spn, .packages="phyloseq") %dopar% { - A <- i[1] - B <- i[2] - AT <- samplesums[A] - BT <- samplesums[B] - if( weighted ){ + tipAges <- tipAges[rownames(OTU)] + } + ######################################## optionally-parallel implementation with foreach + samplesums = sample_sums(physeq) + distlist <- foreach(i = spn, .packages = "phyloseq") %dopar% { + A <- i[1] + B <- i[2] + AT <- samplesums[A] + BT <- samplesums[B] + if (weighted) { # weighted UniFrac - wUF_branchweight <- abs(edge_array[, A]/AT - edge_array[, B]/BT) - # calculate the w-UF numerator - numerator <- sum({tree$edge.length * wUF_branchweight}, na.rm = TRUE) - # if not-normalized weighted UniFrac, just return "numerator"; - # the u-value in the w-UniFrac description - if(!normalized){ - return(numerator) - } else { - # denominator (assumes tree-indices and otu_table indices are same order) - denominator <- sum({tipAges * (OTU[, A]/AT + OTU[, B]/BT)}, na.rm = TRUE) - # return the normalized weighted UniFrac values - return(numerator / denominator) - } - } else { - # Unweighted UniFrac - # Subset matrix to just columns A and B - edge_occ_AB <- edge_occ[, c(A, B)] + wUF_branchweight <- abs(edge_array[, A]/AT - edge_array[, B]/BT) + # calculate the w-UF numerator + numerator <- sum({ + tree$edge.length * wUF_branchweight + }, na.rm = TRUE) + # if not-normalized weighted UniFrac, just return 'numerator'; the u-value in the + # w-UniFrac description + if (!normalized) { + return(numerator) + } else { + # denominator (assumes tree-indices and otu_table indices are same order) + denominator <- sum({ + tipAges * (OTU[, A]/AT + OTU[, B]/BT) + }, na.rm = TRUE) + # return the normalized weighted UniFrac values + return(numerator/denominator) + } + } else { + # Unweighted UniFrac Subset matrix to just columns A and B + edge_occ_AB <- edge_occ[, c(A, B)] # Keep only the unique branches. Sum the lengths - edge_uni_AB_sum <- sum((tree$edge.length * edge_occ_AB)[rowSums(edge_occ_AB, na.rm=TRUE) < 2, ], na.rm=TRUE) - # Normalize this sum to the total branches among these two samples, A and B - uwUFpairdist <- edge_uni_AB_sum / sum(tree$edge.length[rowSums(edge_occ_AB, na.rm=TRUE) > 0]) - return(uwUFpairdist) - } - } - # Initialize UniFracMat with NAs - UniFracMat <- matrix(NA_real_, nsamples(physeq), nsamples(physeq)) - rownames(UniFracMat) <- colnames(UniFracMat) <- sample_names(physeq) + edge_uni_AB_sum <- sum((tree$edge.length * edge_occ_AB)[rowSums(edge_occ_AB, + na.rm = TRUE) < 2, ], na.rm = TRUE) + # Normalize this sum to the total branches among these two samples, A and B + uwUFpairdist <- edge_uni_AB_sum/sum(tree$edge.length[rowSums(edge_occ_AB, + na.rm = TRUE) > 0]) + return(uwUFpairdist) + } + } + # Initialize UniFracMat with NAs + UniFracMat <- matrix(NA_real_, nsamples(physeq), nsamples(physeq)) + rownames(UniFracMat) <- colnames(UniFracMat) <- sample_names(physeq) # Matrix-assign lower-triangle of UniFracMat. Then coerce to dist and return. - matIndices <- do.call(rbind, spn)[, 2:1] - # Take care of edge case where there are two samples -> 1 pair of indices -> rbind doesn't return a matrix - if(!is.matrix(matIndices)) matIndices <- matrix(matIndices, ncol=2) - UniFracMat[matIndices] <- unlist(distlist) - return(as.dist(UniFracMat)) + matIndices <- do.call(rbind, spn)[, 2:1] + # Take care of edge case where there are two samples -> 1 pair of indices -> + # rbind doesn't return a matrix + if (!is.matrix(matIndices)) + matIndices <- matrix(matIndices, ncol = 2) + UniFracMat[matIndices] <- unlist(distlist) + return(as.dist(UniFracMat)) } -################################################################################ +################################################################################ diff --git a/R/extend_DESeq2.R b/R/extend_DESeq2.R index 2132fa7b..532dad84 100644 --- a/R/extend_DESeq2.R +++ b/R/extend_DESeq2.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Convert phyloseq data to DESeq2 dds object #' #' No testing is performed by this function. The phyloseq data is converted @@ -23,7 +23,7 @@ #' reference sample class in tests by setting it to the first of the factor levels #' using the \code{\link{relevel}} function: #' -#' \code{sample_data(entill)$Enterotype <- relevel(sample_data(entill)$Enterotype, "1")} +#' \code{sample_data(entill)$Enterotype <- relevel(sample_data(entill)$Enterotype, '1')} #' #' @param ... (Optional). Additional named arguments passed to \code{\link[DESeq2]{DESeqDataSetFromMatrix}}. #' Most users will not need to pass any additional arguments here. @@ -34,7 +34,7 @@ #' #' @seealso #' -#' \code{vignette("phyloseq-mixture-models")} +#' \code{vignette('phyloseq-mixture-models')} #' #' The #' \href{http://joey711.github.io/phyloseq-extensions}{phyloseq-extensions} @@ -50,21 +50,23 @@ #' #' @examples #' # Check out the vignette phyloseq-mixture-models for more details. -#' # vignette("phyloseq-mixture-models") +#' # vignette('phyloseq-mixture-models') #' data(soilrep) #' phyloseq_to_deseq2(soilrep, ~warmed) -phyloseq_to_deseq2 = function(physeq, design, ...){ +phyloseq_to_deseq2 = function(physeq, design, ...) { # Need to add check here for missing sample_data - if( is.null(sample_data(physeq, FALSE)) ){ + if (is.null(sample_data(physeq, FALSE))) { stop("There must be sample_data present, for specifying experimental design. See ?phyloseq_to_deseq2") } # Enforce orientation. Samples are columns - if( !taxa_are_rows(physeq) ){ physeq <- t(physeq)} + if (!taxa_are_rows(physeq)) { + physeq <- t(physeq) + } # Coerce count data to vanilla matrix of integers - countData = round(as(otu_table(physeq), "matrix"), digits=0) + countData = round(as(otu_table(physeq), "matrix"), digits = 0) colData = data.frame(sample_data(physeq)) # Create the DESeq data set, dds. dds <- DESeqDataSetFromMatrix(countData, colData, design, ...) return(dds) } -################################################################################ +################################################################################ diff --git a/R/extend_vegan.R b/R/extend_vegan.R index 2da390ba..01f43444 100644 --- a/R/extend_vegan.R +++ b/R/extend_vegan.R @@ -1,52 +1,44 @@ -################################################################################ -# Define S3 methods for scores (originally defined by vegan-package) -# to work for other ordination results -# vegan:::scores.default -################################################################################ -# pcoa-class, from pcoa{ape} +################################################################################ Define S3 methods for scores (originally defined by vegan-package) to work for +################################################################################ other ordination results vegan:::scores.default pcoa-class, from pcoa{ape} #' @importFrom vegan wascores #' @importFrom vegan scores #' @keywords internal -scores.pcoa <- function(x, choices=NULL, display="sites", physeq=NULL, ...){ - if(is.null(choices)){ - choices <- colnames(x$vectors) - } +scores.pcoa <- function(x, choices = NULL, display = "sites", physeq = NULL, ...) { + if (is.null(choices)) { + choices <- colnames(x$vectors) + } co = list(sites = x$vectors[, choices]) - if( "species" %in% display ){ - if(is.null(otu_table(physeq, errorIfNULL = FALSE))){ - warning("scores.pcoa: Failed to access OTU table from `physeq` argument, \n - needed for weighted average of OTU/taxa/species points in MDS/PCoA.") + if ("species" %in% display) { + if (is.null(otu_table(physeq, errorIfNULL = FALSE))) { + warning("scores.pcoa: Failed to access OTU table from `physeq` argument, \n\n needed for weighted average of OTU/taxa/species points in MDS/PCoA.") } else { - # MDS/PCoA only provides coordinates of the elements in the - # distance matrix, usually sites/samples, so species (etc.) - # This means we need to use the weighted-average as there is - # no corresponding axes from the ordination directly. - co$species <- wascores(x$vectors[, choices], w = veganifyOTU(physeq)) + # MDS/PCoA only provides coordinates of the elements in the distance matrix, + # usually sites/samples, so species (etc.) This means we need to use the + # weighted-average as there is no corresponding axes from the ordination + # directly. + co$species <- wascores(x$vectors[, choices], w = veganifyOTU(physeq)) } - } + } co <- co[display] - if(length(co) < 2L){ + if (length(co) < 2L) { # Unlist co <- co[[display]] } return(co) } -################################################################################ -# DPCoA management -################################################################################ +################################################################################ DPCoA management #' @importFrom vegan scores #' @keywords internal -get_dpcoa_species_coords = function(x, physeq=NULL){ +get_dpcoa_species_coords = function(x, physeq = NULL) { # Grab coordinates from the dpcoa object coords = x$dls - # ade4 mangles the element names using `make.names` conventions in base R - # Replace them in `coords` - if(is.null(taxa_names(physeq))){ - warning("scores.dpcoa: Failed to access `taxa_names` from `physeq` argument, \n - needed to ensure correct mapping of OTU/taxa/species points in DPCoA.") + # ade4 mangles the element names using `make.names` conventions in base R Replace + # them in `coords` + if (is.null(taxa_names(physeq))) { + warning("scores.dpcoa: Failed to access `taxa_names` from `physeq` argument, \n\n needed to ensure correct mapping of OTU/taxa/species points in DPCoA.") } else { - # if the names are available, use them - # by mapping the same variable-name conversion that ade4 would have used. + # if the names are available, use them by mapping the same variable-name + # conversion that ade4 would have used. taxnames = taxa_names(physeq) names(taxnames) <- make.names(taxnames) rownames(coords) <- taxnames[rownames(coords)] @@ -55,17 +47,16 @@ get_dpcoa_species_coords = function(x, physeq=NULL){ } #' @importFrom vegan scores #' @keywords internal -get_dpcoa_sites_coords = function(x, physeq=NULL){ +get_dpcoa_sites_coords = function(x, physeq = NULL) { # Grab coordinates from the dpcoa object coords = x$li - # ade4 mangles the element names using `make.names` conventions in base R - # Replace them in `coords` - if(is.null(sample_names(physeq))){ - warning("scores.dpcoa: Failed to access `sample_names` from `physeq` argument, \n - needed to ensure correct mapping of site/sample/library points in DPCoA.") + # ade4 mangles the element names using `make.names` conventions in base R Replace + # them in `coords` + if (is.null(sample_names(physeq))) { + warning("scores.dpcoa: Failed to access `sample_names` from `physeq` argument, \n\n needed to ensure correct mapping of site/sample/library points in DPCoA.") } else { - # if the names are available, use them - # by mapping the same variable-name conversion that ade4 would have used. + # if the names are available, use them by mapping the same variable-name + # conversion that ade4 would have used. samplenames = sample_names(physeq) names(samplenames) <- make.names(samplenames) rownames(coords) <- samplenames[rownames(coords)] @@ -75,64 +66,49 @@ get_dpcoa_sites_coords = function(x, physeq=NULL){ # dpcoa-class, from ade4 #' @importFrom vegan scores #' @keywords internal -scores.dpcoa <- function(x, choices=NULL, display="sites", physeq=NULL, ...){ - # x = ordination - # display = "species" +scores.dpcoa <- function(x, choices = NULL, display = "sites", physeq = NULL, ...) { + # x = ordination display = 'species' coords = NULL - # `display` must be either "sites" or "species", per vegan-package convention. - coords <- switch(EXPR = display, - species = get_dpcoa_species_coords(x, physeq), - sites = get_dpcoa_sites_coords(x, physeq)) + # `display` must be either 'sites' or 'species', per vegan-package convention. + coords <- switch(EXPR = display, species = get_dpcoa_species_coords(x, physeq), + sites = get_dpcoa_sites_coords(x, physeq)) # If no choices selection, take all dimensions/columns - if(is.null(choices)){ - choices <- 1:ncol(coords) + if (is.null(choices)) { + choices <- 1:ncol(coords) } - return( coords[, choices, drop=FALSE] ) + return(coords[, choices, drop = FALSE]) } -################################################################################ -# Extend vegdist for phyloseq classes -################################################################################ -# \code{\link[vegan]{vegdist}} wrapper for phyloseq classes -# -# Trivially-extended S4 method from the \code{\link[vegan]{vegdist}} function, -# such that S4 classes from the \code{\link{phyloseq-package}} are properly -# handled / accessed. All parameters passed on to \code{\link[vegan]{vegdist}} -# verbatim. -# -# @seealso \code{\link[vegan]{vegdist}} -# @rdname vegdist-methods -# @docType methods -# @aliases vegdist -# -# @examples -# data(esophagus) -# vegdist(esophagus, "jaccard") +################################################################################ Extend vegdist for phyloseq classes \code{\link[vegan]{vegdist}} wrapper for +################################################################################ phyloseq classes Trivially-extended S4 method from the +################################################################################ \code{\link[vegan]{vegdist}} function, such that S4 classes from the +################################################################################ \code{\link{phyloseq-package}} are properly handled / accessed. All +################################################################################ parameters passed on to \code{\link[vegan]{vegdist}} verbatim. @seealso +################################################################################ \code{\link[vegan]{vegdist}} @rdname vegdist-methods @docType methods +################################################################################ @aliases vegdist @examples data(esophagus) vegdist(esophagus, 'jaccard') #' @importFrom vegan vegdist #' @keywords internal setGeneric("vegdist") -################################################################################ -# @aliases vegdist,otu_table-method -# @rdname vegdist-methods +################################################################################ @aliases vegdist,otu_table-method @rdname vegdist-methods #' @importFrom vegan vegdist -setMethod("vegdist", "otu_table", function(x, method = "bray", binary = FALSE, - diag = FALSE, upper = FALSE, na.rm = FALSE, ...){ - # Make sure in sample-by-species orientation - if( taxa_are_rows(x) ){x <- t(x)} - # Convert to simple matrix - x <- as(x, "matrix") - # pass to standard method (compiled C) - vegdist(x, method, binary, diag, upper, na.rm, ...) +setMethod("vegdist", "otu_table", function(x, method = "bray", binary = FALSE, diag = FALSE, + upper = FALSE, na.rm = FALSE, ...) { + # Make sure in sample-by-species orientation + if (taxa_are_rows(x)) { + x <- t(x) + } + # Convert to simple matrix + x <- as(x, "matrix") + # pass to standard method (compiled C) + vegdist(x, method, binary, diag, upper, na.rm, ...) }) -################################################################################ -# @aliases vegdist,phyloseq-method -# @rdname vegdist-methods -setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE, - diag = FALSE, upper = FALSE, na.rm = FALSE, ...){ - # Simply access the otu_table - x <- otu_table(x) - vegdist(x, method, binary, diag, upper, na.rm, ...) +################################################################################ @aliases vegdist,phyloseq-method @rdname vegdist-methods +setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE, diag = FALSE, + upper = FALSE, na.rm = FALSE, ...) { + # Simply access the otu_table + x <- otu_table(x) + vegdist(x, method, binary, diag, upper, na.rm, ...) }) -################################################################################ +################################################################################ #' Summarize alpha diversity #' #' Performs a number of standard alpha diversity estimates, @@ -160,7 +136,7 @@ setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE, #' Alternatively, you can specify one or more measures #' as a character vector of measure names. #' Values must be among those supported: -#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}. +#' \code{c('Observed', 'Chao1', 'ACE', 'Shannon', 'Simpson', 'InvSimpson', 'Fisher')}. #' #' @return A \code{data.frame} of the richness estimates, and their standard error. #' @@ -183,92 +159,91 @@ setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE, #' @examples #' ## There are many more interesting examples at the phyloseq online tutorials. #' ## http://joey711.github.com/phyloseq/plot_richness-examples -#' data("esophagus") +#' data('esophagus') #' # Default is all available measures #' estimate_richness(esophagus) #' # Specify just one: -#' estimate_richness(esophagus, measures="Observed") +#' estimate_richness(esophagus, measures='Observed') #' # Specify a few: -#' estimate_richness(esophagus, measures=c("Observed", "InvSimpson", "Shannon", "Chao1")) -estimate_richness <- function(physeq, split=TRUE, measures=NULL){ - - if( !any(otu_table(physeq)==1) ){ - # Check for singletons, and then warning if they are missing. - # These metrics only really meaningful if singletons are included. - warning( - "The data you have provided does not have\n", - "any singletons. This is highly suspicious. Results of richness\n", - "estimates (for example) are probably unreliable, or wrong, if you have already\n", - "trimmed low-abundance taxa from the data.\n", - "\n", - "We recommended that you find the un-trimmed data and retry." - ) - } - - # If we are not splitting sample-wise, sum the species. Else, enforce orientation. - if( !split ){ - OTU <- taxa_sums(physeq) - } else if( split ){ - OTU <- as(otu_table(physeq), "matrix") - if( taxa_are_rows(physeq) ){ OTU <- t(OTU) } - } - - # Define renaming vector: - renamevec = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher") - names(renamevec) <- c("S.obs", "S.chao1", "S.ACE", "shannon", "simpson", "invsimpson", "fisher") - # If measures was not explicitly provided (is NULL), set to all supported methods - if( is.null(measures) ){ - measures = as.character(renamevec) - } +#' estimate_richness(esophagus, measures=c('Observed', 'InvSimpson', 'Shannon', 'Chao1')) +estimate_richness <- function(physeq, split = TRUE, measures = NULL) { + + if (!any(otu_table(physeq) == 1)) { + # Check for singletons, and then warning if they are missing. These metrics only + # really meaningful if singletons are included. + warning("The data you have provided does not have\n", "any singletons. This is highly suspicious. Results of richness\n", + "estimates (for example) are probably unreliable, or wrong, if you have already\n", + "trimmed low-abundance taxa from the data.\n", "\n", "We recommended that you find the un-trimmed data and retry.") + } + + # If we are not splitting sample-wise, sum the species. Else, enforce + # orientation. + if (!split) { + OTU <- taxa_sums(physeq) + } else if (split) { + OTU <- as(otu_table(physeq), "matrix") + if (taxa_are_rows(physeq)) { + OTU <- t(OTU) + } + } + + # Define renaming vector: + renamevec = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", + "Fisher") + names(renamevec) <- c("S.obs", "S.chao1", "S.ACE", "shannon", "simpson", "invsimpson", + "fisher") + # If measures was not explicitly provided (is NULL), set to all supported methods + if (is.null(measures)) { + measures = as.character(renamevec) + } # Rename measures if they are in the old-style - if( any(measures %in% names(renamevec)) ){ - measures[measures %in% names(renamevec)] <- renamevec[names(renamevec) %in% measures] + if (any(measures %in% names(renamevec))) { + measures[measures %in% names(renamevec)] <- renamevec[names(renamevec) %in% + measures] } # Stop with error if no measures are supported - if( !any(measures %in% renamevec) ){ + if (!any(measures %in% renamevec)) { stop("None of the `measures` you provided are supported. Try default `NULL` instead.") } # Initialize to NULL outlist = vector("list") - # Some standard diversity indices + # Some standard diversity indices estimRmeas = c("Chao1", "Observed", "ACE") - if( any(estimRmeas %in% measures) ){ + if (any(estimRmeas %in% measures)) { outlist <- c(outlist, list(t(data.frame(estimateR(OTU))))) - } - if( "Shannon" %in% measures ){ - outlist <- c(outlist, list(shannon = diversity(OTU, index="shannon"))) - } - if( "Simpson" %in% measures ){ - outlist <- c(outlist, list(simpson = diversity(OTU, index="simpson"))) - } - if( "InvSimpson" %in% measures ){ - outlist <- c(outlist, list(invsimpson = diversity(OTU, index="invsimpson"))) - } - if( "Fisher" %in% measures ){ - fisher = tryCatch(fisher.alpha(OTU, se=TRUE), - warning=function(w){ - warning("phyloseq::estimate_richness: Warning in fisher.alpha(). See `?fisher.fit` or ?`fisher.alpha`. Treat fisher results with caution") - suppressWarnings(fisher.alpha(OTU, se=TRUE)[, c("alpha", "se")]) - } - ) - if(!is.null(dim(fisher))){ + } + if ("Shannon" %in% measures) { + outlist <- c(outlist, list(shannon = diversity(OTU, index = "shannon"))) + } + if ("Simpson" %in% measures) { + outlist <- c(outlist, list(simpson = diversity(OTU, index = "simpson"))) + } + if ("InvSimpson" %in% measures) { + outlist <- c(outlist, list(invsimpson = diversity(OTU, index = "invsimpson"))) + } + if ("Fisher" %in% measures) { + fisher = tryCatch(fisher.alpha(OTU, se = TRUE), warning = function(w) { + warning("phyloseq::estimate_richness: Warning in fisher.alpha(). See `?fisher.fit` or ?`fisher.alpha`. Treat fisher results with caution") + suppressWarnings(fisher.alpha(OTU, se = TRUE)[, c("alpha", "se")]) + }) + if (!is.null(dim(fisher))) { colnames(fisher)[1:2] <- c("Fisher", "se.fisher") outlist <- c(outlist, list(fisher)) } else { - outlist <- c(outlist, Fisher=list(fisher)) + outlist <- c(outlist, Fisher = list(fisher)) } - } + } out = do.call("cbind", outlist) # Rename columns per renamevec namechange = intersect(colnames(out), names(renamevec)) colnames(out)[colnames(out) %in% namechange] <- renamevec[namechange] - # Final prune to just those columns related to "measures". Use grep. - colkeep = sapply(paste0("(se\\.){0,}", measures), grep, colnames(out), ignore.case=TRUE) - out = out[, sort(unique(unlist(colkeep))), drop=FALSE] + # Final prune to just those columns related to 'measures'. Use grep. + colkeep = sapply(paste0("(se\\.){0,}", measures), grep, colnames(out), ignore.case = TRUE) + out = out[, sort(unique(unlist(colkeep))), drop = FALSE] # Make sure that you return a data.frame for reliable performance. out <- as.data.frame(out) - return(out) + return(out) } -################################################################################ +################################################################################ diff --git a/R/extract-methods.R b/R/extract-methods.R index e334d752..b46c294a 100644 --- a/R/extract-methods.R +++ b/R/extract-methods.R @@ -1,7 +1,5 @@ -################################################################################ -# subsetting functions -# Without these, the default coerces to the base object (e.g. matrix or data.frame) -################################################################################ +################################################################################ subsetting functions Without these, the default coerces to the base object +################################################################################ (e.g. matrix or data.frame) #' Method extensions to extraction operator for phyloseq objects. #' #' See the documentation for the \code{\link[base]{Extract}} generic, @@ -31,44 +29,40 @@ #' data(esophagus) #' nrow(otu_table(esophagus)) #' nrow(otu_table(esophagus)[1:5, ]) -setMethod("[", "otu_table", function(x, i, j, ...){ - newx <- as(x, "matrix")[i, j, drop=FALSE] - otu_table(newx, taxa_are_rows(x) ) +setMethod("[", "otu_table", function(x, i, j, ...) { + newx <- as(x, "matrix")[i, j, drop = FALSE] + otu_table(newx, taxa_are_rows(x)) }) # extract parts of sample_data -# #' @export #' @rdname extract-methods -setMethod("[", "sample_data", function(x, i, j, ...){ - sample_data( data.frame(x)[i, j, drop=FALSE] ) +setMethod("[", "sample_data", function(x, i, j, ...) { + sample_data(data.frame(x)[i, j, drop = FALSE]) }) # extract parts of taxonomyTable -# #' @export #' @rdname extract-methods -setMethod("[", "taxonomyTable", function(x, i, j, ...){ +setMethod("[", "taxonomyTable", function(x, i, j, ...) { # Coerce to matrix, apply std extraction, reconstruct. - return( tax_table(as(x, "matrix")[i, j, drop=FALSE]) ) + return(tax_table(as(x, "matrix")[i, j, drop = FALSE])) }) -# A numeric extraction method is already defined in Biostrings for XStringSet -# Add name-character-based extraction method for XStringSet -# +# A numeric extraction method is already defined in Biostrings for XStringSet Add +# name-character-based extraction method for XStringSet #' @importClassesFrom Biostrings XStringSet #' @export #' @rdname extract-methods -setMethod("[", c("XStringSet", "character"), function(x, i){ - index_vector = match(i, names(x), nomatch=NA_integer_) - index_vector = index_vector[!is.na(index_vector)] - if( length(index_vector) <= 0 ){ - warning("[,XStringSet: no valid seq-indices provided, NULL returned") - return(NULL) - } - if( length(index_vector) < length(i) ){ - warning("[,XStringSet: some seq-name indices invalid, omitted.") - } - # index_vector is an integer, subsetting now dispatches to standard - x = x[index_vector] - return(x) +setMethod("[", c("XStringSet", "character"), function(x, i) { + index_vector = match(i, names(x), nomatch = NA_integer_) + index_vector = index_vector[!is.na(index_vector)] + if (length(index_vector) <= 0) { + warning("[,XStringSet: no valid seq-indices provided, NULL returned") + return(NULL) + } + if (length(index_vector) < length(i)) { + warning("[,XStringSet: some seq-name indices invalid, omitted.") + } + # index_vector is an integer, subsetting now dispatches to standard + x = x[index_vector] + return(x) }) -################################################################################ -################################################################################ +################################################################################ diff --git a/R/merge-methods.R b/R/merge-methods.R index 4f4ce200..2841a63c 100644 --- a/R/merge-methods.R +++ b/R/merge-methods.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Merge arguments into one phyloseq object. #' #' Takes a comma-separated list of phyloseq objects as arguments, @@ -47,9 +47,9 @@ #' @examples # #' ## # Make a random complex object #' ## OTU1 <- otu_table(matrix(sample(0:5,250,TRUE),25,10), taxa_are_rows=TRUE) -#' ## tax1 <- tax_table(matrix("abc", 30, 8)) +#' ## tax1 <- tax_table(matrix('abc', 30, 8)) #' ## map1 <- data.frame( matrix(sample(0:3,250,TRUE),25,10), -#' ## matrix(sample(c("a","b","c"),150,TRUE), 25, 6) ) +#' ## matrix(sample(c('a','b','c'),150,TRUE), 25, 6) ) #' ## map1 <- sample_data(map1) #' ## exam1 <- phyloseq(OTU1, map1, tax1) #' ## x <- exam1 @@ -57,38 +57,39 @@ #' ## y <- tax_table(exam1) #' ## merge_phyloseq(x, y) #' ## merge_phyloseq(y, y, y, y) -merge_phyloseq <- function(...){ - arguments <- list(...) - # create list of all components of all objects - comp.list <- list() - for( i in 1:length(arguments) ){ - comp.list <- c(comp.list, splat.phyloseq.objects(arguments[[i]])) - } - # loop through each component type. Note, list names redundant. will use this - merged.list <- list() - for( i in unique(names(comp.list)) ){ #i="tax_table" - # check if length 1, if so, cat to merged.list. - i.list <- comp.list[names(comp.list)==i] - if( length(i.list) == 1 ){ - merged.list <- c(merged.list, i.list) - } else { - # else, loop through each identically-named objects. - x1 <- i.list[[1]] - for( j in 2:length(i.list)){ - x1 <- merge_phyloseq_pair(x1, i.list[[j]]) - } - x1 <- list(x1) - names(x1) <- i - merged.list <- c(merged.list, x1) - } - } - # Remove names to avoid any conflicts with phyloseq(), which does not need named-arguments - names(merged.list) <- NULL - - # Use do.call for calling this variable-length, variable-content argument list. - return( do.call(phyloseq, merged.list) ) +merge_phyloseq <- function(...) { + arguments <- list(...) + # create list of all components of all objects + comp.list <- list() + for (i in 1:length(arguments)) { + comp.list <- c(comp.list, splat.phyloseq.objects(arguments[[i]])) + } + # loop through each component type. Note, list names redundant. will use this + merged.list <- list() + for (i in unique(names(comp.list))) { + # i='tax_table' check if length 1, if so, cat to merged.list. + i.list <- comp.list[names(comp.list) == i] + if (length(i.list) == 1) { + merged.list <- c(merged.list, i.list) + } else { + # else, loop through each identically-named objects. + x1 <- i.list[[1]] + for (j in 2:length(i.list)) { + x1 <- merge_phyloseq_pair(x1, i.list[[j]]) + } + x1 <- list(x1) + names(x1) <- i + merged.list <- c(merged.list, x1) + } + } + # Remove names to avoid any conflicts with phyloseq(), which does not need + # named-arguments + names(merged.list) <- NULL + + # Use do.call for calling this variable-length, variable-content argument list. + return(do.call(phyloseq, merged.list)) } -################################################################################ +################################################################################ #' Merge pair of phyloseq component data objects of the same class. #' #' Internal S4 methods to combine pairs of objects of classes specified in the @@ -131,129 +132,138 @@ merge_phyloseq <- function(...){ #' ## xy <- merge_phyloseq_pair(x, y) #' ## yx <- merge_phyloseq_pair(y, x) #' ## # merge two simulated tax_table objects -#' ## x <- tax_table(matrix("abc", 20, 6)) -#' ## y <- tax_table(matrix("def", 30, 8)) +#' ## x <- tax_table(matrix('abc', 20, 6)) +#' ## y <- tax_table(matrix('def', 30, 8)) #' ## xy <- merge_phyloseq_pair(x, y) #' ## # merge two simulated sample_data objects #' ## x <- data.frame( matrix(sample(0:3,250,TRUE),25,10), -#' ## matrix(sample(c("a","b","c"),150,TRUE),25,6) ) +#' ## matrix(sample(c('a','b','c'),150,TRUE),25,6) ) #' ## x <- sample_data(x) #' ## y <- data.frame( matrix(sample(4:6,200,TRUE),20,10), -#' ## matrix(sample(c("d","e","f"),120,TRUE),20,8) ) +#' ## matrix(sample(c('d','e','f'),120,TRUE),20,8) ) #' ## y <- sample_data(y) #' ## merge_phyloseq_pair(x, y) #' ## data.frame(merge_phyloseq_pair(x, y)) #' ## data.frame(merge_phyloseq_pair(y, x)) setGeneric("merge_phyloseq_pair", function(x, y) standardGeneric("merge_phyloseq_pair")) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,otu_table,otu_table-method #' @rdname merge_phyloseq_pair-methods -setMethod("merge_phyloseq_pair", signature("otu_table", "otu_table"), function(x, y){ - specRrowsx <- taxa_are_rows(x) - new.sp.names <- union(taxa_names(x), taxa_names(y)) - new.sa.names <- union(sample_names(x), sample_names(y)) - - # Create the empty new matrix structure - newx <- matrix(0, nrow=length(new.sp.names), ncol=length(new.sa.names), - dimnames=list(new.sp.names, new.sa.names)) - - # assign a standard taxa_are_rows orientation to TRUE for x and y - if( !taxa_are_rows(x) ){ x <- t(x) } - if( !taxa_are_rows(y) ){ y <- t(y) } - - # "merge" by addition. - newx[rownames(x), colnames(x)] <- x - newx[rownames(y), colnames(y)] <- newx[rownames(y), colnames(y)] + y - - # Create the new otu_table object - newx <- otu_table(newx, taxa_are_rows=TRUE) - - # Return the orientation that was in x - if( !specRrowsx ){ newx <- t(newx) } - return(newx) +setMethod("merge_phyloseq_pair", signature("otu_table", "otu_table"), function(x, + y) { + specRrowsx <- taxa_are_rows(x) + new.sp.names <- union(taxa_names(x), taxa_names(y)) + new.sa.names <- union(sample_names(x), sample_names(y)) + + # Create the empty new matrix structure + newx <- matrix(0, nrow = length(new.sp.names), ncol = length(new.sa.names), dimnames = list(new.sp.names, + new.sa.names)) + + # assign a standard taxa_are_rows orientation to TRUE for x and y + if (!taxa_are_rows(x)) { + x <- t(x) + } + if (!taxa_are_rows(y)) { + y <- t(y) + } + + # 'merge' by addition. + newx[rownames(x), colnames(x)] <- x + newx[rownames(y), colnames(y)] <- newx[rownames(y), colnames(y)] + y + + # Create the new otu_table object + newx <- otu_table(newx, taxa_are_rows = TRUE) + + # Return the orientation that was in x + if (!specRrowsx) { + newx <- t(newx) + } + return(newx) }) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,taxonomyTable,taxonomyTable-method #' @rdname merge_phyloseq_pair-methods -setMethod("merge_phyloseq_pair", signature("taxonomyTable", "taxonomyTable"), function(x, y){ - new.sp.names <- union(rownames(x), rownames(y)) - new.ta.names <- union(colnames(x), colnames(y)) - - # Create the empty new matrix structure - newx <- matrix(NA, nrow=length(new.sp.names), ncol=length(new.ta.names), - dimnames=list(new.sp.names, new.ta.names)) - - # "merge". Overwrite with x information. - newx[rownames(y), colnames(y)] <- y - newx[rownames(x), colnames(x)] <- x - - # Create the new otu_table object - newx <- tax_table(newx) - - return(newx) +setMethod("merge_phyloseq_pair", signature("taxonomyTable", "taxonomyTable"), function(x, + y) { + new.sp.names <- union(rownames(x), rownames(y)) + new.ta.names <- union(colnames(x), colnames(y)) + + # Create the empty new matrix structure + newx <- matrix(NA, nrow = length(new.sp.names), ncol = length(new.ta.names), + dimnames = list(new.sp.names, new.ta.names)) + + # 'merge'. Overwrite with x information. + newx[rownames(y), colnames(y)] <- y + newx[rownames(x), colnames(x)] <- x + + # Create the new otu_table object + newx <- tax_table(newx) + + return(newx) }) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,sample_data,sample_data-method #' @rdname merge_phyloseq_pair-methods -setMethod("merge_phyloseq_pair", signature("sample_data", "sample_data"), function(x, y){ - new.sa.names <- union(rownames(x), rownames(y)) - new.va.names <- union(colnames(x), colnames(y)) - - partx <- data.frame("X0"=rownames(x), x) - party <- data.frame("X0"=rownames(y), y) - newx <- merge(partx, party, all=TRUE) - # now we have the correct template, lets remove redundant rows. - keep.samp.rows <- sapply(unique(as.character(newx[,1])), function(i,nx){ - rownames(subset(nx, X0==i))[1] - },newx) - newx <- newx[keep.samp.rows,] - rownames(newx) <- as.character(newx$"X0") - - # "merge". Overwrite with x information. - newx[rownames(y), colnames(y)] <- data.frame(y) - newx[rownames(x), colnames(x)] <- data.frame(x) - - # trim the sample name column - newx <- newx[,names(newx)!="X0"] - - # Create the new otu_table object - newx <- sample_data(newx) - return(newx) +setMethod("merge_phyloseq_pair", signature("sample_data", "sample_data"), function(x, + y) { + new.sa.names <- union(rownames(x), rownames(y)) + new.va.names <- union(colnames(x), colnames(y)) + + partx <- data.frame(X0 = rownames(x), x) + party <- data.frame(X0 = rownames(y), y) + newx <- merge(partx, party, all = TRUE) + # now we have the correct template, lets remove redundant rows. + keep.samp.rows <- sapply(unique(as.character(newx[, 1])), function(i, nx) { + rownames(subset(nx, X0 == i))[1] + }, newx) + newx <- newx[keep.samp.rows, ] + rownames(newx) <- as.character(newx$X0) + + # 'merge'. Overwrite with x information. + newx[rownames(y), colnames(y)] <- data.frame(y) + newx[rownames(x), colnames(x)] <- data.frame(x) + + # trim the sample name column + newx <- newx[, names(newx) != "X0"] + + # Create the new otu_table object + newx <- sample_data(newx) + return(newx) }) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,phylo,phylo-method #' @rdname merge_phyloseq_pair-methods #' @importFrom ape consensus -setMethod("merge_phyloseq_pair", signature("phylo", "phylo"), function(x, y){ - if(identical(x, y)){ - return(x) - } else { - return( consensus(x, y) ) - } +setMethod("merge_phyloseq_pair", signature("phylo", "phylo"), function(x, y) { + if (identical(x, y)) { + return(x) + } else { + return(consensus(x, y)) + } }) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,XStringSet,XStringSet-method #' @rdname merge_phyloseq_pair-methods -setMethod("merge_phyloseq_pair", signature("XStringSet", "XStringSet"), function(x, y){ - if( class(x) != class(y) ){ - # if class of x and y don't match, throw warning, try anyway (just in case) - warning("For merging reference sequence objects, x and y should be same type.\n", - "That is, the same subclass of XStringSet. e.g. both DNAStringSet.\n", - "Try coercing each to the same compatible class prior to merge.") - } - # Add to x the stuff that is in y, but not in x - add_y_taxa = setdiff(taxa_names(y), taxa_names(x)) - if( length(add_y_taxa) < 1L ){ - # If there is nothing from y to add, just return x as-is - return(x) - } else { - # Else, add unique stuff from y only to x (they are both lists!) - x = c(x, y[add_y_taxa]) - return(x) - } +setMethod("merge_phyloseq_pair", signature("XStringSet", "XStringSet"), function(x, + y) { + if (class(x) != class(y)) { + # if class of x and y don't match, throw warning, try anyway (just in case) + warning("For merging reference sequence objects, x and y should be same type.\n", + "That is, the same subclass of XStringSet. e.g. both DNAStringSet.\n", + "Try coercing each to the same compatible class prior to merge.") + } + # Add to x the stuff that is in y, but not in x + add_y_taxa = setdiff(taxa_names(y), taxa_names(x)) + if (length(add_y_taxa) < 1L) { + # If there is nothing from y to add, just return x as-is + return(x) + } else { + # Else, add unique stuff from y only to x (they are both lists!) + x = c(x, y[add_y_taxa]) + return(x) + } }) -################################################################################ -################################################################################ +################################################################################ #' Merge a subset of the species in \code{x} into one species/taxa/OTU. #' #' Takes as input an object that describes species/taxa @@ -302,163 +312,169 @@ setMethod("merge_phyloseq_pair", signature("XStringSet", "XStringSet"), function #' otutree0 <- phyloseq(otu, tree) #' # plot_tree(otutree0) #' otutree1 <- merge_taxa(otutree0, 1:8, 2) -#' # plot_tree(esophagus, ladderize="left") -setGeneric("merge_taxa", function(x, eqtaxa, archetype=1L) standardGeneric("merge_taxa")) -################################################################################ +#' # plot_tree(esophagus, ladderize='left') +setGeneric("merge_taxa", function(x, eqtaxa, archetype = 1L) standardGeneric("merge_taxa")) +################################################################################ #' @keywords internal -merge_taxa.indices.internal = function(x, eqtaxa, archetype){ - ## If eqtaxa or archetype are character, interpret them to be OTUs and coerce them to integer indices - if( is.character(archetype) ){ - # If archetype is already an OTU, just assign it to keepIndex - keepIndex = which(taxa_names(x) %in% archetype[1L]) - } else { - # Else archetype is the numeric index of the eqtaxa that should be kept. - # Need to grab from unmodifed eqtaxa, and then decide - archetype = eqtaxa[as.integer(archetype[1L])] - if( is.character(archetype) ){ - # If archetype is now an OTU name, find the index and assign to keepIndex - keepIndex = which(taxa_names(x) == archetype[1L]) - } else { - # Otherwise, assume it is a taxa index, and assign to keepIndex - keepIndex = as.integer(archetype) - } - } - # Ensure eqtaxa is the integer indices of the taxa that are being merged together - if( is.character(eqtaxa) ){ - # assume OTU name, index it against the OTU names in x - eqtaxa = which(taxa_names(x) %in% eqtaxa) - } else { - # Else assume numeric index of the OTU that are being merged - eqtaxa = as.integer(eqtaxa) - } - # keepIndex is index of the OTU that is kept / everything merged into. - # It must be among the set of indices in eqtaxa or there is a logic error. Stop. - if( length(keepIndex) <= 0L ){ stop("invalid archetype provided.") } - if( !keepIndex %in% eqtaxa ){ stop("invalid archetype provided. It is not part of eqtaxa.") } - # removeIndex is the index of each OTU that will be removed - removeIndex = setdiff(eqtaxa, keepIndex) - # Check that indices are valid - allIndices = unlist(list(keepIndex, removeIndex)) - if( any(allIndices > ntaxa(x) | allIndices < 0L) ){ - stop("invalid OTU indices provided as eqtaxa or archetype.") - } - return(list(removeIndex=removeIndex, keepIndex=keepIndex)) +merge_taxa.indices.internal = function(x, eqtaxa, archetype) { + ## If eqtaxa or archetype are character, interpret them to be OTUs and coerce them + ## to integer indices + if (is.character(archetype)) { + # If archetype is already an OTU, just assign it to keepIndex + keepIndex = which(taxa_names(x) %in% archetype[1L]) + } else { + # Else archetype is the numeric index of the eqtaxa that should be kept. Need to + # grab from unmodifed eqtaxa, and then decide + archetype = eqtaxa[as.integer(archetype[1L])] + if (is.character(archetype)) { + # If archetype is now an OTU name, find the index and assign to keepIndex + keepIndex = which(taxa_names(x) == archetype[1L]) + } else { + # Otherwise, assume it is a taxa index, and assign to keepIndex + keepIndex = as.integer(archetype) + } + } + # Ensure eqtaxa is the integer indices of the taxa that are being merged together + if (is.character(eqtaxa)) { + # assume OTU name, index it against the OTU names in x + eqtaxa = which(taxa_names(x) %in% eqtaxa) + } else { + # Else assume numeric index of the OTU that are being merged + eqtaxa = as.integer(eqtaxa) + } + # keepIndex is index of the OTU that is kept / everything merged into. It must + # be among the set of indices in eqtaxa or there is a logic error. Stop. + if (length(keepIndex) <= 0L) { + stop("invalid archetype provided.") + } + if (!keepIndex %in% eqtaxa) { + stop("invalid archetype provided. It is not part of eqtaxa.") + } + # removeIndex is the index of each OTU that will be removed + removeIndex = setdiff(eqtaxa, keepIndex) + # Check that indices are valid + allIndices = unlist(list(keepIndex, removeIndex)) + if (any(allIndices > ntaxa(x) | allIndices < 0L)) { + stop("invalid OTU indices provided as eqtaxa or archetype.") + } + return(list(removeIndex = removeIndex, keepIndex = keepIndex)) } -################################################################################ +################################################################################ #' @aliases merge_taxa,phyloseq-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "phyloseq", function(x, eqtaxa, - archetype=eqtaxa[which.max(taxa_sums(x)[eqtaxa])]){ +setMethod("merge_taxa", "phyloseq", function(x, eqtaxa, archetype = eqtaxa[which.max(taxa_sums(x)[eqtaxa])]) { - comp_list <- splat.phyloseq.objects(x) - merged_list <- lapply(comp_list, merge_taxa, eqtaxa, archetype) - # the element names can wreak havoc on do.call - names(merged_list) <- NULL - # Re-instantiate the combined object using the species-merged object. - do.call("phyloseq", merged_list) + comp_list <- splat.phyloseq.objects(x) + merged_list <- lapply(comp_list, merge_taxa, eqtaxa, archetype) + # the element names can wreak havoc on do.call + names(merged_list) <- NULL + # Re-instantiate the combined object using the species-merged object. + do.call("phyloseq", merged_list) }) -############################################################################### -# Don't need to merge anything for sample_data. Return As-is. +############################################################################### Don't need to merge anything for sample_data. Return As-is. #' @aliases merge_taxa,sample_data-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "sample_data", function(x, eqtaxa, archetype=1L){ - return(x) +setMethod("merge_taxa", "sample_data", function(x, eqtaxa, archetype = 1L) { + return(x) }) -############################################################################### +############################################################################### #' @aliases merge_taxa,otu_table-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "otu_table", function(x, eqtaxa, - archetype=eqtaxa[which.max(taxa_sums(x)[eqtaxa])]){ +setMethod("merge_taxa", "otu_table", function(x, eqtaxa, archetype = eqtaxa[which.max(taxa_sums(x)[eqtaxa])]) { - if( length(eqtaxa) < 2 ){ - return(x) - } - indList = merge_taxa.indices.internal(x, eqtaxa, archetype) - removeIndex = indList$removeIndex - keepIndex = indList$keepIndex - # Merge taxa by summing all the equivalent taxa and assigning to the one in keepIndex - if( taxa_are_rows(x) ){ - x[keepIndex, ] = colSums(x[eqtaxa, ]) - } else { - x[, keepIndex] = rowSums(x[, eqtaxa]) - } - # For speed, use matrix subsetting instead of prune_taxa() - if (taxa_are_rows(x)) { - x = x[-removeIndex, , drop = FALSE] - } else { - x = x[, -removeIndex, drop = FALSE] - } - return(x) + if (length(eqtaxa) < 2) { + return(x) + } + indList = merge_taxa.indices.internal(x, eqtaxa, archetype) + removeIndex = indList$removeIndex + keepIndex = indList$keepIndex + # Merge taxa by summing all the equivalent taxa and assigning to the one in + # keepIndex + if (taxa_are_rows(x)) { + x[keepIndex, ] = colSums(x[eqtaxa, ]) + } else { + x[, keepIndex] = rowSums(x[, eqtaxa]) + } + # For speed, use matrix subsetting instead of prune_taxa() + if (taxa_are_rows(x)) { + x = x[-removeIndex, , drop = FALSE] + } else { + x = x[, -removeIndex, drop = FALSE] + } + return(x) }) -############################################################################### +############################################################################### #' @importFrom ape drop.tip #' @aliases merge_taxa,phylo-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "phylo", function(x, eqtaxa, archetype=1L){ - # If there is nothing to merge, return x as-is - if( length(eqtaxa) < 2 ){ - return(x) - } - indList = merge_taxa.indices.internal(x, eqtaxa, archetype) - removeIndex = indList$removeIndex - # If there is too much to merge (tree would have 1 or 0 branches), return NULL/warning - if( length(removeIndex) >= (ntaxa(x)-1) ){ - # Can't have a tree with 1 or fewer tips - warning("merge_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.") - return(NULL) - # Else, drop the removeIndex tips and returns the pruned tree. - } else { - return( drop.tip(x, removeIndex) ) - } +setMethod("merge_taxa", "phylo", function(x, eqtaxa, archetype = 1L) { + # If there is nothing to merge, return x as-is + if (length(eqtaxa) < 2) { + return(x) + } + indList = merge_taxa.indices.internal(x, eqtaxa, archetype) + removeIndex = indList$removeIndex + # If there is too much to merge (tree would have 1 or 0 branches), return + # NULL/warning + if (length(removeIndex) >= (ntaxa(x) - 1)) { + # Can't have a tree with 1 or fewer tips + warning("merge_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.") + return(NULL) + # Else, drop the removeIndex tips and returns the pruned tree. + } else { + return(drop.tip(x, removeIndex)) + } }) -############################################################################### +############################################################################### #' @importClassesFrom Biostrings XStringSet #' @aliases merge_taxa,XStringSet-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "XStringSet", function(x, eqtaxa, archetype=1L){ - # If there is nothing to merge, return x as-is - if( length(eqtaxa) < 2 ){ - return(x) - } - indList = merge_taxa.indices.internal(x, eqtaxa, archetype) - removeIndex = indList$removeIndex - # If there is too much to merge (refseq would have 0 sequences), return NULL/warning - if( length(removeIndex) >= ntaxa(x) ){ - # Can't have a refseq list with less - warning("merge_taxa attempted to reduce reference sequence list to 0 sequences.\n refseq replaced with NULL.") - return(NULL) - } else { - # Else, drop the removeIndex sequences and returns the pruned XStringSet object - x <- x[-removeIndex] - return(x) - } +setMethod("merge_taxa", "XStringSet", function(x, eqtaxa, archetype = 1L) { + # If there is nothing to merge, return x as-is + if (length(eqtaxa) < 2) { + return(x) + } + indList = merge_taxa.indices.internal(x, eqtaxa, archetype) + removeIndex = indList$removeIndex + # If there is too much to merge (refseq would have 0 sequences), return + # NULL/warning + if (length(removeIndex) >= ntaxa(x)) { + # Can't have a refseq list with less + warning("merge_taxa attempted to reduce reference sequence list to 0 sequences.\n refseq replaced with NULL.") + return(NULL) + } else { + # Else, drop the removeIndex sequences and returns the pruned XStringSet object + x <- x[-removeIndex] + return(x) + } }) -################################################################################ +################################################################################ #' @aliases merge_taxa,taxonomyTable-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "taxonomyTable", function(x, eqtaxa, archetype=1L){ - if( length(eqtaxa) < 2 ){ - return(x) - } - indList = merge_taxa.indices.internal(x, eqtaxa, archetype) - removeIndex = indList$removeIndex - keepIndex = indList$keepIndex - # # # Taxonomy is trivial in ranks after disagreement among merged taxa - # # # Make those values NA_character_ - taxmerge <- as(x, "matrix")[eqtaxa, ] - bad_ranks <- apply(taxmerge, 2, function(i){ length(unique(i)) != 1 }) - # Test if all taxonomies agree. If so, do nothing. Just continue to pruning. - if( any(bad_ranks) ){ - # The col indices of the bad ranks - bad_ranks <- min(which(bad_ranks)):length(bad_ranks) - # Replace bad taxonomy elements in the archetype only (others are pruned) - x[keepIndex, bad_ranks] <- NA_character_ - } - # Finally, remove the OTUs that have been merged into keepIndex - return( x[-removeIndex, , drop = FALSE] ) +setMethod("merge_taxa", "taxonomyTable", function(x, eqtaxa, archetype = 1L) { + if (length(eqtaxa) < 2) { + return(x) + } + indList = merge_taxa.indices.internal(x, eqtaxa, archetype) + removeIndex = indList$removeIndex + keepIndex = indList$keepIndex + # # # Taxonomy is trivial in ranks after disagreement among merged taxa # # Make + # those values NA_character_ + taxmerge <- as(x, "matrix")[eqtaxa, ] + bad_ranks <- apply(taxmerge, 2, function(i) { + length(unique(i)) != 1 + }) + # Test if all taxonomies agree. If so, do nothing. Just continue to pruning. + if (any(bad_ranks)) { + # The col indices of the bad ranks + bad_ranks <- min(which(bad_ranks)):length(bad_ranks) + # Replace bad taxonomy elements in the archetype only (others are pruned) + x[keepIndex, bad_ranks] <- NA_character_ + } + # Finally, remove the OTUs that have been merged into keepIndex + return(x[-removeIndex, , drop = FALSE]) }) -################################################################################ -################################################################################ +################################################################################ #' Merge samples based on a sample variable or factor. #' #' The purpose of this method is to merge/agglomerate the sample indices of a @@ -497,8 +513,8 @@ setMethod("merge_taxa", "taxonomyTable", function(x, eqtaxa, archetype=1L){ #' @examples # #' data(GlobalPatterns) #' GP = GlobalPatterns -#' mergedGP = merge_samples(GlobalPatterns, "SampleType") -#' SD = merge_samples(sample_data(GlobalPatterns), "SampleType") +#' mergedGP = merge_samples(GlobalPatterns, 'SampleType') +#' SD = merge_samples(sample_data(GlobalPatterns), 'SampleType') #' print(SD) #' print(mergedGP) #' sample_names(GlobalPatterns) @@ -509,92 +525,104 @@ setMethod("merge_taxa", "taxonomyTable", function(x, eqtaxa, archetype=1L){ #' OTUnames10 = names(sort(taxa_sums(GP), TRUE)[1:10]) #' GP10 = prune_taxa(OTUnames10, GP) #' mGP10 = prune_taxa(OTUnames10, mergedGP) -#' ocean_samples = sample_names(subset(sample_data(GP), SampleType=="Ocean")) +#' ocean_samples = sample_names(subset(sample_data(GP), SampleType=='Ocean')) #' print(ocean_samples) #' otu_table(GP10)[, ocean_samples] #' rowSums(otu_table(GP10)[, ocean_samples]) -#' otu_table(mGP10)["Ocean", ] -setGeneric("merge_samples", function(x, group, fun=mean) standardGeneric("merge_samples")) -################################################################################ +#' otu_table(mGP10)['Ocean', ] +setGeneric("merge_samples", function(x, group, fun = mean) standardGeneric("merge_samples")) +################################################################################ #' @aliases merge_samples,sample_data-method #' @rdname merge_samples-methods -setMethod("merge_samples", signature("sample_data"), function(x, group, fun=mean){ - x1 <- data.frame(x) - - # Check class of group and modify if "character" - if( class(group)=="character" & length(group)==1 ){ - if( !group %in% colnames(x) ){stop("group not found among sample variable names.")} - group <- x1[, group] - } - if( class(group)!="factor" ){ - # attempt to coerce to factor - group <- factor(group) - } - - # Remove any non-coercable columns. - # Philosophy is to keep as much as possible. If it is coercable at all, keep. - # Coerce all columns to numeric matrix - coercable <- sapply(x1, canCoerce, "numeric") - x2 <- sapply(x1[, coercable], as, "numeric") - rownames(x2) <- rownames(x1) - - # Perform the aggregation. - outdf <- aggregate(x2, list(group), fun) - # get rownames from the "group" column (always first) - # rownames(outdf) <- as.character(outdf[, 1]) - rownames(outdf) <- levels(group) - # "pop" the first column - outdf <- outdf[, -1, drop=FALSE] - - return( sample_data(outdf) ) +setMethod("merge_samples", signature("sample_data"), function(x, group, fun = mean) { + x1 <- data.frame(x) + + # Check class of group and modify if 'character' + if (class(group) == "character" & length(group) == 1) { + if (!group %in% colnames(x)) { + stop("group not found among sample variable names.") + } + group <- x1[, group] + } + if (class(group) != "factor") { + # attempt to coerce to factor + group <- factor(group) + } + + # Remove any non-coercable columns. Philosophy is to keep as much as possible. + # If it is coercable at all, keep. Coerce all columns to numeric matrix + coercable <- sapply(x1, canCoerce, "numeric") + x2 <- sapply(x1[, coercable], as, "numeric") + rownames(x2) <- rownames(x1) + + # Perform the aggregation. + outdf <- aggregate(x2, list(group), fun) + # get rownames from the 'group' column (always first) rownames(outdf) <- + # as.character(outdf[, 1]) + rownames(outdf) <- levels(group) + # 'pop' the first column + outdf <- outdf[, -1, drop = FALSE] + + return(sample_data(outdf)) }) -################################################################################ +################################################################################ #' @aliases merge_samples,otu_table-method #' @rdname merge_samples-methods -setMethod("merge_samples", signature("otu_table"), function(x, group){ - # needs to be in sample-by-species orientation - if( taxa_are_rows(x) ){ x <- t(x) } - # coerce to matrix, x2 - x2 <- as(x, "matrix") - - # # # #aggregate(x2, list(group), fun) - out <- rowsum(x2, group) - - # convert back to otu_table, and return - return( otu_table(out, taxa_are_rows=FALSE) ) +setMethod("merge_samples", signature("otu_table"), function(x, group) { + # needs to be in sample-by-species orientation + if (taxa_are_rows(x)) { + x <- t(x) + } + # coerce to matrix, x2 + x2 <- as(x, "matrix") + + # # # #aggregate(x2, list(group), fun) + out <- rowsum(x2, group) + + # convert back to otu_table, and return + return(otu_table(out, taxa_are_rows = FALSE)) }) -################################################################################ +################################################################################ #' @aliases merge_samples,phyloseq-method #' @rdname merge_samples-methods -setMethod("merge_samples", signature("phyloseq"), function(x, group, fun=mean){ - - # Check if phyloseq object has a sample_data - if( !is.null(sample_data(x, FALSE)) ){ - # Check class of group and modify if single "character" (column name) - if( class(group)=="character" & length(group)==1 ){ - x1 <- data.frame(sample_data(x)) - if( !group %in% colnames(x1) ){stop("group not found among sample variable names.")} - group <- x1[, group] - } - # coerce to factor - if( class(group)!="factor" ){ group <- factor(group) } - # Perform merges. - newSM <- merge_samples(sample_data(x), group, fun) - newOT <- merge_samples(otu_table(x), group) - phyloseqList <- list(newOT, newSM) - # Else, the only relevant object to "merge_samples" is the otu_table - } else { - if( class(group)!="factor" ){ group <- factor(group) } - phyloseqList <- list( newOT=merge_samples(otu_table(x), group) ) - } - - ### Add to build-call-list the remaining components, if present in x. - ### NULL is returned by accessor if object lacks requested component/slot. - ### Order of objects in list doesn't matter for phyloseq. - ### The list should not be named. - if( !is.null(access(x, "tax_table")) ){ phyloseqList <- c(phyloseqList, list(tax_table(x))) } - if( !is.null(access(x, "phy_tree")) ){ phyloseqList <- c(phyloseqList, list(phy_tree(x))) } - - return( do.call("phyloseq", phyloseqList) ) +setMethod("merge_samples", signature("phyloseq"), function(x, group, fun = mean) { + + # Check if phyloseq object has a sample_data + if (!is.null(sample_data(x, FALSE))) { + # Check class of group and modify if single 'character' (column name) + if (class(group) == "character" & length(group) == 1) { + x1 <- data.frame(sample_data(x)) + if (!group %in% colnames(x1)) { + stop("group not found among sample variable names.") + } + group <- x1[, group] + } + # coerce to factor + if (class(group) != "factor") { + group <- factor(group) + } + # Perform merges. + newSM <- merge_samples(sample_data(x), group, fun) + newOT <- merge_samples(otu_table(x), group) + phyloseqList <- list(newOT, newSM) + # Else, the only relevant object to 'merge_samples' is the otu_table + } else { + if (class(group) != "factor") { + group <- factor(group) + } + phyloseqList <- list(newOT = merge_samples(otu_table(x), group)) + } + + ### Add to build-call-list the remaining components, if present in x. NULL is + ### returned by accessor if object lacks requested component/slot. Order of + ### objects in list doesn't matter for phyloseq. The list should not be named. + if (!is.null(access(x, "tax_table"))) { + phyloseqList <- c(phyloseqList, list(tax_table(x))) + } + if (!is.null(access(x, "phy_tree"))) { + phyloseqList <- c(phyloseqList, list(phy_tree(x))) + } + + return(do.call("phyloseq", phyloseqList)) }) -################################################################################ +################################################################################ diff --git a/R/multtest-wrapper.R b/R/multtest-wrapper.R index 1c6af8f0..2589b296 100644 --- a/R/multtest-wrapper.R +++ b/R/multtest-wrapper.R @@ -1,6 +1,4 @@ -#################################################################################### -# # # # Avoiding full import of multtest to mitigate potential conflicts -#################################################################################### +#################################################################################### # # # Avoiding full import of multtest to mitigate potential conflicts #' Multiple testing of taxa abundance according to sample categories/classes #' #' Please note that it is up to you to perform any necessary @@ -20,17 +18,17 @@ #' NOTE: the default test applied to each taxa is a two-sample two-sided #' \code{\link{t.test}}, WHICH WILL FAIL with an error if you provide a data variable #' (or custom vector) that contains MORE THAN TWO classes. One alternative to consider -#' is an F-test, by specifying \code{test="f"} as an additional argument. See +#' is an F-test, by specifying \code{test='f'} as an additional argument. See #' the first example below, and/or further documentation of #' \code{\link[multtest]{mt.maxT}} or \code{\link[multtest]{mt.minP}} #' for other options and formal details. #' -#' @param minPmaxT (Optional). Character string. \code{"mt.minP"} or \code{"mt.maxT"}. -#' Default is to use \code{"\link[multtest]{mt.minP}"}. +#' @param minPmaxT (Optional). Character string. \code{'mt.minP'} or \code{'mt.maxT'}. +#' Default is to use \code{'\link[multtest]{mt.minP}'}. #' #' @param method (Optional). Additional multiple-hypthesis correction methods. #' A character vector from the set \code{\link[stats]{p.adjust.methods}}. -#' Default is \code{"fdr"}, for the Benjamini and Hochberg (1995) method +#' Default is \code{'fdr'}, for the Benjamini and Hochberg (1995) method #' to control False Discovery Rate (FDR). This argument is passed on to #' \code{\link[stats]{p.adjust}}, please see that documentation for more details. #' @@ -63,111 +61,109 @@ #' # Filter samples that don't have Enterotype #' x <- subset_samples(enterotype, !is.na(Enterotype)) #' # (the taxa are at the genera level in this dataset) -#' res = mt(x, "Enterotype", method=c("fdr", "bonferroni"), test="f", B=300) +#' res = mt(x, 'Enterotype', method=c('fdr', 'bonferroni'), test='f', B=300) #' head(res, 10) #' ## # Not surprisingly, Prevotella and Bacteroides top the list. #' ## # Different test, multiple-adjusted t-test, whether samples are ent-2 or not. -#' ## mt(x, get_variable(x, "Enterotype")==2) -setGeneric("mt", function(physeq, classlabel, minPmaxT="minP", method="fdr", ...) standardGeneric("mt") ) -################################################################################ -# First, access the otu_table, and if appropriate, define classlabel from -# the sample_data. +#' ## mt(x, get_variable(x, 'Enterotype')==2) +setGeneric("mt", function(physeq, classlabel, minPmaxT = "minP", method = "fdr", + ...) standardGeneric("mt")) +################################################################################ First, access the otu_table, and if appropriate, define classlabel from the +################################################################################ sample_data. #' @aliases mt,phyloseq,ANY-method #' @rdname mt-methods -setMethod("mt", c("phyloseq", "ANY"), function(physeq, classlabel, minPmaxT="minP", method="fdr", ...){ - # Extract the class information from the sample_data - # if sample_data slot is non-empty, - # and the classlabel is a character-class - # and its length is 1. - if( !is.null(sample_data(physeq, FALSE)) & - inherits(classlabel, "character") & - identical(length(classlabel), 1L) ){ - # Define a raw factor based on the data available in a sample variable - rawFactor = get_variable(physeq, classlabel[1]) - if( !inherits(rawFactor, "factor") ){ - # coerce to a factor if it is not already one. - rawFactor = factor(rawFactor) - } - # Either way, replace `classlabel` with `rawFactor` - classlabel = rawFactor - } - # Either way, dispatch `mt` on otu_table(physeq) - MT = mt(otu_table(physeq), classlabel, minPmaxT, ...) - if( !is.null(tax_table(physeq, FALSE)) ){ - # If there is tax_table data present, - # add/cbind it to the results. - MT = cbind(MT, as(tax_table(physeq), "matrix")[rownames(MT), , drop=FALSE]) - } - if(length(method)>0 & method %in% p.adjust.methods){ +setMethod("mt", c("phyloseq", "ANY"), function(physeq, classlabel, minPmaxT = "minP", + method = "fdr", ...) { + # Extract the class information from the sample_data if sample_data slot is + # non-empty, and the classlabel is a character-class and its length is 1. + if (!is.null(sample_data(physeq, FALSE)) & inherits(classlabel, "character") & + identical(length(classlabel), 1L)) { + # Define a raw factor based on the data available in a sample variable + rawFactor = get_variable(physeq, classlabel[1]) + if (!inherits(rawFactor, "factor")) { + # coerce to a factor if it is not already one. + rawFactor = factor(rawFactor) + } + # Either way, replace `classlabel` with `rawFactor` + classlabel = rawFactor + } + # Either way, dispatch `mt` on otu_table(physeq) + MT = mt(otu_table(physeq), classlabel, minPmaxT, ...) + if (!is.null(tax_table(physeq, FALSE))) { + # If there is tax_table data present, add/cbind it to the results. + MT = cbind(MT, as(tax_table(physeq), "matrix")[rownames(MT), , drop = FALSE]) + } + if (length(method) > 0 & method %in% p.adjust.methods) { # Use only the supported methods method <- method[which(method %in% p.adjust.methods)] # Add adjust-p columns. sapply should retain the names. - adjp = sapply(method, function(meth, p){p.adjust(p, meth)}, p = MT$rawp, USE.NAMES = TRUE) + adjp = sapply(method, function(meth, p) { + p.adjust(p, meth) + }, p = MT$rawp, USE.NAMES = TRUE) MT <- cbind(MT, adjp) } - return(MT) + return(MT) }) -################################################################################ -# All valid mt() calls eventually funnel dispatch to this method. -# The otu_table orientation is checked/handled here (and only here). +################################################################################ All valid mt() calls eventually funnel dispatch to this method. The otu_table +################################################################################ orientation is checked/handled here (and only here). #' @aliases mt,otu_table,integer-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "integer"), function(physeq, classlabel, minPmaxT="minP", ...){ - # Guarantee proper orientation of abundance table, and coerce to matrix. - if( !taxa_are_rows(physeq) ){ physeq <- t(physeq) } - mt.phyloseq.internal(as(physeq, "matrix"), classlabel, minPmaxT, ...) +setMethod("mt", c("otu_table", "integer"), function(physeq, classlabel, minPmaxT = "minP", + ...) { + # Guarantee proper orientation of abundance table, and coerce to matrix. + if (!taxa_are_rows(physeq)) { + physeq <- t(physeq) + } + mt.phyloseq.internal(as(physeq, "matrix"), classlabel, minPmaxT, ...) }) -################################################################################ -# Coerce numeric classlabel to be integer, pass-on +################################################################################ Coerce numeric classlabel to be integer, pass-on #' @aliases mt,otu_table,numeric-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "numeric"), function(physeq, classlabel, minPmaxT="minP", ...){ - mt(physeq, as(classlabel, "integer"), minPmaxT="minP", ...) +setMethod("mt", c("otu_table", "numeric"), function(physeq, classlabel, minPmaxT = "minP", + ...) { + mt(physeq, as(classlabel, "integer"), minPmaxT = "minP", ...) }) -################################################################################ -# Coerce logical to integer, pass-on +################################################################################ Coerce logical to integer, pass-on #' @aliases mt,otu_table,logical-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "logical"), function(physeq, classlabel, minPmaxT="minP", ...){ - mt(physeq, as(classlabel, "integer"), minPmaxT="minP", ...) +setMethod("mt", c("otu_table", "logical"), function(physeq, classlabel, minPmaxT = "minP", + ...) { + mt(physeq, as(classlabel, "integer"), minPmaxT = "minP", ...) }) -################################################################################ -# Test for length, then dispatch... +################################################################################ Test for length, then dispatch... #' @aliases mt,otu_table,character-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "character"), function(physeq, classlabel, minPmaxT="minP", ...){ - if( length(classlabel) != nsamples(physeq) ){ - stop("classlabel not the same length as nsamples(physeq)") - } else { - classlabel <- factor(classlabel) - } - # Use mt dispatch with classlabel now a suitable classlabel - mt(physeq, classlabel, minPmaxT, ...) +setMethod("mt", c("otu_table", "character"), function(physeq, classlabel, minPmaxT = "minP", + ...) { + if (length(classlabel) != nsamples(physeq)) { + stop("classlabel not the same length as nsamples(physeq)") + } else { + classlabel <- factor(classlabel) + } + # Use mt dispatch with classlabel now a suitable classlabel + mt(physeq, classlabel, minPmaxT, ...) }) -################################################################################ -# Coerce factor to an integer vector of group labels, -# starting at 0 for the first group +################################################################################ Coerce factor to an integer vector of group labels, starting at 0 for the first +################################################################################ group #' @aliases mt,otu_table,factor-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "factor"), function(physeq, classlabel, minPmaxT="minP", ...){ - # integerize classlabel, starting at 0 - classlabel <- (0:(length(classlabel)-1))[classlabel] - # Use mt dispatch with classlabel now a suitable classlabel - mt(physeq, classlabel, minPmaxT, ...) +setMethod("mt", c("otu_table", "factor"), function(physeq, classlabel, minPmaxT = "minP", + ...) { + # integerize classlabel, starting at 0 + classlabel <- (0:(length(classlabel) - 1))[classlabel] + # Use mt dispatch with classlabel now a suitable classlabel + mt(physeq, classlabel, minPmaxT, ...) }) -#################################################################################### -# Internal function -# @aliases mt,matrix,integer-method -# not exported +#################################################################################### Internal function @aliases mt,matrix,integer-method not exported #' @keywords internal -mt.phyloseq.internal <- function(physeq, classlabel, minPmaxT="minP", ...){ - # require(multtest) - if( minPmaxT == "minP" ){ - return( mt.minP(physeq, classlabel, ...) ) - } else if( minPmaxT == "maxT" ){ - return( mt.maxT(physeq, classlabel, ...) ) - } else { - print("Nothing calculated. minPmaxT argument must be either minP or maxT.") - } +mt.phyloseq.internal <- function(physeq, classlabel, minPmaxT = "minP", ...) { + # require(multtest) + if (minPmaxT == "minP") { + return(mt.minP(physeq, classlabel, ...)) + } else if (minPmaxT == "maxT") { + return(mt.maxT(physeq, classlabel, ...)) + } else { + print("Nothing calculated. minPmaxT argument must be either minP or maxT.") + } } -#################################################################################### +#################################################################################### diff --git a/R/network-methods.R b/R/network-methods.R index 34f129b4..0ce22df6 100644 --- a/R/network-methods.R +++ b/R/network-methods.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Make microbiome network (igraph) #' #' A specialized function for creating a network representation of microbiomes, @@ -7,7 +7,7 @@ #' The graph is ultimately represented using the #' \code{igraph}-package. #' -#' @usage make_network(physeq, type="samples", distance="jaccard", max.dist = 0.4, +#' @usage make_network(physeq, type='samples', distance='jaccard', max.dist = 0.4, #' keep.isolates=FALSE, ...) #' #' @param physeq (Required). Default \code{NULL}. @@ -15,22 +15,22 @@ #' or \code{\link{otu_table-class}} object, #' on which \code{g} is based. \code{phyloseq-class} recommended. #' -#' @param type (Optional). Default \code{"samples"}. +#' @param type (Optional). Default \code{'samples'}. #' Whether the network should be samples or taxa/OTUs. -#' Supported arguments are \code{"samples"}, \code{"taxa"}, -#' where \code{"taxa"} indicates using the OTUs/taxaindices, +#' Supported arguments are \code{'samples'}, \code{'taxa'}, +#' where \code{'taxa'} indicates using the OTUs/taxaindices, #' whether they actually represent species or some other taxonomic rank. #' -#' NOTE: not all distance methods are supported if \code{"taxa"} +#' NOTE: not all distance methods are supported if \code{'taxa'} #' selected for type. For example, the UniFrac distance and DPCoA #' cannot be calculated for taxa-wise distances, because they use #' a taxa-wise tree as part of their calculation between samples, and #' there is no transpose-equivalent for this tree. #' -#' @param distance (Optional). Default \code{"jaccard"}. +#' @param distance (Optional). Default \code{'jaccard'}. #' Any supported argument to the \code{method} parameter of the #' \code{\link{distance}} function is supported here. -#' Some distance methods, like \code{"unifrac"}, may take +#' Some distance methods, like \code{'unifrac'}, may take #' a non-trivial amount of time to calculate, in which case #' you probably want to calculate the distance matrix separately, #' save, and then provide it as the argument to \code{distance} instead. @@ -74,98 +74,103 @@ #' # # Example plots with Enterotype Dataset #' data(enterotype) #' ig <- make_network(enterotype, max.dist=0.3) -#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) +#' plot_network(ig, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) #' # #' ig1 <- make_network(enterotype, max.dist=0.2) -#' plot_network(ig1, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) +#' plot_network(ig1, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) #' # #' # # Three methods of choosing/providing distance/distance-method #' # Provide method name available to distance() function -#' ig <- make_network(enterotype, max.dist=0.3, distance="jaccard") +#' ig <- make_network(enterotype, max.dist=0.3, distance='jaccard') #' # Provide distance object, already computed -#' jaccdist <- distance(enterotype, "jaccard") +#' jaccdist <- distance(enterotype, 'jaccard') #' ih <- make_network(enterotype, max.dist=0.3, distance=jaccdist) -#' # Provide "custom" function. -#' ii <- make_network(enterotype, max.dist=0.3, distance=function(x){vegan::vegdist(x, "jaccard")}) -#' # The have equal results: +#' # Provide 'custom' function. +#' ii <- make_network(enterotype, max.dist=0.3, distance=function(x){vegan::vegdist(x, 'jaccard')}) +#' # The have equal results:\t\t #' all.equal(ig, ih) #' all.equal(ig, ii) #' # -#' # Try out making a trivial "network" of the 3-sample esophagus data, +#' # Try out making a trivial 'network' of the 3-sample esophagus data, #' # with weighted-UniFrac as distance #' data(esophagus) -#' ij <- make_network(esophagus, "samples", "unifrac", weighted=TRUE) -make_network <- function(physeq, type="samples", distance="jaccard", max.dist = 0.4, - keep.isolates=FALSE, ...){ - - if( type %in% c("taxa", "species", "OTUs", "otus", "otu")){ +#' ij <- make_network(esophagus, 'samples', 'unifrac', weighted=TRUE) +make_network <- function(physeq, type = "samples", distance = "jaccard", max.dist = 0.4, + keep.isolates = FALSE, ...) { + + if (type %in% c("taxa", "species", "OTUs", "otus", "otu")) { # Calculate or asign taxa-wise distance matrix - if( class(distance) == "dist" ){ - # If distance a distance object, use it rather than re-calculate - obj.dist <- distance - if( attributes(obj.dist)$Size != ntaxa(physeq) ){ - stop("ntaxa(physeq) does not match size of dist object in distance") - } - if( !setequal(attributes(obj.dist)$Labels, taxa_names(physeq)) ){ - stop("taxa_names does not exactly match dist-indices") - } - } else if( class(distance) == "character" ){ - # If character string, pass on to distance(), assume supported - obj.dist <- distance(physeq, method=distance, type=type, ...) - # Else, assume a custom function and attempt to calculate. - } else { - # Enforce orientation for taxa-wise distances - if( !taxa_are_rows(physeq) ){ physeq <- t(physeq) } - # Calculate distances - obj.dist <- distance(as(otu_table(physeq), "matrix")) - } - # coerce distance-matrix back into vanilla matrix, Taxa Distance Matrix, TaDiMa - TaDiMa <- as.matrix(obj.dist) - # Add Inf to the diagonal to avoid self-connecting edges (inefficient) - TaDiMa <- TaDiMa + diag(Inf, ntaxa(physeq), ntaxa(physeq)) - # Convert distance matrix to coincidence matrix, CoMa, using max.dist - CoMa <- TaDiMa < max.dist - } else if( type == "samples" ){ - # Calculate or asign sample-wise distance matrix - if( class(distance) == "dist" ){ # If argument is already a distance matrix. - # If distance a distance object, use it rather than re-calculate - obj.dist <- distance - if( attributes(obj.dist)$Size != nsamples(physeq) ){ - stop("nsamples(physeq) does not match size of dist object in distance") - } - if( !setequal(attributes(obj.dist)$Labels, sample_names(physeq)) ){ - stop("sample_names does not exactly match dist-indices") - } - # If character string, pass on to distance(), assume supported - } else if( class(distance) == "character" ){ - # Else, assume a custom function and attempt to calculate. - obj.dist <- distance(physeq, method=distance, type=type, ...) - } else { - # Enforce orientation for sample-wise distances - if(taxa_are_rows(physeq)){ physeq <- t(physeq) } - # Calculate distances - obj.dist <- distance(as(otu_table(physeq), "matrix")) - } - # coerce distance-matrix back into vanilla matrix, Sample Distance Matrix, SaDiMa - SaDiMa <- as.matrix(obj.dist) - # Add Inf to the diagonal to avoid self-connecting edges (inefficient) - SaDiMa <- SaDiMa + diag(Inf, nsamples(physeq), nsamples(physeq)) - # Convert distance matrix to coincidence matrix, CoMa, using max.dist - CoMa <- SaDiMa < max.dist - } else { - stop("type argument must be one of \n (1) samples \n or \n (2) taxa") - } - # Calculate the igraph-formatted network - ig <- graph.adjacency(CoMa, mode="lower") - if( !keep.isolates ){ - # If not-keeping isolates, remove them - isolates <- V(ig)[degree(ig) == 0] - ig = delete.vertices(ig, V(ig)[degree(ig) == 0]) - } - if( vcount(ig) < 2 ){ - # Report a warning if the graph is empty - warning("The graph you created has too few vertices. Consider changing `max.dist` argument, and check your data.") - } - return(ig) + if (class(distance) == "dist") { + # If distance a distance object, use it rather than re-calculate + obj.dist <- distance + if (attributes(obj.dist)$Size != ntaxa(physeq)) { + stop("ntaxa(physeq) does not match size of dist object in distance") + } + if (!setequal(attributes(obj.dist)$Labels, taxa_names(physeq))) { + stop("taxa_names does not exactly match dist-indices") + } + } else if (class(distance) == "character") { + # If character string, pass on to distance(), assume supported + obj.dist <- distance(physeq, method = distance, type = type, ...) + # Else, assume a custom function and attempt to calculate. + } else { + # Enforce orientation for taxa-wise distances + if (!taxa_are_rows(physeq)) { + physeq <- t(physeq) + } + # Calculate distances + obj.dist <- distance(as(otu_table(physeq), "matrix")) + } + # coerce distance-matrix back into vanilla matrix, Taxa Distance Matrix, TaDiMa + TaDiMa <- as.matrix(obj.dist) + # Add Inf to the diagonal to avoid self-connecting edges (inefficient) + TaDiMa <- TaDiMa + diag(Inf, ntaxa(physeq), ntaxa(physeq)) + # Convert distance matrix to coincidence matrix, CoMa, using max.dist + CoMa <- TaDiMa < max.dist + } else if (type == "samples") { + # Calculate or asign sample-wise distance matrix If argument is already a + # distance matrix. If distance a distance object, use it rather than + # re-calculate + if (class(distance) == "dist") { + obj.dist <- distance + if (attributes(obj.dist)$Size != nsamples(physeq)) { + stop("nsamples(physeq) does not match size of dist object in distance") + } + if (!setequal(attributes(obj.dist)$Labels, sample_names(physeq))) { + stop("sample_names does not exactly match dist-indices") + } + # If character string, pass on to distance(), assume supported + } else if (class(distance) == "character") { + # Else, assume a custom function and attempt to calculate. + obj.dist <- distance(physeq, method = distance, type = type, ...) + } else { + # Enforce orientation for sample-wise distances + if (taxa_are_rows(physeq)) { + physeq <- t(physeq) + } + # Calculate distances + obj.dist <- distance(as(otu_table(physeq), "matrix")) + } + # coerce distance-matrix back into vanilla matrix, Sample Distance Matrix, SaDiMa + SaDiMa <- as.matrix(obj.dist) + # Add Inf to the diagonal to avoid self-connecting edges (inefficient) + SaDiMa <- SaDiMa + diag(Inf, nsamples(physeq), nsamples(physeq)) + # Convert distance matrix to coincidence matrix, CoMa, using max.dist + CoMa <- SaDiMa < max.dist + } else { + stop("type argument must be one of \n (1) samples \n or \n (2) taxa") + } + # Calculate the igraph-formatted network + ig <- graph.adjacency(CoMa, mode = "lower") + if (!keep.isolates) { + # If not-keeping isolates, remove them + isolates <- V(ig)[degree(ig) == 0] + ig = delete.vertices(ig, V(ig)[degree(ig) == 0]) + } + if (vcount(ig) < 2) { + # Report a warning if the graph is empty + warning("The graph you created has too few vertices. Consider changing `max.dist` argument, and check your data.") + } + return(ig) } -################################################################################ +################################################################################ diff --git a/R/ordination-methods.R b/R/ordination-methods.R index 17dd03dd..92dcb31c 100644 --- a/R/ordination-methods.R +++ b/R/ordination-methods.R @@ -1,9 +1,9 @@ -################################################################################ +################################################################################ #' Perform an ordination on phyloseq data #' #' This function wraps several commonly-used ordination methods. The type of #' ordination depends upon the argument to \code{method}. Try -#' \code{ordinate("help")} or \code{ordinate("list")} for the currently +#' \code{ordinate('help')} or \code{ordinate('list')} for the currently #' supported method options. #' #' @param physeq (Required). Phylogenetic sequencing data @@ -19,10 +19,10 @@ #' also results in these different data components being checked for validity #' and completeness by the method. #' -#' @param method (Optional). A character string. Default is \code{"DCA"}. +#' @param method (Optional). A character string. Default is \code{'DCA'}. #' #' Currently supported method options are: -#' \code{c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA")} +#' \code{c('DCA', 'CCA', 'RDA', 'CAP', 'DPCoA', 'NMDS', 'MDS', 'PCoA')} #' #' \describe{ #' \item{DCA}{Performs detrended correspondence analysis using\code{\link{decorana}}} @@ -54,7 +54,7 @@ #' argument to \code{metaMDS} if it is among the #' supported \code{vegdist} methods. However, all distance methods #' supported by \code{\link{distance}} are supported here, -#' including \code{"unifrac"} (the default) and \code{"DPCoA"}.} +#' including \code{'unifrac'} (the default) and \code{'DPCoA'}.} #' \item{MDS/PCoA}{Performs principal coordinate analysis #' (also called principle coordinate decomposition, #' multidimensional scaling (MDS), or classical scaling) @@ -62,10 +62,10 @@ #' including two correction methods for negative eigenvalues. #' See #' \code{\link[ape]{pcoa}} for further details. -#' } -#' } +#' }\t +#'\t} #' -#' @param distance (Optional). A character string. Default is \code{"bray"}. +#' @param distance (Optional). A character string. Default is \code{'bray'}. #' The name of a supported \code{\link{distance}} method; #' or, alternatively, #' a pre-computed \code{\link{dist}}-class object. @@ -75,9 +75,9 @@ #' #' Any supported \code{\link{distance}} methods #' are supported arguments to \code{distance} here. -#' Try \code{distance("list")} for a explicitly supported distance method +#' Try \code{distance('list')} for a explicitly supported distance method #' abbreviations. User-specified custom distance equations should also work, -#' e.g. \code{"(A+B-2*J)/(A+B)"}. +#' e.g. \code{'(A+B-2*J)/(A+B)'}. #' See \code{\link{distance}} for more details, examples. #' #' @param formula (Optional). A model \code{\link{formula}}. @@ -92,9 +92,9 @@ #' #' @param ... (Optional). Additional arguments to supporting functions. For #' example, the additional argument \code{weighted=TRUE} would be passed on -#' to \code{\link{UniFrac}} if \code{"unifrac"} were chosen as the -#' \code{distance} option and \code{"MDS"} as the ordination \code{method} -#' option. Alternatively, if \code{"DCA"} were chosen as the +#' to \code{\link{UniFrac}} if \code{'unifrac'} were chosen as the +#' \code{distance} option and \code{'MDS'} as the ordination \code{method} +#' option. Alternatively, if \code{'DCA'} were chosen as the #' ordination \code{method} option, additional arguments would be passed on #' to the relevant ordination function, \code{\link{decorana}}, for example. #' @@ -153,94 +153,94 @@ #' @examples #' # See http://joey711.github.io/phyloseq/plot_ordination-examples #' # for many more examples. -#' # plot_ordination(GP, ordinate(GP, "DCA"), "samples", color="SampleType") -ordinate = function(physeq, method="DCA", distance="bray", formula=NULL, ...){ - # If `physeq` is a formula, post deprecated notice, attempt to convert and dispatch - if( inherits(physeq, "formula") ){ - .Deprecated(msg=paste0("First argument, `physeq`, as formula is deprecated.\n", - "There is now an explicit `formula` argument.\n", - "Please revise method call accordingly.")) +#' # plot_ordination(GP, ordinate(GP, 'DCA'), 'samples', color='SampleType') +ordinate = function(physeq, method = "DCA", distance = "bray", formula = NULL, ...) { + # If `physeq` is a formula, post deprecated notice, attempt to convert and + # dispatch + if (inherits(physeq, "formula")) { + .Deprecated(msg = paste0("First argument, `physeq`, as formula is deprecated.\n", + "There is now an explicit `formula` argument.\n", "Please revise method call accordingly.")) # Create the new formula, RHS-only formchar = as.character(physeq) # Error if only RHS. Formula-first syntax required both sides. - if(length(formchar) < 3){ + if (length(formchar) < 3) { stop("Need both sides of formula in this deprecated syntax... Revisit ordinate() documentation / examples.") } # Replace with (presumed) phyloseq object. physeq <- get(as.character(physeq)[2]) - # Create the new formula, RHS-only. - newFormula = as.formula(paste0("~", formchar[length(formchar)])) + # Create the new formula, RHS-only. + newFormula = as.formula(paste0("~", formchar[length(formchar)])) # Dispatch to (hopefully) ordinate,phyloseq - return(ordinate(physeq, method=method, distance=distance, formula=newFormula, ...)) + return(ordinate(physeq, method = method, distance = distance, formula = newFormula, + ...)) + } + # Define table of currently-supported methods + method_table <- c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA") + # List supported method names to user, if requested. + if (inherits(physeq, "character")) { + if (physeq == "help") { + cat("Available arguments to methods:\n") + print(c(method_table)) + cat("Please be exact, partial-matching not supported.\n") + cat("Can alternatively provide a custom distance.\n") + cat("See:\n help(\"distance\") \n") + return() + } else if (physeq == "list") { + return(c(method_table)) + } else { + cat("physeq needs to be a phyloseq-class object, \n") + cat("or a character string matching \"help\" or \"list\". \n") + } } - # Define table of currently-supported methods - method_table <- c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA") - # List supported method names to user, if requested. - if( inherits(physeq, "character") ){ - if( physeq=="help" ){ - cat("Available arguments to methods:\n") - print(c(method_table)) - cat("Please be exact, partial-matching not supported.\n") - cat("Can alternatively provide a custom distance.\n") - cat("See:\n help(\"distance\") \n") - return() - } else if( physeq=="list" ){ - return(c(method_table)) - } else { - cat("physeq needs to be a phyloseq-class object, \n") - cat("or a character string matching \"help\" or \"list\". \n") - } - } # Final check that `physeq` is a phyloseq or otu_table class - if( !inherits(physeq, "phyloseq") & !inherits(physeq, "otu_table") ){ + if (!inherits(physeq, "phyloseq") & !inherits(physeq, "otu_table")) { stop("Expected a phyloseq object or otu_table object.") } - # # Start with methods that don't require - # # additional distance calculation. (distance argument ignored) - # DCA - if( method == "DCA" ){ - return( decorana(veganifyOTU(physeq), ...) ) - } - # CCA / RDA - if( method %in% c("CCA", "RDA") ){ - return(cca.phyloseq(physeq, formula, method, ...)) - } - # CAP - if( method == "CAP" ){ + # # Start with methods that don't require # additional distance calculation. + # (distance argument ignored) DCA + if (method == "DCA") { + return(decorana(veganifyOTU(physeq), ...)) + } + # CCA / RDA + if (method %in% c("CCA", "RDA")) { + return(cca.phyloseq(physeq, formula, method, ...)) + } + # CAP + if (method == "CAP") { # Call/return with do.call - return(capscale.phyloseq(physeq, formula, distance, ...)) - } - # DPCoA - if( method == "DPCoA" ){ - return( DPCoA(physeq, ...) ) - } - # # Now resort to methods that do require a separate distance/dist-calc - # Define ps.dist. Check the class of distance argument is character or dist - if( inherits(distance, "dist") ){ - ps.dist <- distance - } else if( class(distance) == "character" ){ - # There are some special options for NMDS/metaMDS if distance-method - # is supported by vegdist, so check first. If not, just calculate distance - vegdist_methods <- c("manhattan", "euclidean", "canberra", "bray", - "kulczynski", "jaccard", "gower", "altGower", "morisita", "horn", - "mountford", "raup" , "binomial", "chao") - # NMDS with vegdist-method to include species - if(method == "NMDS" & distance %in% vegdist_methods){ - return(metaMDS(veganifyOTU(physeq), distance, ...)) - } - # Calculate distance with handoff to distance() - ps.dist <- distance(physeq, distance, ...) - } - # Vanilla MDS/PCoA - if( method %in% c("PCoA", "MDS")){ - return(pcoa(ps.dist)) - } - # NMDS with non-vegdist-method - if(method == "NMDS"){ - return(metaMDS(ps.dist)) - } + return(capscale.phyloseq(physeq, formula, distance, ...)) + } + # DPCoA + if (method == "DPCoA") { + return(DPCoA(physeq, ...)) + } + # # Now resort to methods that do require a separate distance/dist-calc Define + # ps.dist. Check the class of distance argument is character or dist + if (inherits(distance, "dist")) { + ps.dist <- distance + } else if (class(distance) == "character") { + # There are some special options for NMDS/metaMDS if distance-method is supported + # by vegdist, so check first. If not, just calculate distance + vegdist_methods <- c("manhattan", "euclidean", "canberra", "bray", "kulczynski", + "jaccard", "gower", "altGower", "morisita", "horn", "mountford", "raup", + "binomial", "chao") + # NMDS with vegdist-method to include species + if (method == "NMDS" & distance %in% vegdist_methods) { + return(metaMDS(veganifyOTU(physeq), distance, ...)) + } + # Calculate distance with handoff to distance() + ps.dist <- distance(physeq, distance, ...) + } + # Vanilla MDS/PCoA + if (method %in% c("PCoA", "MDS")) { + return(pcoa(ps.dist)) + } + # NMDS with non-vegdist-method + if (method == "NMDS") { + return(metaMDS(ps.dist)) + } } -################################################################################ +################################################################################ #' Calculate Double Principle Coordinate Analysis (DPCoA) #' using phylogenetic distance #' @@ -310,9 +310,9 @@ ordinate = function(physeq, method="DCA", distance="bray", formula=NULL, ...){ #' data(esophagus) #' eso.dpcoa <- DPCoA(esophagus) #' eso.dpcoa -#' plot_ordination(esophagus, eso.dpcoa, "samples") -#' plot_ordination(esophagus, eso.dpcoa, "species") -#' plot_ordination(esophagus, eso.dpcoa, "biplot") +#' plot_ordination(esophagus, eso.dpcoa, 'samples') +#' plot_ordination(esophagus, eso.dpcoa, 'species') +#' plot_ordination(esophagus, eso.dpcoa, 'biplot') #' # #' # #' # # # # # # GlobalPatterns @@ -322,49 +322,47 @@ ordinate = function(physeq, method="DCA", distance="bray", formula=NULL, ...){ #' GP <- prune_taxa(keepTaxa, GlobalPatterns) #' # Perform DPCoA #' GP.dpcoa <- DPCoA(GP) -#' plot_ordination(GP, GP.dpcoa, color="SampleType") -DPCoA <- function(physeq, correction=cailliez, scannf=FALSE, ...){ - # Check that physeq is a phyloseq-class - if(!class(physeq)=="phyloseq"){stop("physeq must be phyloseq-class")} - - # Remove any OTUs that are absent from all the samples. - physeq <- prune_taxa((taxa_sums(physeq) > 0), physeq) - - # Access components for handing-off - OTU <- otu_table(physeq) - tree <- phy_tree(physeq) - - # Enforce that OTU is in samples-by-species orientation - if(taxa_are_rows(OTU) ){ OTU <- t(OTU) } +#' plot_ordination(GP, GP.dpcoa, color='SampleType') +DPCoA <- function(physeq, correction = cailliez, scannf = FALSE, ...) { + # Check that physeq is a phyloseq-class + if (!class(physeq) == "phyloseq") { + stop("physeq must be phyloseq-class") + } + + # Remove any OTUs that are absent from all the samples. + physeq <- prune_taxa((taxa_sums(physeq) > 0), physeq) + + # Access components for handing-off + OTU <- otu_table(physeq) + tree <- phy_tree(physeq) - # get the patristic distances between the species from the tree - patristicDist <- as.dist(cophenetic.phylo(tree)) - - # if the patristic distances are not Euclidean, - # then correct them or throw meaningful error. - if( !is.euclid(patristicDist) ){ - patristicDist <- correction(patristicDist) - - # Check that this is now Euclidean. - if( !is.euclid(patristicDist) ){ - stop('Corrected distance still not Euclidean \n', - "please provide a different correction method") - } - } - - # NOTE: the dpcoa function in ade4 requires a data.frame - return( dpcoa(data.frame(OTU), patristicDist, scannf, ...) ) + # Enforce that OTU is in samples-by-species orientation + if (taxa_are_rows(OTU)) { + OTU <- t(OTU) + } + + # get the patristic distances between the species from the tree + patristicDist <- as.dist(cophenetic.phylo(tree)) + + # if the patristic distances are not Euclidean, then correct them or throw + # meaningful error. + if (!is.euclid(patristicDist)) { + patristicDist <- correction(patristicDist) + + # Check that this is now Euclidean. + if (!is.euclid(patristicDist)) { + stop("Corrected distance still not Euclidean \n", "please provide a different correction method") + } + } + + # NOTE: the dpcoa function in ade4 requires a data.frame + return(dpcoa(data.frame(OTU), patristicDist, scannf, ...)) } -################################################################################ -################################################################################ -# vegan::cca "extension". -# formula is main input to this function. This complicates signature handling. -# A new method with a separate name is defined instead. -# -# Must transpose the phyloseq otu_table to fit the vegan::cca convention -# Whether-or-not to transpose needs to be a check, based on the -# "taxa_are_rows" slot value -################################################################################ +################################################################################ vegan::cca 'extension'. formula is main input to this function. This +################################################################################ complicates signature handling. A new method with a separate name is defined +################################################################################ instead. Must transpose the phyloseq otu_table to fit the vegan::cca +################################################################################ convention Whether-or-not to transpose needs to be a check, based on the +################################################################################ 'taxa_are_rows' slot value #' Constrained Correspondence Analysis and Redundancy Analysis. #' #' This is the internal function that simplifies getting phyloseq data @@ -383,7 +381,7 @@ DPCoA <- function(physeq, correction=cailliez, scannf=FALSE, ...){ #' from within \code{physeq}. #' #' @param method (Optional). A single \code{\link{character}} string, -#' specifying \code{"RDA"} or \code{"CCA"}. Default is \code{"CCA"}. +#' specifying \code{'RDA'} or \code{'CCA'}. Default is \code{'CCA'}. #' #' @param ... (Optional). Additional named arguments passed to #' \code{\link[vegan]{capscale}}. @@ -401,29 +399,29 @@ DPCoA <- function(physeq, correction=cailliez, scannf=FALSE, ...){ #' @keywords internal #' @examples # #' # cca.phyloseq(physeq, formula, method, ...) -setGeneric("cca.phyloseq", function(physeq, formula=NULL, method="CCA", ...){ +setGeneric("cca.phyloseq", function(physeq, formula = NULL, method = "CCA", ...) { standardGeneric("cca.phyloseq") }) #' @importFrom vegan cca #' @importFrom vegan rda #' @aliases cca.phyloseq,phyloseq,formula-method #' @rdname cca-rda-phyloseq-methods -setMethod("cca.phyloseq", signature=c("phyloseq", "formula"), -function(physeq, formula, method="CCA", ...){ - data = data.frame(sample_data(physeq, FALSE), stringsAsFactors=FALSE) - if( length(data) < 1 ){ +setMethod("cca.phyloseq", signature = c("phyloseq", "formula"), function(physeq, + formula, method = "CCA", ...) { + data = data.frame(sample_data(physeq, FALSE), stringsAsFactors = FALSE) + if (length(data) < 1) { stop("`physeq` argument must include non-empty `sample_data`") } - OTU = veganifyOTU(physeq) - # Create new formula. Left-hand side is ignored. - formchar = as.character(formula) - newFormula = as.formula(paste0("OTU ~ ", formchar[length(formchar)])) - # Note that ade4 also has a conflicting "cca" function. - # You don't import ade4::cca to avoid the conflict. - if(method=="CCA"){ - return(cca(newFormula, data=data)) - } else if(method=="RDA"){ - return(rda(newFormula, data=data)) + OTU = veganifyOTU(physeq) + # Create new formula. Left-hand side is ignored. + formchar = as.character(formula) + newFormula = as.formula(paste0("OTU ~ ", formchar[length(formchar)])) + # Note that ade4 also has a conflicting 'cca' function. You don't import + # ade4::cca to avoid the conflict. + if (method == "CCA") { + return(cca(newFormula, data = data)) + } else if (method == "RDA") { + return(rda(newFormula, data = data)) } else { warning("Unsupported `method` argument. Must be 'RDA' or 'CCA'") return(NULL) @@ -432,29 +430,29 @@ function(physeq, formula, method="CCA", ...){ #' @importFrom vegan cca #' @aliases cca.phyloseq,otu_table-method #' @rdname cca-rda-phyloseq-methods -setMethod("cca.phyloseq", signature="otu_table", - function(physeq, formula=NULL, method="CCA", ...){ +setMethod("cca.phyloseq", signature = "otu_table", function(physeq, formula = NULL, + method = "CCA", ...) { # OTU table by itself indicates an unconstrained ordination is requested. # Formula argument is ignored. - if(method=="CCA"){ - return(cca(veganifyOTU(physeq))) - } else if(method=="RDA"){ - return(rda(veganifyOTU(physeq))) - } else { - warning("Unsupported `method` argument. Must be 'RDA' or 'CCA'") + if (method == "CCA") { + return(cca(veganifyOTU(physeq))) + } else if (method == "RDA") { + return(rda(veganifyOTU(physeq))) + } else { + warning("Unsupported `method` argument. Must be 'RDA' or 'CCA'") return(NULL) - } + } }) #' @importFrom vegan cca #' @aliases cca.phyloseq,phyloseq,NULL-method #' @rdname cca-rda-phyloseq-methods -setMethod("cca.phyloseq", signature=c("phyloseq", "NULL"), -function(physeq, formula, method="CCA", ...){ - # Absence of a formula (NULL) indicates unconstrained ordination. - # Access otu_table, and dispatch. +setMethod("cca.phyloseq", signature = c("phyloseq", "NULL"), function(physeq, formula, + method = "CCA", ...) { + # Absence of a formula (NULL) indicates unconstrained ordination. Access + # otu_table, and dispatch. return(cca.phyloseq(otu_table(physeq), NULL, method, ...)) }) -################################################################################ +################################################################################ #' Estimate the gap statistic on an ordination result #' #' This is a wrapper for the \code{\link[cluster]{clusGap}} function, @@ -465,10 +463,10 @@ function(physeq, formula, method="CCA", ...){ #' should work, ultimately by passing to the \code{\link[vegan]{scores}} function #' or its internal extensions in phyloseq. #' @param axes (Optional). The ordination axes that you want to include. -#' @param type (Optional). One of \code{"sites"} +#' @param type (Optional). One of \code{'sites'} #' (the vegan package label for samples) or -#' \code{"species"} (the vegan package label for OTUs/taxa). -#' Default is \code{"sites"}. +#' \code{'species'} (the vegan package label for OTUs/taxa). +#' Default is \code{'sites'}. #' @param FUNcluster (Optional). This is passed to \code{\link[cluster]{clusGap}}. #' The documentation is copied here for convenience: #' a function which accepts as first argument a (data) matrix like \code{x}, @@ -484,7 +482,7 @@ function(physeq, formula, method="CCA", ...){ #' Any function that has these input/output properties (performing a clustering) #' will suffice. The more appropriate the clustering method, the better chance #' your gap statistic results will be useful. -#' @param K.max (Optional). A single positive integer value. +#' @param K.max\t(Optional). A single positive integer value. #' It indicates the maximum number of clusters that will be considered. #' Value must be at least two. #' This is passed to \code{\link[cluster]{clusGap}}. @@ -496,7 +494,7 @@ function(physeq, formula, method="CCA", ...){ #' See the \code{\link[cluster]{clusGap}} documentation for more details. #' #' @return -#' An object of S3 class \code{"clusGap"}, basically a list with components. +#' An object of S3 class \code{'clusGap'}, basically a list with components. #' See the \code{\link[cluster]{clusGap}} documentation for more details. #' #' @importFrom vegan scores @@ -504,38 +502,38 @@ function(physeq, formula, method="CCA", ...){ #' @importFrom cluster pam #' @export #' @examples -#' data("soilrep") -#' sord = ordinate(soilrep, "PCoA", "bray") +#' data('soilrep') +#' sord = ordinate(soilrep, 'PCoA', 'bray') #' # Evaluate axes with scree plot #' plot_scree(sord) #' # Gap Statistic #' gs = gapstat_ord(sord, axes=1:3, verbose=FALSE) -#' # plot_ordination(soilrep, sord, color="Treatment") +#' # plot_ordination(soilrep, sord, color='Treatment') #' plot_clusgap(gs) -#' print(gs, method="Tibs2001SEmax") -gapstat_ord = function(ord, axes=c(1:2), type="sites", - FUNcluster=function(x, k){list(cluster = pam(x, k, cluster.only=TRUE))}, - K.max=8, ...){ - # - # Use the scores function to get the ordination coordinates - x = scores(ord, display=type) - # If axes not explicitly defined (NULL), then use all of them - if(is.null(axes)){ - axes = 1:ncol(x) - } - # Finally, perform, and return, the gap statistic calculation using - # cluster::clusGap - return(clusGap(x[, axes], FUNcluster, K.max, ...)) +#' print(gs, method='Tibs2001SEmax') +gapstat_ord = function(ord, axes = c(1:2), type = "sites", FUNcluster = function(x, + k) { + list(cluster = pam(x, k, cluster.only = TRUE)) +}, K.max = 8, ...) { + # Use the scores function to get the ordination coordinates + x = scores(ord, display = type) + # If axes not explicitly defined (NULL), then use all of them + if (is.null(axes)) { + axes = 1:ncol(x) + } + # Finally, perform, and return, the gap statistic calculation using + # cluster::clusGap + return(clusGap(x[, axes], FUNcluster, K.max, ...)) } -################################################################################ -# Define an internal function for accessing and orienting the OTU table -# in a fashion suitable for vegan functions -# @keywords internal -veganifyOTU <- function(physeq){ - if(taxa_are_rows(physeq)){physeq <- t(physeq)} +################################################################################ Define an internal function for accessing and orienting the OTU table in a +################################################################################ fashion suitable for vegan functions @keywords internal +veganifyOTU <- function(physeq) { + if (taxa_are_rows(physeq)) { + physeq <- t(physeq) + } return(as(otu_table(physeq), "matrix")) } -################################################################################ +################################################################################ #' Constrained Analysis of Principal Coordinates, \code{\link[vegan]{capscale}}. #' #' See \code{\link[vegan]{capscale}} for details. A formula is main input. @@ -562,7 +560,7 @@ veganifyOTU <- function(physeq){ #' (except for possible sign reversal). However, it makes no sense to use #' \code{\link[vegan]{capscale}} with Euclidean distances, #' since direct use of \code{\link[vegan]{rda}} is much more efficient -#' (and supported in the \code{\link{ordinate}} function with \code{method=="RDA"}) +#' (and supported in the \code{\link{ordinate}} function with \code{method=='RDA'}) #' Even with non-Euclidean dissimilarities, #' the rest of the analysis will be metric and linear. #' @@ -588,11 +586,11 @@ veganifyOTU <- function(physeq){ #' # http://joey711.github.io/phyloseq/plot_ordination-examples #' data(GlobalPatterns) #' GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns) -#' ordcap = ordinate(GP, "CAP", "bray", ~SampleType) -#' plot_ordination(GP, ordcap, "samples", color="SampleType") -setGeneric("capscale.phyloseq", function(physeq, formula, distance, ...){ - data = data.frame(sample_data(physeq, FALSE), stringsAsFactors=FALSE) - if( length(data) < 1 ){ +#' ordcap = ordinate(GP, 'CAP', 'bray', ~SampleType) +#' plot_ordination(GP, ordcap, 'samples', color='SampleType') +setGeneric("capscale.phyloseq", function(physeq, formula, distance, ...) { + data = data.frame(sample_data(physeq, FALSE), stringsAsFactors = FALSE) + if (length(data) < 1) { stop("`physeq` argument must include non-empty `sample_data`") } standardGeneric("capscale.phyloseq") @@ -600,45 +598,43 @@ setGeneric("capscale.phyloseq", function(physeq, formula, distance, ...){ #' @importFrom vegan capscale #' @aliases capscale.phyloseq,phyloseq,formula,dist-method #' @rdname capscale-phyloseq-methods -setMethod("capscale.phyloseq", c("phyloseq", "formula", "dist"), -function(physeq, formula, distance, ...){ - data = data.frame(sample_data(physeq), stringsAsFactors=FALSE) +setMethod("capscale.phyloseq", c("phyloseq", "formula", "dist"), function(physeq, + formula, distance, ...) { + data = data.frame(sample_data(physeq), stringsAsFactors = FALSE) # Convert formula to character vector, compute on language. - formchar = as.character(formula) + formchar = as.character(formula) newFormula = as.formula(paste0("distance ~ ", formchar[length(formchar)])) - return(capscale(formula=newFormula, data=data, ...)) + return(capscale(formula = newFormula, data = data, ...)) }) #' @importFrom vegan capscale #' @aliases capscale.phyloseq,phyloseq,formula,character-method #' @rdname capscale-phyloseq-methods -setMethod("capscale.phyloseq", c("phyloseq", "formula", "character"), -function(physeq, formula, distance, ...){ - data = data.frame(sample_data(physeq), stringsAsFactors=FALSE) - # The goal here is to process the distance identifier string - # and dispatch accordingly. - if( length(distance) != 1 ){ - warning("`distance` was unexpected length. \n", - " `distance` argument should be a single character string", - " or dist matrix. \n", - "Attempting to use first element only.") +setMethod("capscale.phyloseq", c("phyloseq", "formula", "character"), function(physeq, + formula, distance, ...) { + data = data.frame(sample_data(physeq), stringsAsFactors = FALSE) + # The goal here is to process the distance identifier string and dispatch + # accordingly. + if (length(distance) != 1) { + warning("`distance` was unexpected length. \n", " `distance` argument should be a single character string", + " or dist matrix. \n", "Attempting to use first element only.") } distance <- distance[1] - if(!distance %in% unlist(distance("list"))){ - # distance must be among the supported distance options - # (which is a superset of vegdist). + if (!distance %in% unlist(distance("list"))) { + # distance must be among the supported distance options (which is a superset of + # vegdist). stop("The distance method you specified is not supported by phyloseq") } # Convert formula to character vector, compute on language. formchar = as.character(formula) - if(distance %in% distance("list")$vegdist){ + if (distance %in% distance("list")$vegdist) { # If it is among the vegdist distances, pass it along to vegan::capscale OTU = veganifyOTU(physeq) newFormula = as.formula(paste0("OTU ~ ", formchar[length(formchar)])) - return(capscale(formula=newFormula, data=data, distance=distance, ...)) + return(capscale(formula = newFormula, data = data, distance = distance, ...)) } else { # Else calculate the distance matrix here, and dispatch. - distance <- distance(physeq=physeq, method=distance, type="samples") + distance <- distance(physeq = physeq, method = distance, type = "samples") return(capscale.phyloseq(physeq, formula, distance, ...)) } }) -################################################################################ \ No newline at end of file +################################################################################ diff --git a/R/otuTable-class.R b/R/otuTable-class.R index f202de95..dceda0ba 100644 --- a/R/otuTable-class.R +++ b/R/otuTable-class.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Build or access the otu_table. #' #' This is the suggested method for both constructing and accessing @@ -34,56 +34,58 @@ #' @examples # #' # data(GlobalPatterns) #' # otu_table(GlobalPatterns) -setGeneric("otu_table", function(object, taxa_are_rows, errorIfNULL=TRUE){ - standardGeneric("otu_table") +setGeneric("otu_table", function(object, taxa_are_rows, errorIfNULL = TRUE) { + standardGeneric("otu_table") }) # Access the otu_table slot. #' @aliases otu_table,phyloseq-method #' @rdname otu_table-methods -setMethod("otu_table", "phyloseq", function(object, errorIfNULL=TRUE){ - access(object, "otu_table", errorIfNULL) +setMethod("otu_table", "phyloseq", function(object, errorIfNULL = TRUE) { + access(object, "otu_table", errorIfNULL) }) # return the otu_table as-is. #' @aliases otu_table,otu_table-method #' @rdname otu_table-methods -setMethod("otu_table", "otu_table", function(object, errorIfNULL=TRUE){ return(object) }) +setMethod("otu_table", "otu_table", function(object, errorIfNULL = TRUE) { + return(object) +}) # Instantiate an otu_table from a raw abundance matrix. #' @aliases otu_table,matrix-method #' @rdname otu_table-methods -setMethod("otu_table", "matrix", function(object, taxa_are_rows){ - # instantiate first to check validity - otutab <- new("otu_table", object, taxa_are_rows=taxa_are_rows) - # Want dummy species/sample index names if missing - if(taxa_are_rows){ - if(is.null(rownames(otutab))){ - rownames(otutab) <- paste("sp", 1:nrow(otutab), sep="") - } - if(is.null(colnames(otutab))){ - colnames(otutab) <- paste("sa", 1:ncol(otutab), sep="") - } - } else { - if(is.null(rownames(otutab))){ - rownames(otutab) <- paste("sa",1:nrow(otutab),sep="") - } - if(is.null(colnames(otutab))){ - colnames(otutab) <- paste("sp",1:ncol(otutab),sep="") - } - } - return(otutab) +setMethod("otu_table", "matrix", function(object, taxa_are_rows) { + # instantiate first to check validity + otutab <- new("otu_table", object, taxa_are_rows = taxa_are_rows) + # Want dummy species/sample index names if missing + if (taxa_are_rows) { + if (is.null(rownames(otutab))) { + rownames(otutab) <- paste("sp", 1:nrow(otutab), sep = "") + } + if (is.null(colnames(otutab))) { + colnames(otutab) <- paste("sa", 1:ncol(otutab), sep = "") + } + } else { + if (is.null(rownames(otutab))) { + rownames(otutab) <- paste("sa", 1:nrow(otutab), sep = "") + } + if (is.null(colnames(otutab))) { + colnames(otutab) <- paste("sp", 1:ncol(otutab), sep = "") + } + } + return(otutab) }) # # # Convert to matrix, then dispatch. #' @aliases otu_table,data.frame-method #' @rdname otu_table-methods -setMethod("otu_table", "data.frame", function(object, taxa_are_rows){ - otu_table(as(object, "matrix"), taxa_are_rows) +setMethod("otu_table", "data.frame", function(object, taxa_are_rows) { + otu_table(as(object, "matrix"), taxa_are_rows) }) # Any less-specific class, not inherited by those above. #' @aliases otu_table,ANY-method #' @rdname otu_table-methods -setMethod("otu_table", "ANY", function(object, errorIfNULL=TRUE){ - access(object, "otu_table", errorIfNULL) +setMethod("otu_table", "ANY", function(object, errorIfNULL = TRUE) { + access(object, "otu_table", errorIfNULL) }) -################################################################################ +################################################################################ #' Returns the total number of individuals observed from each species/taxa/OTU. #' #' A convenience function equivalent to rowSums or colSums, but where @@ -104,15 +106,15 @@ setMethod("otu_table", "ANY", function(object, errorIfNULL=TRUE){ #' taxa_sums(enterotype) #' data(esophagus) #' taxa_sums(esophagus) -taxa_sums <- function(x){ - x <- otu_table(x) - if( taxa_are_rows(x) ){ - rowSums(x) - } else { - colSums(x) - } +taxa_sums <- function(x) { + x <- otu_table(x) + if (taxa_are_rows(x)) { + rowSums(x) + } else { + colSums(x) + } } -################################################################################ +################################################################################ #' Returns the total number of individuals observed from each sample. #' #' A convenience function equivalent to rowSums or colSums, but where @@ -134,12 +136,12 @@ taxa_sums <- function(x){ #' sample_sums(enterotype) #' data(esophagus) #' sample_sums(esophagus) -sample_sums <- function(x){ - x <- otu_table(x) - if( taxa_are_rows(x) ){ - colSums(x) - } else { - rowSums(x) - } +sample_sums <- function(x) { + x <- otu_table(x) + if (taxa_are_rows(x)) { + colSums(x) + } else { + rowSums(x) + } } -################################################################################ +################################################################################ diff --git a/R/phylo-class.R b/R/phylo-class.R index ef8e7af3..ce7f5886 100644 --- a/R/phylo-class.R +++ b/R/phylo-class.R @@ -1,26 +1,23 @@ -# Methods related to using phylo in phyloseq, including -# phyloseq-internal calls to ape internals. -################################################################################ +# Methods related to using phylo in phyloseq, including phyloseq-internal calls +# to ape internals. #' Method for fixing problems with phylo-class trees in phyloseq #' #' For now this only entails replacing each missing (\code{NA}) branch-length #' value with 0.0. #' #' @keywords internal -setGeneric("fix_phylo", function(tree) standardGeneric("fix_phylo") ) +setGeneric("fix_phylo", function(tree) standardGeneric("fix_phylo")) #' @rdname fix_phylo #' @aliases fix_phylo,phylo-method -setMethod("fix_phylo", "phylo", function(tree){ +setMethod("fix_phylo", "phylo", function(tree) { tree$edge.length[which(is.na(tree$edge.length))] <- 0 return(tree) }) -################################################################################ -# Define horizontal position / node-ages by depth to root -# For instance, `xx` in `plot_tree` and `tipAges` in `fastUniFrac` +################################################################################ Define horizontal position / node-ages by depth to root For instance, `xx` in +################################################################################ `plot_tree` and `tipAges` in `fastUniFrac` #' @keywords internal -ape_node_depth_edge_length <- function(Ntip, Nnode, edge, Nedge, edge.length){ - .C(ape:::node_depth_edgelength, PACKAGE="ape", as.integer(Ntip), - as.integer(Nnode), as.integer(edge[, 1]), - as.integer(edge[, 2]), as.integer(Nedge), - as.double(edge.length), double(Ntip + Nnode))[[7]] -} \ No newline at end of file +ape_node_depth_edge_length <- function(Ntip, Nnode, edge, Nedge, edge.length) { + .C(ape:::node_depth_edgelength, PACKAGE = "ape", as.integer(Ntip), as.integer(Nnode), + as.integer(edge[, 1]), as.integer(edge[, 2]), as.integer(Nedge), as.double(edge.length), + double(Ntip + Nnode))[[7]] +} diff --git a/R/phyloseq-class.R b/R/phyloseq-class.R index ad125d26..f16e74cc 100644 --- a/R/phyloseq-class.R +++ b/R/phyloseq-class.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Build phyloseq-class objects from their components. #' #' \code{phyloseq()} is a constructor method, This is the main method @@ -38,93 +38,80 @@ #' # # phyloseq(phy_tree(GP), otu_table(GP), sample_data(GP)) #' # # phyloseq(otu_table(GP), tax_table(GP), sample_data(GP)) #' # # phyloseq(otu_table(GP), phy_tree(GP), tax_table(GP), sample_data(GP)) -phyloseq <- function(...){ - - arglist <- list(...) - - # Remove names from arglist. Will replace them based on their class - names(arglist) <- NULL - - # ignore all but component data classes. - arglist <- arglist[sapply(arglist, is.component.class)] - - # Make the name-replaced, splatted list - splatlist <- sapply(arglist, splat.phyloseq.objects) - - # rm any forbidden chars in index names (e.g. quotes - phylogenetic tree). - # Right now, only extra quotes are forbidden. - splatlist = lapply(splatlist, function(x){ - taxa_names(x) <- gsub("\"", "", taxa_names(x), fixed=TRUE) - taxa_names(x) <- gsub("\'", "", taxa_names(x), fixed=TRUE) - return(x) - }) - - #################### - ## Need to determine whether to - # (A) instantiate a new raw/uncleaned phyloseq object, or - # (B) return a single component, or - # (C) to stop with an error because of incorrect argument types. - if( length(splatlist) > length(get.component.classes()) ){ - stop("Too many components provided\n") - } else if( length(names(splatlist)) > length(unique(names(splatlist))) ){ - stop("Only one of each component type allowed.\n", - "For merging multiple objects of the same type/class, try merge_phyloseq(...)\n") - } else if( length(splatlist) == 1){ - return(arglist[[1]]) - } else { - # Instantiate the phyloseq-class object, ps. - ps <- do.call("new", c(list(Class="phyloseq"), splatlist) ) - } - - #################### - ## Reconcile the taxa and sample index names between components - ## in the newly-minted phyloseq object - shared_taxa = intersect_taxa(ps) - shared_samples = intersect_samples(ps) - - if( length(shared_taxa) < 1 ){ - stop("Problem with OTU/taxa indices among those you provided.\n", - "Check using intersect() and taxa_names()\n" - ) - } - if( length(shared_samples) < 1 ){ - stop("Problem with sample indices among those you provided.\n", - "Check using intersect() and taxa_names()\n" - ) - } - - # Start with OTU indices - ps = prune_taxa(shared_taxa, ps) - - # Verify there is more than one component - # that describes samples before attempting to reconcile. - ps = prune_samples(shared_samples, ps) - - # Force both samples and taxa indices to be in the same order. - ps = index_reorder(ps, "both") - - # Replace any NA branch-length values in the tree with zero. - if( !is.null(phy_tree(ps, FALSE)) ){ - ps@phy_tree <- fix_phylo(ps@phy_tree) - } +phyloseq <- function(...) { - return(ps) + arglist <- list(...) + + # Remove names from arglist. Will replace them based on their class + names(arglist) <- NULL + + # ignore all but component data classes. + arglist <- arglist[sapply(arglist, is.component.class)] + + # Make the name-replaced, splatted list + splatlist <- sapply(arglist, splat.phyloseq.objects) + + # rm any forbidden chars in index names (e.g. quotes - phylogenetic tree). Right + # now, only extra quotes are forbidden. + splatlist = lapply(splatlist, function(x) { + taxa_names(x) <- gsub("\"", "", taxa_names(x), fixed = TRUE) + taxa_names(x) <- gsub("'", "", taxa_names(x), fixed = TRUE) + return(x) + }) + + #################### Need to determine whether to (A) instantiate a new raw/uncleaned phyloseq + #################### object, or (B) return a single component, or (C) to stop with an error because + #################### of incorrect argument types. + if (length(splatlist) > length(get.component.classes())) { + stop("Too many components provided\n") + } else if (length(names(splatlist)) > length(unique(names(splatlist)))) { + stop("Only one of each component type allowed.\n", "For merging multiple objects of the same type/class, try merge_phyloseq(...)\n") + } else if (length(splatlist) == 1) { + return(arglist[[1]]) + } else { + # Instantiate the phyloseq-class object, ps. + ps <- do.call("new", c(list(Class = "phyloseq"), splatlist)) + } + + #################### Reconcile the taxa and sample index names between components in the + #################### newly-minted phyloseq object + shared_taxa = intersect_taxa(ps) + shared_samples = intersect_samples(ps) + + if (length(shared_taxa) < 1) { + stop("Problem with OTU/taxa indices among those you provided.\n", "Check using intersect() and taxa_names()\n") + } + if (length(shared_samples) < 1) { + stop("Problem with sample indices among those you provided.\n", "Check using intersect() and taxa_names()\n") + } + + # Start with OTU indices + ps = prune_taxa(shared_taxa, ps) + + # Verify there is more than one component that describes samples before + # attempting to reconcile. + ps = prune_samples(shared_samples, ps) + + # Force both samples and taxa indices to be in the same order. + ps = index_reorder(ps, "both") + + # Replace any NA branch-length values in the tree with zero. + if (!is.null(phy_tree(ps, FALSE))) { + ps@phy_tree <- fix_phylo(ps@phy_tree) + } + + return(ps) } -################################################################################ -# A relatively fast way to access from phyloseq object components -# f - function name as character string -# physeq - a phyloseq object (phyloseq-class instance) +################################################################################ A relatively fast way to access from phyloseq object components f - function +################################################################################ name as character string physeq - a phyloseq object (phyloseq-class instance) #' @keywords internal -f_comp_ps = function(f, physeq){ - sapply(names(getSlots("phyloseq")), function(i, ps){ - eval(parse(text=paste(f, "(ps@", i, ")", sep=""))) - }, physeq) +f_comp_ps = function(f, physeq) { + sapply(names(getSlots("phyloseq")), function(i, ps) { + eval(parse(text = paste(f, "(ps@", i, ")", sep = ""))) + }, physeq) } -# f_comp_ps("taxa_names", ps) -# f_comp_ps("ntaxa", ps) -# Reduce("union", f_comp_ps("taxa_names", ps)) -# Reduce("intersect", f_comp_ps("taxa_names", ps)) -################################################################################ +# f_comp_ps('taxa_names', ps) f_comp_ps('ntaxa', ps) Reduce('union', +# f_comp_ps('taxa_names', ps)) Reduce('intersect', f_comp_ps('taxa_names', ps)) #' Show the component objects classes and slot names. #' #' There are no arguments to this function. It returns a named character @@ -139,38 +126,43 @@ f_comp_ps = function(f, physeq){ #' #' @examples # #' #get.component.classes() -get.component.classes <- function(){ - # define classes vector - component.classes <- c("otu_table", "sample_data", "phylo", "taxonomyTable", "XStringSet") - # the names of component.classes needs to be the slot names to match getSlots / splat - names(component.classes) <- c("otu_table", "sam_data", "phy_tree", "tax_table", "refseq") - return(component.classes) +get.component.classes <- function() { + # define classes vector + component.classes <- c("otu_table", "sample_data", "phylo", "taxonomyTable", + "XStringSet") + # the names of component.classes needs to be the slot names to match getSlots / + # splat + names(component.classes) <- c("otu_table", "sam_data", "phy_tree", "tax_table", + "refseq") + return(component.classes) } # Explicitly define components/slots that describe taxa. #' @keywords internal -taxa.components = function(){ - # define classes vector - component.classes <- c("otu_table", "phylo", "taxonomyTable", "XStringSet") - # the names of component.classes needs to be the slot names to match getSlots / splat - names(component.classes) <- c("otu_table", "phy_tree", "tax_table", "refseq") - return(component.classes) +taxa.components = function() { + # define classes vector + component.classes <- c("otu_table", "phylo", "taxonomyTable", "XStringSet") + # the names of component.classes needs to be the slot names to match getSlots / + # splat + names(component.classes) <- c("otu_table", "phy_tree", "tax_table", "refseq") + return(component.classes) } # Explicitly define components/slots that describe samples. #' @keywords internal -sample.components = function(){ - # define classes vector - component.classes <- c("otu_table", "sample_data") - # the names of component.classes needs to be the slot names to match getSlots / splat - names(component.classes) <- c("otu_table", "sam_data") - return(component.classes) +sample.components = function() { + # define classes vector + component.classes <- c("otu_table", "sample_data") + # the names of component.classes needs to be the slot names to match getSlots / + # splat + names(component.classes) <- c("otu_table", "sam_data") + return(component.classes) } -# Returns TRUE if x is a component class, FALSE otherwise. -# This shows up over and over again in data infrastructure +# Returns TRUE if x is a component class, FALSE otherwise. This shows up over +# and over again in data infrastructure #' @keywords internal -is.component.class = function(x){ - inherits(x, get.component.classes()) +is.component.class = function(x) { + inherits(x, get.component.classes()) } -################################################################################ +################################################################################ #' Convert \code{\link{phyloseq-class}} into a named list of its non-empty components. #' #' This is used in internal handling functions, and one of its key features @@ -195,23 +187,26 @@ is.component.class = function(x){ #' @seealso merge_phyloseq #' @keywords internal #' @examples # -splat.phyloseq.objects <- function(x){ - if( is.component.class(x) ){ - # Check if class of x is among the component classes already (not phyloseq-class) - splatx <- list(x) - names(splatx) <- names(which(sapply(get.component.classes(), function(cclass, x) inherits(x, cclass), x))) - } else if( inherits(x, "phyloseq") ){ - # Else, check if it inherits from phyloseq, and if-so splat - slotnames = names(getSlots("phyloseq")) - allslots = sapply(slotnames, function(i, x){access(x, i, FALSE)}, x) - splatx = allslots[!sapply(allslots, is.null)] - } else { - # Otherwise, who knows what it is, silently return NULL. - return(NULL) - } - return(splatx) +splat.phyloseq.objects <- function(x) { + if (is.component.class(x)) { + # Check if class of x is among the component classes already (not phyloseq-class) + splatx <- list(x) + names(splatx) <- names(which(sapply(get.component.classes(), function(cclass, + x) inherits(x, cclass), x))) + } else if (inherits(x, "phyloseq")) { + # Else, check if it inherits from phyloseq, and if-so splat + slotnames = names(getSlots("phyloseq")) + allslots = sapply(slotnames, function(i, x) { + access(x, i, FALSE) + }, x) + splatx = allslots[!sapply(allslots, is.null)] + } else { + # Otherwise, who knows what it is, silently return NULL. + return(NULL) + } + return(splatx) } -################################################################################ +################################################################################ #' Return the non-empty slot names of a phyloseq object. #' #' Like \code{\link{getSlots}}, but returns the class name if argument @@ -235,10 +230,10 @@ splat.phyloseq.objects <- function(x){ #' getslots.phyloseq(GlobalPatterns) #' data(esophagus) #' getslots.phyloseq(esophagus) -getslots.phyloseq = function(physeq){ - names(splat.phyloseq.objects(physeq)) +getslots.phyloseq = function(physeq) { + names(splat.phyloseq.objects(physeq)) } -################################################################################ +################################################################################ #' Universal slot accessor function for phyloseq-class. #' #' This function is used internally by many accessors and in @@ -270,37 +265,37 @@ getslots.phyloseq = function(physeq){ #' @export #' @examples # #' ## data(GlobalPatterns) -#' ## access(GlobalPatterns, "tax_table") -#' ## access(GlobalPatterns, "phy_tree") -#' ## access(otu_table(GlobalPatterns), "otu_table") +#' ## access(GlobalPatterns, 'tax_table') +#' ## access(GlobalPatterns, 'phy_tree') +#' ## access(otu_table(GlobalPatterns), 'otu_table') #' ## # Should return NULL: -#' ## access(otu_table(GlobalPatterns), "sample_data") -#' ## access(otuTree(GlobalPatterns), "sample_data") -#' ## access(otuSam(GlobalPatterns), "phy_tree") -access <- function(physeq, slot, errorIfNULL=FALSE){ - if( is.component.class(physeq) ){ - # If physeq is a component class, might return as-is. Depends on slot. - if( inherits(physeq, get.component.classes()[slot]) ){ - # if slot-name matches, return physeq as-is. - out = physeq - } else { - # If slot/component mismatch, set out to NULL. Test later if this is an error. - out = NULL - } - } else if(!slot %in% slotNames(physeq) ){ - # If slot is invalid, set out to NULL. Test later if this is an error. - out = NULL - } else { - # By elimination, must be valid. Access slot - out = eval(parse(text=paste("physeq@", slot, sep=""))) - } - if( errorIfNULL & is.null(out) ){ - # Only error regarding a NULL return value if errorIfNULL is TRUE. - stop(slot, " slot is empty.") - } - return(out) +#' ## access(otu_table(GlobalPatterns), 'sample_data') +#' ## access(otuTree(GlobalPatterns), 'sample_data') +#' ## access(otuSam(GlobalPatterns), 'phy_tree') +access <- function(physeq, slot, errorIfNULL = FALSE) { + if (is.component.class(physeq)) { + # If physeq is a component class, might return as-is. Depends on slot. + if (inherits(physeq, get.component.classes()[slot])) { + # if slot-name matches, return physeq as-is. + out = physeq + } else { + # If slot/component mismatch, set out to NULL. Test later if this is an error. + out = NULL + } + } else if (!slot %in% slotNames(physeq)) { + # If slot is invalid, set out to NULL. Test later if this is an error. + out = NULL + } else { + # By elimination, must be valid. Access slot + out = eval(parse(text = paste("physeq@", slot, sep = ""))) + } + if (errorIfNULL & is.null(out)) { + # Only error regarding a NULL return value if errorIfNULL is TRUE. + stop(slot, " slot is empty.") + } + return(out) } -################################################################################ +################################################################################ #' Returns the intersection of species and samples for the components of x #' #' This function is used internally as part of the infrastructure to ensure that @@ -322,18 +317,18 @@ access <- function(physeq, slot, errorIfNULL=FALSE){ #' @examples # #' ## data(GlobalPatterns) #' ## head(intersect_taxa(GlobalPatterns), 10) -intersect_taxa <- function(x){ - taxa_vectors = f_comp_ps("taxa_names", x) - taxa_vectors = taxa_vectors[!sapply(taxa_vectors, is.null)] - return( Reduce("intersect", taxa_vectors) ) +intersect_taxa <- function(x) { + taxa_vectors = f_comp_ps("taxa_names", x) + taxa_vectors = taxa_vectors[!sapply(taxa_vectors, is.null)] + return(Reduce("intersect", taxa_vectors)) } #' @keywords internal -intersect_samples <- function(x){ - sample_vectors = f_comp_ps("sample_names", x) - sample_vectors = sample_vectors[!sapply(sample_vectors, is.null)] - return( Reduce("intersect", sample_vectors) ) +intersect_samples <- function(x) { + sample_vectors = f_comp_ps("sample_names", x) + sample_vectors = sample_vectors[!sapply(sample_vectors, is.null)] + return(Reduce("intersect", sample_vectors)) } -################################################################################ +################################################################################ #' Force index order of phyloseq objects #' #' @usage index_reorder(ps, index_type) @@ -341,60 +336,59 @@ intersect_samples <- function(x){ #' @param ps (Required). A \code{\link{phyloseq-class}} instance. #' @param index_type (Optional). A character string #' specifying the indices to properly order. -#' Supported values are \code{c("both", "taxa", "samples")}. -#' Default is \code{"both"}, meaning samples and taxa indices +#' Supported values are \code{c('both', 'taxa', 'samples')}. +#' Default is \code{'both'}, meaning samples and taxa indices #' will be checked/re-ordered. #' #' @keywords internal #' @docType methods #' #' @examples -#' ## data("GlobalPatterns") +#' ## data('GlobalPatterns') #' ## GP = index_reorder(GlobalPatterns) -setGeneric("index_reorder", function(ps, index_type) standardGeneric("index_reorder") ) +setGeneric("index_reorder", function(ps, index_type) standardGeneric("index_reorder")) #' @rdname index_reorder #' @aliases index_reorder,phyloseq-method -setMethod("index_reorder", "phyloseq", function(ps, index_type="both"){ - if( index_type %in% c("both", "taxa") ){ - ## ENFORCE CONSISTENT ORDER OF TAXA INDICES. - if( !is.null(phy_tree(ps, FALSE)) ){ - # If there is a phylogenetic tree included, - # re-order based on that, and reorder the otu_table - # The new taxa order, torder, will also trickle down to - # the taxonomyTable or XStringSet if present. - torder = taxa_names(phy_tree(ps)) - # Re-order the OTU table - if( taxa_are_rows(ps) ){ - ps@otu_table = otu_table(ps)[torder, ] - } else { - ps@otu_table = otu_table(ps)[, torder] - } - } else { - # Else, re-order anything/everything else based on the OTU-table order - torder = taxa_names(otu_table(ps)) - } - if( !is.null(tax_table(ps, FALSE)) ){ - # If there is a taxonomyTable, re-order that too. - ps@tax_table = tax_table(ps)[torder, ] - } - if( !is.null(refseq(ps, FALSE)) ){ - # If there is a XStringSet, re-order that too. - ps@refseq = refseq(ps)[torder] - } - } - - if( index_type %in% c("both", "samples") ){ - ## ENFORCE CONSISTENT ORDER OF SAMPLE INDICES - # Errors can creep when sample indices do not match. - if( !is.null(sample_data(ps, FALSE)) ){ - # check first that ps has sample_data - if( !all(sample_names(otu_table(ps)) == rownames(sample_data(ps))) ){ - # Reorder the sample_data rows so that they match the otu_table order. - ps@sam_data <- sample_data(ps)[sample_names(otu_table(ps)), ] - } - } - } - - return(ps) +setMethod("index_reorder", "phyloseq", function(ps, index_type = "both") { + if (index_type %in% c("both", "taxa")) { + ## ENFORCE CONSISTENT ORDER OF TAXA INDICES. + if (!is.null(phy_tree(ps, FALSE))) { + # If there is a phylogenetic tree included, re-order based on that, and reorder + # the otu_table The new taxa order, torder, will also trickle down to the + # taxonomyTable or XStringSet if present. + torder = taxa_names(phy_tree(ps)) + # Re-order the OTU table + if (taxa_are_rows(ps)) { + ps@otu_table = otu_table(ps)[torder, ] + } else { + ps@otu_table = otu_table(ps)[, torder] + } + } else { + # Else, re-order anything/everything else based on the OTU-table order + torder = taxa_names(otu_table(ps)) + } + if (!is.null(tax_table(ps, FALSE))) { + # If there is a taxonomyTable, re-order that too. + ps@tax_table = tax_table(ps)[torder, ] + } + if (!is.null(refseq(ps, FALSE))) { + # If there is a XStringSet, re-order that too. + ps@refseq = refseq(ps)[torder] + } + } + + if (index_type %in% c("both", "samples")) { + ## ENFORCE CONSISTENT ORDER OF SAMPLE INDICES Errors can creep when sample indices + ## do not match. + if (!is.null(sample_data(ps, FALSE))) { + # check first that ps has sample_data + if (!all(sample_names(otu_table(ps)) == rownames(sample_data(ps)))) { + # Reorder the sample_data rows so that they match the otu_table order. + ps@sam_data <- sample_data(ps)[sample_names(otu_table(ps)), ] + } + } + } + + return(ps) }) -################################################################################ \ No newline at end of file +################################################################################ diff --git a/R/plot-methods.R b/R/plot-methods.R index 9e770653..84440212 100644 --- a/R/plot-methods.R +++ b/R/plot-methods.R @@ -1,10 +1,4 @@ -# # extension of plot methods for phyloseq object. -# -################################################################################ -################################################################################ -################################################################################ -################################################################################ #' Generic plot defaults for phyloseq. #' #' There are many useful examples of phyloseq graphics functions in the @@ -44,22 +38,23 @@ #' @examples #' data(esophagus) #' plot_phyloseq(esophagus) -setGeneric("plot_phyloseq", function(physeq, ...){ standardGeneric("plot_phyloseq") }) +setGeneric("plot_phyloseq", function(physeq, ...) { + standardGeneric("plot_phyloseq") +}) #' @aliases plot_phyloseq,phyloseq-method #' @rdname plot_phyloseq-methods -setMethod("plot_phyloseq", "phyloseq", function(physeq, ...){ - if( all(c("otu_table", "sample_data", "phy_tree") %in% getslots.phyloseq(physeq)) ){ - plot_tree(esophagus, color="samples") - } else if( all(c("otu_table", "sample_data", "tax_table") %in% getslots.phyloseq(physeq) ) ){ - plot_bar(physeq, ...) - } else if( all(c("otu_table", "phy_tree") %in% getslots.phyloseq(physeq)) ){ - plot_tree(esophagus, color="samples") - } else { - plot_richness(physeq) - } +setMethod("plot_phyloseq", "phyloseq", function(physeq, ...) { + if (all(c("otu_table", "sample_data", "phy_tree") %in% getslots.phyloseq(physeq))) { + plot_tree(esophagus, color = "samples") + } else if (all(c("otu_table", "sample_data", "tax_table") %in% getslots.phyloseq(physeq))) { + plot_bar(physeq, ...) + } else if (all(c("otu_table", "phy_tree") %in% getslots.phyloseq(physeq))) { + plot_tree(esophagus, color = "samples") + } else { + plot_richness(physeq) + } }) -################################################################################ -################################################################################ +################################################################################ #' Microbiome Network Plot using ggplot2 #' #' There are many useful examples of phyloseq network graphics in the @@ -79,11 +74,11 @@ setMethod("plot_phyloseq", "phyloseq", function(physeq, ...){ #' nodes is below a potentially arbitrary threshold, #' and special care should be given to considering the choice of this threshold. #' -#' @usage plot_network(g, physeq=NULL, type="samples", -#' color=NULL, shape=NULL, point_size=4, alpha=1, -#' label="value", hjust = 1.35, -#' line_weight=0.5, line_color=color, line_alpha=0.4, -#' layout.method=layout.fruchterman.reingold, title=NULL) +#' @usage plot_network(g, physeq=NULL, type='samples', +#' \tcolor=NULL, shape=NULL, point_size=4, alpha=1, +#' \tlabel='value', hjust = 1.35, +#' \tline_weight=0.5, line_color=color, line_alpha=0.4, +#' \tlayout.method=layout.fruchterman.reingold, title=NULL) #' #' @param g (Required). An \code{igraph}-class object created #' either by the convenience wrapper \code{\link{make_network}}, @@ -92,11 +87,11 @@ setMethod("plot_phyloseq", "phyloseq", function(physeq, ...){ #' @param physeq (Optional). Default \code{NULL}. #' A \code{\link{phyloseq-class}} object on which \code{g} is based. #' -#' @param type (Optional). Default \code{"samples"}. +#' @param type (Optional). Default \code{'samples'}. #' Whether the network represented in the primary argument, \code{g}, #' is samples or taxa/OTUs. -#' Supported arguments are \code{"samples"}, \code{"taxa"}, -#' where \code{"taxa"} indicates using the taxa indices, +#' Supported arguments are \code{'samples'}, \code{'taxa'}, +#' where \code{'taxa'} indicates using the taxa indices, #' whether they actually represent species or some other taxonomic rank. #' #' @param color (Optional). Default \code{NULL}. @@ -113,7 +108,7 @@ setMethod("plot_phyloseq", "phyloseq", function(physeq, ...){ #' @param alpha (Optional). Default \code{1}. #' A value between 0 and 1 for the alpha transparency of the vertex points. #' -#' @param label (Optional). Default \code{"value"}. +#' @param label (Optional). Default \code{'value'}. #' The name of the sample variable in \code{physeq} to use for #' labelling the vertex points. #' @@ -165,92 +160,83 @@ setMethod("plot_phyloseq", "phyloseq", function(physeq, ...){ #' #' data(enterotype) #' ig <- make_network(enterotype, max.dist=0.3) -#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) +#' plot_network(ig, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) #' # Change distance parameter #' ig <- make_network(enterotype, max.dist=0.2) -#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) -plot_network <- function(g, physeq=NULL, type="samples", - color=NULL, shape=NULL, point_size=4, alpha=1, - label="value", hjust = 1.35, - line_weight=0.5, line_color=color, line_alpha=0.4, - layout.method=layout.fruchterman.reingold, title=NULL){ - - if( vcount(g) < 2 ){ +#' plot_network(ig, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) +plot_network <- function(g, physeq = NULL, type = "samples", color = NULL, shape = NULL, + point_size = 4, alpha = 1, label = "value", hjust = 1.35, line_weight = 0.5, + line_color = color, line_alpha = 0.4, layout.method = layout.fruchterman.reingold, + title = NULL) { + + if (vcount(g) < 2) { # Report a warning if the graph is empty - stop("The graph you provided, `g`, has too few vertices. - Check your graph, or the output of `make_network` and try again.") + stop("The graph you provided, `g`, has too few vertices. \n Check your graph, or the output of `make_network` and try again.") + } + + # disambiguate species/OTU/taxa as argument type... + if (type %in% c("taxa", "species", "OTUs", "otus", "otu")) { + type <- "taxa" + } + + # Make the edge-coordinates data.frame + edgeDF <- data.frame(get.edgelist(g)) + edgeDF$id <- 1:length(edgeDF[, 1]) + + # Make the vertices-coordinates data.frame + vertDF <- layout.method(g) + colnames(vertDF) <- c("x", "y") + vertDF <- data.frame(value = get.vertex.attribute(g, "name"), vertDF) + + # If phyloseq object provided, AND it has the relevant additional data THEN add + # it to vertDF + if (!is.null(physeq)) { + extraData <- NULL + if (type == "samples" & !is.null(sample_data(physeq, FALSE))) { + extraData = data.frame(sample_data(physeq))[as.character(vertDF$value), + , drop = FALSE] + } else if (type == "taxa" & !is.null(tax_table(physeq, FALSE))) { + extraData = data.frame(tax_table(physeq))[as.character(vertDF$value), + , drop = FALSE] + } + # Only mod vertDF if extraData exists + if (!is.null(extraData)) { + vertDF <- data.frame(vertDF, extraData) + } } - # disambiguate species/OTU/taxa as argument type... - if( type %in% c("taxa", "species", "OTUs", "otus", "otu") ){ - type <- "taxa" - } - - # Make the edge-coordinates data.frame - edgeDF <- data.frame(get.edgelist(g)) - edgeDF$id <- 1:length(edgeDF[, 1]) - - # Make the vertices-coordinates data.frame - vertDF <- layout.method(g) - colnames(vertDF) <- c("x", "y") - vertDF <- data.frame(value=get.vertex.attribute(g, "name"), vertDF) - - # If phyloseq object provided, - # AND it has the relevant additional data - # THEN add it to vertDF - if( !is.null(physeq) ){ - extraData <- NULL - if( type == "samples" & !is.null(sample_data(physeq, FALSE)) ){ - extraData = data.frame(sample_data(physeq))[as.character(vertDF$value), , drop=FALSE] - } else if( type == "taxa" & !is.null(tax_table(physeq, FALSE)) ){ - extraData = data.frame(tax_table(physeq))[as.character(vertDF$value), , drop=FALSE] - } - # Only mod vertDF if extraData exists - if( !is.null(extraData) ){ - vertDF <- data.frame(vertDF, extraData) - } - } - - # Combine vertex and edge coordinate data.frames - graphDF <- merge(reshape2::melt(edgeDF, id="id"), vertDF, by = "value") - - # Initialize the ggplot - p <- ggplot(vertDF, aes(x, y)) - - # Strip all the typical annotations from the plot, leave the legend - p <- p + theme_bw() + - theme( - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - axis.text.x = element_blank(), - axis.text.y = element_blank(), - axis.title.x = element_blank(), - axis.title.y = element_blank(), - axis.ticks = element_blank(), - panel.border = element_blank() - ) - - # Add the graph vertices as points - p <- p + geom_point(aes_string(color=color, shape=shape), size=point_size, na.rm=TRUE) - - # Add the text labels - if( !is.null(label) ){ - p <- p + geom_text(aes_string(label=label), size = 2, hjust=hjust, na.rm=TRUE) - } - - # Add the edges: - p <- p + geom_line(aes_string(group="id", color=line_color), - graphDF, size=line_weight, alpha=line_alpha, na.rm=TRUE) - - # Optionally add a title to the plot - if( !is.null(title) ){ - p <- p + ggtitle(title) - } - - return(p) + # Combine vertex and edge coordinate data.frames + graphDF <- merge(reshape2::melt(edgeDF, id = "id"), vertDF, by = "value") + + # Initialize the ggplot + p <- ggplot(vertDF, aes(x, y)) + + # Strip all the typical annotations from the plot, leave the legend + p <- p + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + axis.text.x = element_blank(), axis.text.y = element_blank(), axis.title.x = element_blank(), + axis.title.y = element_blank(), axis.ticks = element_blank(), panel.border = element_blank()) + + # Add the graph vertices as points + p <- p + geom_point(aes_string(color = color, shape = shape), size = point_size, + na.rm = TRUE) + + # Add the text labels + if (!is.null(label)) { + p <- p + geom_text(aes_string(label = label), size = 2, hjust = hjust, na.rm = TRUE) + } + + # Add the edges: + p <- p + geom_line(aes_string(group = "id", color = line_color), graphDF, size = line_weight, + alpha = line_alpha, na.rm = TRUE) + + # Optionally add a title to the plot + if (!is.null(title)) { + p <- p + ggtitle(title) + } + + return(p) } -################################################################################ -################################################################################ +################################################################################ #' Microbiome Network Plot using ggplot2 #' #' There are many useful examples of phyloseq network graphics in the @@ -277,7 +263,7 @@ plot_network <- function(g, physeq=NULL, type="samples", #' @param physeq (Required). #' The \code{\link{phyloseq-class}} object that you want to represent as a network. #' -#' @param distance (Optional). Default is \code{"bray"}. +#' @param distance (Optional). Default is \code{'bray'}. #' Can be either a distance method supported by \code{\link[phyloseq]{distance}}, #' or an already-computed \code{\link{dist}}-class with labels that match #' the indices implied by both the \code{physeq} and \code{type} arguments @@ -289,14 +275,14 @@ plot_network <- function(g, physeq=NULL, type="samples", #' The maximum distance value between two vertices #' to connect with an edge in the graphic. #' -#' @param type (Optional). Default \code{"samples"}. +#' @param type (Optional). Default \code{'samples'}. #' Whether the network represented in the primary argument, \code{g}, #' is samples or taxa/OTUs. -#' Supported arguments are \code{"samples"}, \code{"taxa"}, -#' where \code{"taxa"} indicates using the taxa indices, +#' Supported arguments are \code{'samples'}, \code{'taxa'}, +#' where \code{'taxa'} indicates using the taxa indices, #' whether they actually represent species or some other taxonomic rank. #' -#' @param laymeth (Optional). Default \code{"fruchterman.reingold"}. +#' @param laymeth (Optional). Default \code{'fruchterman.reingold'}. #' A character string that indicates the method that will determine #' the placement of vertices, typically based on conectedness of vertices #' and the number of vertices. @@ -305,8 +291,8 @@ plot_network <- function(g, physeq=NULL, type="samples", #' and see \code{\link[igraph]{layout.auto}} for descriptions of various #' alternative layout method options supported here. #' The character string argument should match exactly the -#' layout function name with the \code{"layout."} omitted. -#' Try \code{laymeth="list"} to see a list of options. +#' layout function name with the \code{'layout.'} omitted. +#' Try \code{laymeth='list'} to see a list of options. #' #' @param color (Optional). Default \code{NULL}. #' The name of the sample variable in \code{physeq} to use for color mapping @@ -367,95 +353,86 @@ plot_network <- function(g, physeq=NULL, type="samples", #' @export #' @examples #' data(enterotype) -#' plot_net(enterotype, color="SeqTech", maxdist = 0.3) -#' plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "auto") -#' plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "svd") -#' plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "circle") -#' plot_net(enterotype, color="SeqTech", shape="Enterotype", maxdist = 0.3, laymeth = "circle") -plot_net <- function(physeq, distance="bray", type="samples", maxdist = 0.7, - laymeth="fruchterman.reingold", color=NULL, shape=NULL, rescale=FALSE, - point_size=5, point_alpha=1, point_label=NULL, hjust = 1.35, title=NULL){ +#' plot_net(enterotype, color='SeqTech', maxdist = 0.3) +#' plot_net(enterotype, color='SeqTech', maxdist = 0.3, laymeth = 'auto') +#' plot_net(enterotype, color='SeqTech', maxdist = 0.3, laymeth = 'svd') +#' plot_net(enterotype, color='SeqTech', maxdist = 0.3, laymeth = 'circle') +#' plot_net(enterotype, color='SeqTech', shape='Enterotype', maxdist = 0.3, laymeth = 'circle') +plot_net <- function(physeq, distance = "bray", type = "samples", maxdist = 0.7, + laymeth = "fruchterman.reingold", color = NULL, shape = NULL, rescale = FALSE, + point_size = 5, point_alpha = 1, point_label = NULL, hjust = 1.35, title = NULL) { # Supported layout methods - available_layouts = list( - auto = layout.auto, - random = layout.random, - circle = layout.circle, - sphere = layout.sphere, - fruchterman.reingold = layout.fruchterman.reingold, - kamada.kawai = layout.kamada.kawai, - spring = layout.spring, - reingold.tilford = layout.reingold.tilford, - fruchterman.reingold.grid = layout.fruchterman.reingold.grid, - lgl = layout.lgl, - graphopt = layout.graphopt, - svd = layout.svd - ) - if(laymeth=="list"){ + available_layouts = list(auto = layout.auto, random = layout.random, circle = layout.circle, + sphere = layout.sphere, fruchterman.reingold = layout.fruchterman.reingold, + kamada.kawai = layout.kamada.kawai, spring = layout.spring, reingold.tilford = layout.reingold.tilford, + fruchterman.reingold.grid = layout.fruchterman.reingold.grid, lgl = layout.lgl, + graphopt = layout.graphopt, svd = layout.svd) + if (laymeth == "list") { return(names(available_layouts)) } - if(!laymeth %in% names(available_layouts)){ + if (!laymeth %in% names(available_layouts)) { stop("Unsupported argument to `laymeth` option. Please use an option returned by `plot_net(laymeth='list')`") } - # 1. - # Calculate Distance - if( inherits(distance, "dist") ){ + # 1. Calculate Distance + if (inherits(distance, "dist")) { # If distance a distance object, use it rather than re-calculate Distance <- distance # Check that it at least has (a subset of) the correct labels - possibleVertexLabels = switch(type, taxa=taxa_names(physeq), samples=sample_names(physeq)) - if( !all(attributes(distance)$Labels %in% possibleVertexLabels) ){ + possibleVertexLabels = switch(type, taxa = taxa_names(physeq), samples = sample_names(physeq)) + if (!all(attributes(distance)$Labels %in% possibleVertexLabels)) { stop("Some or all `distance` index labels do not match ", type, " names in `physeq`") } } else { # Coerce to character and attempt distance calculation - scaled_distance = function(physeq, method, type, rescale=TRUE){ + scaled_distance = function(physeq, method, type, rescale = TRUE) { Dist = distance(physeq, method, type) - if(rescale){ + if (rescale) { # rescale the distance matrix to be [0, 1] - Dist <- Dist / max(Dist, na.rm=TRUE) - Dist <- Dist - min(Dist, na.rm=TRUE) + Dist <- Dist/max(Dist, na.rm = TRUE) + Dist <- Dist - min(Dist, na.rm = TRUE) } return(Dist) } distance <- as(distance[1], "character") Distance = scaled_distance(physeq, distance, type, rescale) } - # 2. - # Create edge data.table - dist_to_edge_table = function(Dist, MaxDistance=NULL, vnames = c("v1", "v2")){ + # 2. Create edge data.table + dist_to_edge_table = function(Dist, MaxDistance = NULL, vnames = c("v1", "v2")) { dmat <- as.matrix(Dist) # Set duplicate entries and self-links to Inf dmat[upper.tri(dmat, diag = TRUE)] <- Inf - LinksData = data.table(reshape2::melt(dmat, varnames=vnames, as.is = TRUE)) + LinksData = data.table(reshape2::melt(dmat, varnames = vnames, as.is = TRUE)) setnames(LinksData, old = "value", new = "Distance") # Remove self-links and duplicate links LinksData <- LinksData[is.finite(Distance), ] # Remove entries above the threshold, MaxDistance - if(!is.null(MaxDistance)){ + if (!is.null(MaxDistance)) { LinksData <- LinksData[Distance < MaxDistance, ] } return(LinksData) } LinksData0 = dist_to_edge_table(Distance, maxdist) - # 3. Create vertex layout - # Make the vertices-coordinates data.table - vertex_layout = function(LinksData, physeq=NULL, type="samples", - laymeth=igraph::layout.fruchterman.reingold, ...){ - # `physeq` can be anything, only has effect when non-NULL returned by sample_data or tax_table - g = igraph::graph.data.frame(LinksData, directed=FALSE) - vertexDT = data.table(laymeth(g, ...), - vertex=get.vertex.attribute(g, "name")) + # 3. Create vertex layout Make the vertices-coordinates data.table + vertex_layout = function(LinksData, physeq = NULL, type = "samples", laymeth = igraph::layout.fruchterman.reingold, + ...) { + # `physeq` can be anything, only has effect when non-NULL returned by sample_data + # or tax_table + g = igraph::graph.data.frame(LinksData, directed = FALSE) + vertexDT = data.table(laymeth(g, ...), vertex = get.vertex.attribute(g, "name")) setkey(vertexDT, vertex) setnames(vertexDT, old = c(1, 2), new = c("x", "y")) extraData = NULL - if( type == "samples" & !is.null(sample_data(physeq, FALSE)) ){ - extraData <- data.table(data.frame(sample_data(physeq)), key = "rn", keep.rownames = TRUE) - } else if( type == "taxa" & !is.null(tax_table(physeq, FALSE)) ){ - extraData <- data.table(as(tax_table(physeq), "matrix"), key = "rn", keep.rownames = TRUE) + if (type == "samples" & !is.null(sample_data(physeq, FALSE))) { + extraData <- data.table(data.frame(sample_data(physeq)), key = "rn", + keep.rownames = TRUE) + } else if (type == "taxa" & !is.null(tax_table(physeq, FALSE))) { + extraData <- data.table(as(tax_table(physeq), "matrix"), key = "rn", + keep.rownames = TRUE) } # Only mod vertexDT if extraData exists - if(!is.null(extraData)){ - # Join vertexDT, extraData using data.table syntax. Presumes `vertex` is key in both. + if (!is.null(extraData)) { + # Join vertexDT, extraData using data.table syntax. Presumes `vertex` is key in + # both. setnames(extraData, old = "rn", new = "vertex") vertexDT <- vertexDT[extraData] vertexDT <- vertexDT[!is.na(x), ] @@ -463,49 +440,38 @@ plot_net <- function(physeq, distance="bray", type="samples", maxdist = 0.7, return(vertexDT) } vertexDT = vertex_layout(LinksData0, physeq, type, available_layouts[[laymeth]]) - # 4. - # Update the links layout for ggplot: x, y, xend, yend - link_layout = function(LinksData, vertexDT){ + # 4. Update the links layout for ggplot: x, y, xend, yend + link_layout = function(LinksData, vertexDT) { linkstart = vertexDT[LinksData$v1, x, y] linkend = vertexDT[LinksData$v2, x, y] setnames(linkend, old = c("y", "x"), new = c("yend", "xend")) LinksData <- cbind(LinksData, linkstart, linkend) - return(LinksData) + return(LinksData) } LinksData = link_layout(LinksData0, vertexDT) - # 5. - # Define ggplot2 network plot - links_to_ggplot = function(LinksData, vertexDT, vertmap=aes(x, y)){ - p0 = ggplot(data=LinksData) + - geom_segment(aes(x, y, xend=xend, yend=yend, size=Distance, alpha=Distance)) + - geom_point(mapping = vertmap, data=vertexDT, size=5, na.rm = TRUE) + - scale_alpha(range = c(1, 0.1)) + - scale_size(range = c(2, 0.25)) + # 5. Define ggplot2 network plot + links_to_ggplot = function(LinksData, vertexDT, vertmap = aes(x, y)) { + p0 = ggplot(data = LinksData) + geom_segment(aes(x, y, xend = xend, yend = yend, + size = Distance, alpha = Distance)) + geom_point(mapping = vertmap, data = vertexDT, + size = 5, na.rm = TRUE) + scale_alpha(range = c(1, 0.1)) + scale_size(range = c(2, + 0.25)) return(p0) } - p = links_to_ggplot(LinksData, vertexDT, - vertmap = aes_string(x="x", y="y", color=color, shape=shape)) + p = links_to_ggplot(LinksData, vertexDT, vertmap = aes_string(x = "x", y = "y", + color = color, shape = shape)) # Add labels - if(!is.null(point_label)){ - p <- p + geom_text(aes_string(x="x", y="y", label=point_label), - data = vertexDT, size = 2, hjust = hjust, na.rm = TRUE) + if (!is.null(point_label)) { + p <- p + geom_text(aes_string(x = "x", y = "y", label = point_label), data = vertexDT, + size = 2, hjust = hjust, na.rm = TRUE) } # Add default theme - net_theme = theme( - panel.grid.major = element_blank(), - panel.grid.minor = element_blank(), - axis.text.x = element_blank(), - axis.text.y = element_blank(), - axis.title.x = element_blank(), - axis.title.y = element_blank(), - axis.ticks = element_blank(), - panel.border = element_blank() - ) + net_theme = theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + axis.text.x = element_blank(), axis.text.y = element_blank(), axis.title.x = element_blank(), + axis.title.y = element_blank(), axis.ticks = element_blank(), panel.border = element_blank()) p <- p + theme_bw() + net_theme return(p) } -################################################################################ -################################################################################ +################################################################################ #' Plot alpha diversity, flexibly with ggplot2 #' #' There are many useful examples of alpha-diversity graphics in the @@ -528,7 +494,7 @@ plot_net <- function(physeq, distance="bray", type="samples", maxdist = 0.7, #' plot might be kindof strange, and not the intended behavior of this function). #' The following are the names you will want to avoid using in \code{x} or \code{color}: #' -#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}. +#' \code{c('Observed', 'Chao1', 'ACE', 'Shannon', 'Simpson', 'InvSimpson', 'Fisher')}. #' #' @param physeq (Required). \code{\link{phyloseq-class}}, or alternatively, #' an \code{\link{otu_table-class}}. The data about which you want to estimate. @@ -542,7 +508,7 @@ plot_net <- function(physeq, distance="bray", type="samples", maxdist = 0.7, #' or a custom supplied vector with length equal to the number of samples #' in the dataset (nsamples(physeq)). #' -#' The default value is \code{"samples"}, which will map each sample's name +#' The default value is \code{'samples'}, which will map each sample's name #' to a separate horizontal position in the plot. #' #' @param color (Optional). Default \code{NULL}. @@ -571,11 +537,11 @@ plot_net <- function(physeq, distance="bray", type="samples", maxdist = 0.7, #' @param title (Optional). Default \code{NULL}. Character string. #' The main title for the graphic. #' -#' @param scales (Optional). Default \code{"free_y"}. +#' @param scales (Optional). Default \code{'free_y'}. #' Whether to let vertical axis have free scale that adjusts to #' the data in each panel. #' This argument is passed to \code{\link[ggplot2]{facet_wrap}}. -#' If set to \code{"fixed"}, a single vertical scale will +#' If set to \code{'fixed'}, a single vertical scale will #' be used in all panels. This can obscure values if the #' \code{measures} argument includes both #' richness estimates and diversity indices, for example. @@ -595,7 +561,7 @@ plot_net <- function(physeq, distance="bray", type="samples", maxdist = 0.7, #' Alternatively, you can specify one or more measures #' as a character vector of measure names. #' Values must be among those supported: -#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}. +#' \code{c('Observed', 'Chao1', 'ACE', 'Shannon', 'Simpson', 'InvSimpson', 'Fisher')}. #' #' @param sortby (Optional). A character string subset of \code{measures} argument. #' Sort x-indices by the mean of one or more \code{measures}, @@ -623,72 +589,71 @@ plot_net <- function(physeq, distance="bray", type="samples", maxdist = 0.7, #' @examples #' ## There are many more interesting examples at the phyloseq online tutorials. #' ## http://joey711.github.io/phyloseq/plot_richness-examples -#' data("soilrep") -#' plot_richness(soilrep, measures=c("InvSimpson", "Fisher")) -#' plot_richness(soilrep, "Treatment", "warmed", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3) -#' data("GlobalPatterns") -#' plot_richness(GlobalPatterns, x="SampleType", measures=c("InvSimpson")) -#' plot_richness(GlobalPatterns, x="SampleType", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3) -#' plot_richness(GlobalPatterns, x="SampleType", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3, sortby = "Chao1") -plot_richness = function(physeq, x="samples", color=NULL, shape=NULL, title=NULL, - scales="free_y", nrow=1, shsi=NULL, measures=NULL, sortby=NULL){ +#' data('soilrep') +#' plot_richness(soilrep, measures=c('InvSimpson', 'Fisher')) +#' plot_richness(soilrep, 'Treatment', 'warmed', measures=c('Chao1', 'ACE', 'InvSimpson'), nrow=3) +#' data('GlobalPatterns') +#' plot_richness(GlobalPatterns, x='SampleType', measures=c('InvSimpson')) +#' plot_richness(GlobalPatterns, x='SampleType', measures=c('Chao1', 'ACE', 'InvSimpson'), nrow=3) +#' plot_richness(GlobalPatterns, x='SampleType', measures=c('Chao1', 'ACE', 'InvSimpson'), nrow=3, sortby = 'Chao1') +plot_richness = function(physeq, x = "samples", color = NULL, shape = NULL, title = NULL, + scales = "free_y", nrow = 1, shsi = NULL, measures = NULL, sortby = NULL) { # Calculate the relevant alpha-diversity measures - erDF = estimate_richness(physeq, split=TRUE, measures=measures) + erDF = estimate_richness(physeq, split = TRUE, measures = measures) # Measures may have been renamed in `erDF`. Replace it with the name from erDF measures = colnames(erDF) - # Define "measure" variables and s.e. labels, for melting. + # Define 'measure' variables and s.e. labels, for melting. ses = colnames(erDF)[grep("^se\\.", colnames(erDF))] # Remove any S.E. from `measures` measures = measures[!measures %in% ses] - # Make the plotting data.frame. - # This coerces to data.frame, required for reliable output from reshape2::melt() - if( !is.null(sample_data(physeq, errorIfNULL=FALSE)) ){ + # Make the plotting data.frame. This coerces to data.frame, required for + # reliable output from reshape2::melt() + if (!is.null(sample_data(physeq, errorIfNULL = FALSE))) { # Include the sample data, if it is there. - DF <- data.frame(erDF, sample_data(physeq)) + DF <- data.frame(erDF, sample_data(physeq)) } else { # If no sample data, leave it out. DF <- data.frame(erDF) } - if( !"samples" %in% colnames(DF) ){ - # If there is no "samples" variable in DF, add it - DF$samples <- sample_names(physeq) - } - # sample_names used to be default, and should also work. - # #backwardcompatibility - if( !is.null(x) ){ - if( x %in% c("sample", "samples", "sample_names", "sample.names") ){ - x <- "samples" - } - } else { - # If x was NULL for some reason, set it to "samples" - x <- "samples" - } - # melt to display different alpha-measures separately - mdf = reshape2::melt(DF, measure.vars=measures) + if (!"samples" %in% colnames(DF)) { + # If there is no 'samples' variable in DF, add it + DF$samples <- sample_names(physeq) + } + # sample_names used to be default, and should also work. #backwardcompatibility + if (!is.null(x)) { + if (x %in% c("sample", "samples", "sample_names", "sample.names")) { + x <- "samples" + } + } else { + # If x was NULL for some reason, set it to 'samples' + x <- "samples" + } + # melt to display different alpha-measures separately + mdf = reshape2::melt(DF, measure.vars = measures) # Initialize the se column. Helpful even if not used. mdf$se <- NA_integer_ - if( length(ses) > 0 ){ - ## Merge s.e. into one "se" column - # Define conversion vector, `selabs` + if (length(ses) > 0) { + ## Merge s.e. into one 'se' column Define conversion vector, `selabs` selabs = ses - # Trim the "se." from the names + # Trim the 'se.' from the names names(selabs) <- substr(selabs, 4, 100) # Make first letter of selabs' names uppercase substr(names(selabs), 1, 1) <- toupper(substr(names(selabs), 1, 1)) # use selabs conversion vector to process `mdf` - mdf$wse <- sapply(as.character(mdf$variable), function(i, selabs){selabs[i]}, selabs) - for( i in 1:nrow(mdf) ){ - if( !is.na(mdf[i, "wse"]) ){ + mdf$wse <- sapply(as.character(mdf$variable), function(i, selabs) { + selabs[i] + }, selabs) + for (i in 1:nrow(mdf)) { + if (!is.na(mdf[i, "wse"])) { mdf[i, "se"] <- mdf[i, (mdf[i, "wse"])] } } # prune the redundant columns mdf <- mdf[, -which(colnames(mdf) %in% c(selabs, "wse"))] } - ## Interpret measures - # If not provided (default), keep all - if( !is.null(measures) ){ - if( any(measures %in% as.character(mdf$variable)) ){ + ## Interpret measures If not provided (default), keep all + if (!is.null(measures)) { + if (any(measures %in% as.character(mdf$variable))) { # If any measures were in mdf, then subset to just those. mdf <- mdf[as.character(mdf$variable) %in% measures, ] } else { @@ -696,53 +661,49 @@ plot_richness = function(physeq, x="samples", color=NULL, shape=NULL, title=NULL warning("Argument to `measures` not supported. All alpha-diversity measures (should be) included in plot.") } } - if( !is.null(shsi) ){ - # Deprecated: - # If shsi is anything but NULL, print a warning about its being deprecated - warning("shsi no longer supported option in plot_richness. Please use `measures` instead") - } + if (!is.null(shsi)) { + # Deprecated: If shsi is anything but NULL, print a warning about its being + # deprecated + warning("shsi no longer supported option in plot_richness. Please use `measures` instead") + } # Address `sortby` argument - if(!is.null(sortby)){ - if(!all(sortby %in% levels(mdf$variable))){ + if (!is.null(sortby)) { + if (!all(sortby %in% levels(mdf$variable))) { warning("`sortby` argument not among `measures`. Ignored.") } - if(!is.discrete(mdf[, x])){ + if (!is.discrete(mdf[, x])) { warning("`sortby` argument provided, but `x` not a discrete variable. `sortby` is ignored.") } - if(all(sortby %in% levels(mdf$variable)) & is.discrete(mdf[, x])){ - # Replace x-factor with same factor that has levels re-ordered according to `sortby` + if (all(sortby %in% levels(mdf$variable)) & is.discrete(mdf[, x])) { + # Replace x-factor with same factor that has levels re-ordered according to + # `sortby` wh.sortby = which(mdf$variable %in% sortby) - mdf[, x] <- factor(mdf[, x], - levels = names(sort(tapply(X = mdf[wh.sortby, "value"], - INDEX = mdf[wh.sortby, x], - mean, - na.rm=TRUE, simplify = TRUE)))) + mdf[, x] <- factor(mdf[, x], levels = names(sort(tapply(X = mdf[wh.sortby, + "value"], INDEX = mdf[wh.sortby, x], mean, na.rm = TRUE, simplify = TRUE)))) } } # Define variable mapping - richness_map = aes_string(x=x, y="value", colour=color, shape=shape) + richness_map = aes_string(x = x, y = "value", colour = color, shape = shape) # Make the ggplot. - p = ggplot(mdf, richness_map) + geom_point(na.rm=TRUE) + p = ggplot(mdf, richness_map) + geom_point(na.rm = TRUE) # Add error bars if mdf$se is not all NA - if( any(!is.na(mdf[, "se"])) ){ - p = p + geom_errorbar(aes(ymax=value + se, ymin=value - se), width=0.1) + if (any(!is.na(mdf[, "se"]))) { + p = p + geom_errorbar(aes(ymax = value + se, ymin = value - se), width = 0.1) } # Rotate horizontal axis labels, and adjust - p = p + theme(axis.text.x=element_text(angle=-90, vjust=0.5, hjust=0)) - # Add y-label - p = p + ylab('Alpha Diversity Measure') + p = p + theme(axis.text.x = element_text(angle = -90, vjust = 0.5, hjust = 0)) + # Add y-label + p = p + ylab("Alpha Diversity Measure") # Facet wrap using user-options - p = p + facet_wrap(~variable, nrow=nrow, scales=scales) - # Optionally add a title to the plot - if( !is.null(title) ){ - p <- p + ggtitle(title) - } - return(p) + p = p + facet_wrap(~variable, nrow = nrow, scales = scales) + # Optionally add a title to the plot + if (!is.null(title)) { + p <- p + ggtitle(title) + } + return(p) } -################################################################################ -################################################################################ -# The general case, could plot samples, taxa, or both (biplot/split). Default samples. -################################################################################ +################################################################################ The general case, could plot samples, taxa, or both (biplot/split). Default +################################################################################ samples. #' General ordination plotter based on ggplot2. #' #' There are many useful examples of phyloseq ordination graphics in the @@ -762,20 +723,20 @@ plot_richness = function(physeq, x="samples", color=NULL, shape=NULL, title=NULL #' supported here. There is no default, as the expectation is that the #' ordination will be performed and saved prior to calling this plot function. #' -#' @param type (Optional). The plot type. Default is \code{"samples"}. The +#' @param type (Optional). The plot type. Default is \code{'samples'}. The #' currently supported options are -#' \code{c("samples", "sites", "species", "taxa", "biplot", "split", "scree")}. +#' \code{c('samples', 'sites', 'species', 'taxa', 'biplot', 'split', 'scree')}. #' The option #' ``taxa'' is equivalent to ``species'' in this case, and similarly, #' ``samples'' is equivalent to ``sites''. #' The options -#' \code{"sites"} and \code{"species"} result in a single-plot of just the +#' \code{'sites'} and \code{'species'} result in a single-plot of just the #' sites/samples or species/taxa of the ordination, respectively. -#' The \code{"biplot"} and \code{"split"} options result in a combined +#' The \code{'biplot'} and \code{'split'} options result in a combined #' plot with both taxa and samples, either combined into one plot (``biplot'') #' or #' separated in two facet panels (``split''), respectively. -#' The \code{"scree"} option results in a call to \code{\link{plot_scree}}, +#' The \code{'scree'} option results in a call to \code{\link{plot_scree}}, #' which produces an ordered bar plot of the normalized eigenvalues #' associated with each ordination axis. #' @@ -845,270 +806,280 @@ plot_richness = function(physeq, x="samples", color=NULL, shape=NULL, title=NULL #' # http://joey711.github.io/phyloseq/plot_ordination-examples #' data(GlobalPatterns) #' GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns) -#' gp_bray_pcoa = ordinate(GP, "CCA", "bray") -#' plot_ordination(GP, gp_bray_pcoa, "samples", color="SampleType") -plot_ordination = function(physeq, ordination, type="samples", axes=1:2, - color=NULL, shape=NULL, label=NULL, title=NULL, justDF=FALSE){ - if(length(type) > 1){ - warning("`type` can only be a single option, - but more than one provided. Using only the first.") +#' gp_bray_pcoa = ordinate(GP, 'CCA', 'bray') +#' plot_ordination(GP, gp_bray_pcoa, 'samples', color='SampleType') +plot_ordination = function(physeq, ordination, type = "samples", axes = 1:2, color = NULL, + shape = NULL, label = NULL, title = NULL, justDF = FALSE) { + if (length(type) > 1) { + warning("`type` can only be a single option,\n but more than one provided. Using only the first.") type <- type[[1]] } - if(length(color) > 1){ - warning("The `color` variable argument should have length equal to 1.", - "Taking first value.") + if (length(color) > 1) { + warning("The `color` variable argument should have length equal to 1.", "Taking first value.") color = color[[1]][1] } - if(length(shape) > 1){ - warning("The `shape` variable argument should have length equal to 1.", - "Taking first value.") + if (length(shape) > 1) { + warning("The `shape` variable argument should have length equal to 1.", "Taking first value.") shape = shape[[1]][1] } - if(length(label) > 1){ - warning("The `label` variable argument should have length equal to 1.", - "Taking first value.") + if (length(label) > 1) { + warning("The `label` variable argument should have length equal to 1.", "Taking first value.") label = label[[1]][1] } official_types = c("sites", "species", "biplot", "split", "scree") - if(!inherits(physeq, "phyloseq")){ - if(inherits(physeq, "character")){ - if(physeq=="list"){ + if (!inherits(physeq, "phyloseq")) { + if (inherits(physeq, "character")) { + if (physeq == "list") { return(official_types) } - } - warning("Full functionality requires `physeq` be phyloseq-class ", - "with multiple components.") + } + warning("Full functionality requires `physeq` be phyloseq-class ", "with multiple components.") } # Catch typos and synonyms - type = gsub("^.*site[s]*.*$", "sites", type, ignore.case=TRUE) - type = gsub("^.*sample[s]*.*$", "sites", type, ignore.case=TRUE) - type = gsub("^.*species.*$", "species", type, ignore.case=TRUE) - type = gsub("^.*taxa.*$", "species", type, ignore.case=TRUE) - type = gsub("^.*OTU[s]*.*$", "species", type, ignore.case=TRUE) - type = gsub("^.*biplot[s]*.*$", "biplot", type, ignore.case=TRUE) - type = gsub("^.*split[s]*.*$", "split", type, ignore.case=TRUE) - type = gsub("^.*scree[s]*.*$", "scree", type, ignore.case=TRUE) + type = gsub("^.*site[s]*.*$", "sites", type, ignore.case = TRUE) + type = gsub("^.*sample[s]*.*$", "sites", type, ignore.case = TRUE) + type = gsub("^.*species.*$", "species", type, ignore.case = TRUE) + type = gsub("^.*taxa.*$", "species", type, ignore.case = TRUE) + type = gsub("^.*OTU[s]*.*$", "species", type, ignore.case = TRUE) + type = gsub("^.*biplot[s]*.*$", "biplot", type, ignore.case = TRUE) + type = gsub("^.*split[s]*.*$", "split", type, ignore.case = TRUE) + type = gsub("^.*scree[s]*.*$", "scree", type, ignore.case = TRUE) # If type argument is not supported... - if( !type %in% official_types ){ - warning("type argument not supported. `type` set to 'samples'.\n", - "See `plot_ordination('list')`") + if (!type %in% official_types) { + warning("type argument not supported. `type` set to 'samples'.\n", "See `plot_ordination('list')`") type <- "sites" } - if( type %in% c("scree") ){ - # Stop early by passing to plot_scree() if "scree" was chosen as a type - return( plot_scree(ordination, title=title) ) + if (type %in% c("scree")) { + # Stop early by passing to plot_scree() if 'scree' was chosen as a type + return(plot_scree(ordination, title = title)) } # Define a function to check if a data.frame is empty - is_empty = function(x){ + is_empty = function(x) { length(x) < 2 | suppressWarnings(all(is.na(x))) } - # The plotting data frames. - # Call scores to get coordinates. - # Silently returns only the coordinate systems available. - # e.g. sites-only, even if species requested. + # The plotting data frames. Call scores to get coordinates. Silently returns + # only the coordinate systems available. e.g. sites-only, even if species + # requested. specDF = siteDF = NULL - trash1 = try({siteDF <- scores(ordination, choices = axes, - display="sites", physeq=physeq)}, - silent = TRUE) - trash2 = try({specDF <- scores(ordination, choices = axes, - display="species", physeq=physeq)}, - silent = TRUE) + trash1 = try({ + siteDF <- scores(ordination, choices = axes, display = "sites", physeq = physeq) + }, silent = TRUE) + trash2 = try({ + specDF <- scores(ordination, choices = axes, display = "species", physeq = physeq) + }, silent = TRUE) # Check that have assigned coordinates to the correct object siteSampIntx = length(intersect(rownames(siteDF), sample_names(physeq))) siteTaxaIntx = length(intersect(rownames(siteDF), taxa_names(physeq))) specSampIntx = length(intersect(rownames(specDF), sample_names(physeq))) specTaxaIntx = length(intersect(rownames(specDF), taxa_names(physeq))) - if(siteSampIntx < specSampIntx & specTaxaIntx < siteTaxaIntx){ + if (siteSampIntx < specSampIntx & specTaxaIntx < siteTaxaIntx) { # Double-swap co = specDF specDF <- siteDF siteDF <- co rm(co) } else { - if(siteSampIntx < specSampIntx){ + if (siteSampIntx < specSampIntx) { # Single swap siteDF <- specDF specDF <- NULL } - if(specTaxaIntx < siteTaxaIntx){ - # Single swap + if (specTaxaIntx < siteTaxaIntx) { + # Single swap specDF <- siteDF siteDF <- NULL } } # If both empty, warn and return NULL - if(is_empty(siteDF) & is_empty(specDF)){ - warning("Could not obtain coordinates from the provided `ordination`. \n", - "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.") + if (is_empty(siteDF) & is_empty(specDF)) { + warning("Could not obtain coordinates from the provided `ordination`. \n", + "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.") return(NULL) } # If either is missing, do weighted average - if(is_empty(specDF) & type != "sites"){ + if (is_empty(specDF) & type != "sites") { message("Species coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)") - specDF <- data.frame(wascores(siteDF, w = veganifyOTU(physeq)), stringsAsFactors=FALSE) + specDF <- data.frame(wascores(siteDF, w = veganifyOTU(physeq)), stringsAsFactors = FALSE) } - if(is_empty(siteDF) & type != "species"){ + if (is_empty(siteDF) & type != "species") { message("Species coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)") - siteDF <- data.frame(wascores(specDF, w = t(veganifyOTU(physeq))), stringsAsFactors=FALSE) + siteDF <- data.frame(wascores(specDF, w = t(veganifyOTU(physeq))), stringsAsFactors = FALSE) } # Double-check that have assigned coordinates to the correct object specTaxaIntx <- siteSampIntx <- NULL siteSampIntx <- length(intersect(rownames(siteDF), sample_names(physeq))) specTaxaIntx <- length(intersect(rownames(specDF), taxa_names(physeq))) - if(siteSampIntx < 1L & !is_empty(siteDF)){ - # If siteDF is not empty, but it doesn't intersect the sample_names in physeq, warn and set to NULL + if (siteSampIntx < 1L & !is_empty(siteDF)) { + # If siteDF is not empty, but it doesn't intersect the sample_names in physeq, + # warn and set to NULL warning("`Ordination site/sample coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.") siteDF <- NULL } - if(specTaxaIntx < 1L & !is_empty(specDF)){ - # If specDF is not empty, but it doesn't intersect the taxa_names in physeq, warn and set to NULL + if (specTaxaIntx < 1L & !is_empty(specDF)) { + # If specDF is not empty, but it doesn't intersect the taxa_names in physeq, warn + # and set to NULL warning("`Ordination species/OTU/taxa coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.") specDF <- NULL } # If you made it this far and both NULL, return NULL and throw a warning - if(is_empty(siteDF) & is_empty(specDF)){ - warning("Could not obtain coordinates from the provided `ordination`. \n", - "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.") + if (is_empty(siteDF) & is_empty(specDF)) { + warning("Could not obtain coordinates from the provided `ordination`. \n", + "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.") return(NULL) } - if(type %in% c("biplot", "split") & (is_empty(siteDF) | is_empty(specDF)) ){ - # biplot and split require both coordinates systems available. - # Both were attempted, or even evaluated by weighted average. - # If still empty, warn and switch to relevant type. - if(is_empty(siteDF)){ + if (type %in% c("biplot", "split") & (is_empty(siteDF) | is_empty(specDF))) { + # biplot and split require both coordinates systems available. Both were + # attempted, or even evaluated by weighted average. If still empty, warn and + # switch to relevant type. + if (is_empty(siteDF)) { warning("Could not access/evaluate site/sample coordinates. Switching type to 'species'") type <- "species" } - if(is_empty(specDF)){ + if (is_empty(specDF)) { warning("Could not access/evaluate species/taxa/OTU coordinates. Switching type to 'sites'") type <- "sites" } } - if(type != "species"){ + if (type != "species") { # samples covariate data frame, `sdf` sdf = NULL - sdf = data.frame(access(physeq, slot="sam_data"), stringsAsFactors=FALSE) - if( !is_empty(sdf) & !is_empty(siteDF) ){ + sdf = data.frame(access(physeq, slot = "sam_data"), stringsAsFactors = FALSE) + if (!is_empty(sdf) & !is_empty(siteDF)) { # The first two axes should always be x and y, the ordination axes. siteDF <- cbind(siteDF, sdf[rownames(siteDF), ]) } } - if(type != "sites"){ + if (type != "sites") { # taxonomy data frame `tdf` tdf = NULL - tdf = data.frame(access(physeq, slot="tax_table"), stringsAsFactors=FALSE) - if( !is_empty(tdf) & !is_empty(specDF) ){ + tdf = data.frame(access(physeq, slot = "tax_table"), stringsAsFactors = FALSE) + if (!is_empty(tdf) & !is_empty(specDF)) { # The first two axes should always be x and y, the ordination axes. specDF = cbind(specDF, tdf[rownames(specDF), ]) } } - # In "naked" OTU-table cases, `siteDF` or `specDF` could be matrix. - if(!inherits(siteDF, "data.frame")){ - #warning("Sample Co-variables apparently missing in provided `physeq` for this plot-type. Coercing coord matrix to data.frame.") + # In 'naked' OTU-table cases, `siteDF` or `specDF` could be matrix. + if (!inherits(siteDF, "data.frame")) { + # warning('Sample Co-variables apparently missing in provided `physeq` for this + # plot-type. Coercing coord matrix to data.frame.') siteDF <- as.data.frame(siteDF, stringsAsFactors = FALSE) - } - if(!inherits(specDF, "data.frame")){ - #warning("Taxonomy apparently missing in provided `physeq` for this plot-type. Coercing coord matrix to data.frame.") + } + if (!inherits(specDF, "data.frame")) { + # warning('Taxonomy apparently missing in provided `physeq` for this plot-type. + # Coercing coord matrix to data.frame.') specDF <- as.data.frame(specDF, stringsAsFactors = FALSE) } # Define the main plot data frame, `DF` DF = NULL DF <- switch(EXPR = type, sites = siteDF, species = specDF, { - # Anything else. In practice, type should be "biplot" or "split" here. - # Add id.type label + # Anything else. In practice, type should be 'biplot' or 'split' here. Add + # id.type label specDF$id.type <- "Taxa" siteDF$id.type <- "Samples" - # But what if the axis variables differ b/w them? - # Coerce specDF to match samples (siteDF) axis names + # But what if the axis variables differ b/w them? Coerce specDF to match samples + # (siteDF) axis names colnames(specDF)[1:2] <- colnames(siteDF)[1:2] # Merge the two data frames together for joint plotting. - DF = merge(specDF, siteDF, all=TRUE) - # Replace NA with "samples" or "taxa", where appropriate (factor/character) - if(!is.null(shape)){ DF <- rp.joint.fill(DF, shape, "Samples") } - if(!is.null(shape)){ DF <- rp.joint.fill(DF, shape, "Taxa") } - if(!is.null(color)){ DF <- rp.joint.fill(DF, color, "Samples") } - if(!is.null(color)){ DF <- rp.joint.fill(DF, color, "Taxa") } + DF = merge(specDF, siteDF, all = TRUE) + # Replace NA with 'samples' or 'taxa', where appropriate (factor/character) + if (!is.null(shape)) { + DF <- rp.joint.fill(DF, shape, "Samples") + } + if (!is.null(shape)) { + DF <- rp.joint.fill(DF, shape, "Taxa") + } + if (!is.null(color)) { + DF <- rp.joint.fill(DF, color, "Samples") + } + if (!is.null(color)) { + DF <- rp.joint.fill(DF, color, "Taxa") + } DF }) # In case user wants the plot-DF for some other purpose, return early - if(justDF){return(DF)} + if (justDF) { + return(DF) + } # Check variable availability before defining mapping. - if(!is.null(color)){ - if(!color %in% names(DF)){ - warning("Color variable was not found in the available data you provided.", - "No color mapped.") + if (!is.null(color)) { + if (!color %in% names(DF)) { + warning("Color variable was not found in the available data you provided.", + "No color mapped.") color <- NULL } } - if(!is.null(shape)){ - if(!shape %in% names(DF)){ - warning("Shape variable was not found in the available data you provided.", - "No shape mapped.") + if (!is.null(shape)) { + if (!shape %in% names(DF)) { + warning("Shape variable was not found in the available data you provided.", + "No shape mapped.") shape <- NULL } } - if(!is.null(label)){ - if(!label %in% names(DF)){ - warning("Label variable was not found in the available data you provided.", - "No label mapped.") + if (!is.null(label)) { + if (!label %in% names(DF)) { + warning("Label variable was not found in the available data you provided.", + "No label mapped.") label <- NULL } } # Grab the ordination axis names from the plot data frame (as strings) x = colnames(DF)[1] - y = colnames(DF)[2] + y = colnames(DF)[2] # Mapping section - if( ncol(DF) <= 2){ + if (ncol(DF) <= 2) { # If there is nothing to map, enforce simple mapping. message("No available covariate data to map on the points for this plot `type`") - ord_map = aes_string(x=x, y=y) - } else if( type %in% c("sites", "species", "split") ){ - ord_map = aes_string(x=x, y=y, color=color, shape=shape, na.rm=TRUE) - } else if(type=="biplot"){ - # biplot, `id.type` should try to map to color and size. Only size if color specified. - if( is.null(color) ){ - ord_map = aes_string(x=x, y=y, size="id.type", color="id.type", shape=shape, na.rm=TRUE) + ord_map = aes_string(x = x, y = y) + } else if (type %in% c("sites", "species", "split")) { + ord_map = aes_string(x = x, y = y, color = color, shape = shape, na.rm = TRUE) + } else if (type == "biplot") { + # biplot, `id.type` should try to map to color and size. Only size if color + # specified. + if (is.null(color)) { + ord_map = aes_string(x = x, y = y, size = "id.type", color = "id.type", + shape = shape, na.rm = TRUE) } else { - ord_map = aes_string(x=x, y=y, size="id.type", color=color, shape=shape, na.rm=TRUE) + ord_map = aes_string(x = x, y = y, size = "id.type", color = color, shape = shape, + na.rm = TRUE) } } # Plot-building section - p <- ggplot(DF, ord_map) + geom_point(na.rm=TRUE) + p <- ggplot(DF, ord_map) + geom_point(na.rm = TRUE) # split/facet color and shape can be anything in one or other. - if( type=="split" ){ + if (type == "split") { # split-option requires a facet_wrap - p <- p + facet_wrap(~id.type, nrow=1) + p <- p + facet_wrap(~id.type, nrow = 1) } # If biplot, adjust scales - if( type=="biplot" ){ - if( is.null(color) ){ + if (type == "biplot") { + if (is.null(color)) { # Rename color title in legend. - p <- update_labels(p, list(colour="Ordination Type")) - } + p <- update_labels(p, list(colour = "Ordination Type")) + } # Adjust size so that samples are bigger than taxa by default. - p <- p + scale_size_manual("type", values=c(Samples=5, Taxa=2)) + p <- p + scale_size_manual("type", values = c(Samples = 5, Taxa = 2)) } # Add text labels to points - if( !is.null(label) ){ - label_map <- aes_string(x=x, y=y, label=label, na.rm=TRUE) - p = p + geom_text(label_map, data=rm.na.phyloseq(DF, label), - size=2, vjust=1.5, na.rm=TRUE) + if (!is.null(label)) { + label_map <- aes_string(x = x, y = y, label = label, na.rm = TRUE) + p = p + geom_text(label_map, data = rm.na.phyloseq(DF, label), size = 2, + vjust = 1.5, na.rm = TRUE) } # Optionally add a title to the plot - if( !is.null(title) ){ + if (!is.null(title)) { p = p + ggtitle(title) } # Add fraction variability to axis labels, if available - if( length(extract_eigenvalue(ordination)[axes]) > 0 ){ - # Only attempt to add fraction variability - # if extract_eigenvalue returns something + if (length(extract_eigenvalue(ordination)[axes]) > 0) { + # Only attempt to add fraction variability if extract_eigenvalue returns + # something eigvec = extract_eigenvalue(ordination) # Fraction variability, fracvar - fracvar = eigvec[axes] / sum(eigvec) + fracvar = eigvec[axes]/sum(eigvec) # Percent variability, percvar - percvar = round(100*fracvar, 1) - # The string to add to each axis label, strivar - # Start with the curent axis labels in the plot + percvar = round(100 * fracvar, 1) + # The string to add to each axis label, strivar Start with the curent axis labels + # in the plot strivar = as(c(p$label$x, p$label$y), "character") # paste the percent variability string at the end strivar = paste0(strivar, " [", percvar, "%]") @@ -1118,38 +1089,35 @@ plot_ordination = function(physeq, ordination, type="samples", axes=1:2, # Return the ggplot object return(p) } -################################################################################ -# Remove NA elements from data.frame prior to plotting -# Remove NA level from factor -################################################################################ +################################################################################ Remove NA elements from data.frame prior to plotting Remove NA level from +################################################################################ factor #' @keywords internal -rm.na.phyloseq <- function(DF, key.var){ - # (1) Remove elements from DF if key.var has NA - # DF[!is.na(DF[, key.var]), ] - DF <- subset(DF, !is.na(eval(parse(text=key.var)))) - # (2) Remove NA from the factor level, if a factor. - if( class(DF[, key.var]) == "factor" ){ - DF[, key.var] <- factor(as(DF[, key.var], "character")) - } - return(DF) +rm.na.phyloseq <- function(DF, key.var) { + # (1) Remove elements from DF if key.var has NA DF[!is.na(DF[, key.var]), ] + DF <- subset(DF, !is.na(eval(parse(text = key.var)))) + # (2) Remove NA from the factor level, if a factor. + if (class(DF[, key.var]) == "factor") { + DF[, key.var] <- factor(as(DF[, key.var], "character")) + } + return(DF) } -################################################################################ -################################################################################ +################################################################################ #' @keywords internal #' @importFrom plyr is.discrete -rp.joint.fill <- function(DF, map.var, id.type.rp="samples"){ - # If all of the map.var values for samples/species are NA, replace with id.type.rp - if( all(is.na(DF[DF$id.type==id.type.rp, map.var])) ){ - # If discrete, coerce to character, convert to factor, replace, relevel. - if( is.discrete(DF[, map.var]) ){ - temp.vec <- as(DF[, map.var], "character") - temp.vec[is.na(temp.vec)] <- id.type.rp - DF[, map.var] <- relevel(factor(temp.vec), id.type.rp) - } - } - return(DF) +rp.joint.fill <- function(DF, map.var, id.type.rp = "samples") { + # If all of the map.var values for samples/species are NA, replace with + # id.type.rp + if (all(is.na(DF[DF$id.type == id.type.rp, map.var]))) { + # If discrete, coerce to character, convert to factor, replace, relevel. + if (is.discrete(DF[, map.var])) { + temp.vec <- as(DF[, map.var], "character") + temp.vec[is.na(temp.vec)] <- id.type.rp + DF[, map.var] <- relevel(factor(temp.vec), id.type.rp) + } + } + return(DF) } -################################################################################ +################################################################################ #' Subset points from an ordination-derived ggplot #' #' Easily retrieve a plot-derived \code{data.frame} with a subset of points @@ -1158,7 +1126,7 @@ rp.joint.fill <- function(DF, map.var, id.type.rp="samples"){ #' There are many useful examples of phyloseq ordination graphics in the #' \href{http://joey711.github.io/phyloseq/subset_ord_plot-examples}{phyloseq online tutorials}. #' -#' @usage subset_ord_plot(p, threshold=0.05, method="farthest") +#' @usage subset_ord_plot(p, threshold=0.05, method='farthest') #' #' @param p (Required). A \code{\link{ggplot}} object created by #' \code{\link{plot_ordination}}. It contains the complete data that you @@ -1170,7 +1138,7 @@ rp.joint.fill <- function(DF, map.var, id.type.rp="samples"){ #' determining which points are included in returned \code{data.frame}. #' #' @param method (Optional). A character string. One of -#' \code{c("farthest", "radial", "square")}. Default is \code{"farthest"}. +#' \code{c('farthest', 'radial', 'square')}. Default is \code{'farthest'}. #' This determines how threshold will be interpreted. #' #' \describe{ @@ -1186,7 +1154,7 @@ rp.joint.fill <- function(DF, map.var, id.type.rp="samples"){ #' } #' #' \item{radial}{ -#' Keep only those points that are beyond \code{threshold} +#'\t Keep only those points that are beyond \code{threshold} #' radial distance from the origin. Has the effect of removing a #' circle of points from the plot, centered at the origin. #' } @@ -1213,36 +1181,36 @@ rp.joint.fill <- function(DF, map.var, id.type.rp="samples"){ #' @examples #' ## See the online tutorials. #' ## http://joey711.github.io/phyloseq/subset_ord_plot-examples -subset_ord_plot <- function(p, threshold=0.05, method="farthest"){ - threshold <- threshold[1] # ignore all but first threshold value. - method <- method[1] # ignore all but first string. - method.names <- c("farthest", "radial", "square") - # Subset to only some small fraction of points - # with furthest distance from origin - df <- p$data[, c(1, 2)] - d <- sqrt(df[, 1]^2 + df[, 2]^2) - names(d) <- rownames(df) - if( method.names[pmatch(method, method.names)] == "farthest"){ - if( threshold >= 1){ - show.names <- names(sort(d, TRUE)[1:threshold]) - } else if( threshold < 1 ){ - show.names <- names(sort(d, TRUE)[1:round(threshold*length(d))]) - } else { - stop("threshold not a valid positive numeric scalar") - } - } else if( method.names[pmatch(method, method.names)] == "radial"){ - show.names <- names(d[d > threshold]) - } else if( method.names[pmatch(method, method.names)] == "square"){ - # show.names <- rownames(df)[as.logical((abs(df[, 1]) > threshold) + (abs(df[, 2]) > threshold))] - show.names <- rownames(df)[((abs(df[, 1]) > threshold) | (abs(df[, 2]) > threshold))] - } else { - stop("method name not supported. Please select a valid method") - } - - return(p$data[show.names, ]) +subset_ord_plot <- function(p, threshold = 0.05, method = "farthest") { + threshold <- threshold[1] # ignore all but first threshold value. + method <- method[1] # ignore all but first string. + method.names <- c("farthest", "radial", "square") + # Subset to only some small fraction of points with furthest distance from origin + df <- p$data[, c(1, 2)] + d <- sqrt(df[, 1]^2 + df[, 2]^2) + names(d) <- rownames(df) + if (method.names[pmatch(method, method.names)] == "farthest") { + if (threshold >= 1) { + show.names <- names(sort(d, TRUE)[1:threshold]) + } else if (threshold < 1) { + show.names <- names(sort(d, TRUE)[1:round(threshold * length(d))]) + } else { + stop("threshold not a valid positive numeric scalar") + } + } else if (method.names[pmatch(method, method.names)] == "radial") { + show.names <- names(d[d > threshold]) + } else if (method.names[pmatch(method, method.names)] == "square") { + # show.names <- rownames(df)[as.logical((abs(df[, 1]) > threshold) + (abs(df[, + # 2]) > threshold))] + show.names <- rownames(df)[((abs(df[, 1]) > threshold) | (abs(df[, 2]) > + threshold))] + } else { + stop("method name not supported. Please select a valid method") + } + + return(p$data[show.names, ]) } -################################################################################ -################################################################################ +################################################################################ #' General ordination eigenvalue plotter using ggplot2. #' #' Convenience wrapper for plotting ordination eigenvalues (if available) @@ -1275,61 +1243,61 @@ subset_ord_plot <- function(p, threshold=0.05, method="farthest"){ #' @export #' @examples #' # First load and trim a dataset -#' data("GlobalPatterns") +#' data('GlobalPatterns') #' GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns) #' # Test plots (preforms ordination in-line, then makes scree plot) -#' plot_scree(ordinate(GP, "DPCoA", "bray")) -#' plot_scree(ordinate(GP, "PCoA", "bray")) +#' plot_scree(ordinate(GP, 'DPCoA', 'bray')) +#' plot_scree(ordinate(GP, 'PCoA', 'bray')) #' # Empty return with message -#' plot_scree(ordinate(GP, "NMDS", "bray")) +#' plot_scree(ordinate(GP, 'NMDS', 'bray')) #' # Constrained ordinations -#' plot_scree(ordinate(GP, "CCA", formula=~SampleType)) -#' plot_scree(ordinate(GP, "RDA", formula=~SampleType)) -#' plot_scree(ordinate(GP, "CAP", formula=~SampleType)) +#' plot_scree(ordinate(GP, 'CCA', formula=~SampleType)) +#' plot_scree(ordinate(GP, 'RDA', formula=~SampleType)) +#' plot_scree(ordinate(GP, 'CAP', formula=~SampleType)) #' # Deprecated example of constrained ordination (emits a warning) -#' #plot_scree(ordinate(GP ~ SampleType, "RDA")) -#' plot_scree(ordinate(GP, "DCA")) -#' plot_ordination(GP, ordinate(GP, "DCA"), type="scree") -plot_scree = function(ordination, title=NULL){ - # Use get_eigenvalue method dispatch. It always returns a numeric vector. - x = extract_eigenvalue(ordination) - # Were eigenvalues found? If not, return NULL - if( is.null(x) ){ - cat("No eigenvalues found in ordination\n") - return(NULL) - } else { - # If no names, add them arbitrarily "axis1, axis2, ..., axisN" - if( is.null(names(x)) ) names(x) = 1:length(x) - # For scree plot, want to show the fraction of total eigenvalues - x = x/sum(x) - # Set negative values to zero - x[x <= 0.0] = 0.0 - # Create the ggplot2 data.frame, and basic ggplot2 plot - gdf = data.frame(axis=names(x), eigenvalue = x) - p = ggplot(gdf, aes(x=axis, y=eigenvalue)) + geom_bar(stat="identity") - # Force the order to be same as original in x - p = p + scale_x_discrete(limits = names(x)) - # Orient the x-labels for space. - p = p + theme(axis.text.x=element_text(angle=90, vjust=0.5)) - # Optionally add a title to the plot - if( !is.null(title) ){ - p <- p + ggtitle(title) - } - return(p) - } +#' #plot_scree(ordinate(GP ~ SampleType, 'RDA')) +#' plot_scree(ordinate(GP, 'DCA')) +#' plot_ordination(GP, ordinate(GP, 'DCA'), type='scree') +plot_scree = function(ordination, title = NULL) { + # Use get_eigenvalue method dispatch. It always returns a numeric vector. + x = extract_eigenvalue(ordination) + # Were eigenvalues found? If not, return NULL + if (is.null(x)) { + cat("No eigenvalues found in ordination\n") + return(NULL) + } else { + # If no names, add them arbitrarily 'axis1, axis2, ..., axisN' + if (is.null(names(x))) + names(x) = 1:length(x) + # For scree plot, want to show the fraction of total eigenvalues + x = x/sum(x) + # Set negative values to zero + x[x <= 0] = 0 + # Create the ggplot2 data.frame, and basic ggplot2 plot + gdf = data.frame(axis = names(x), eigenvalue = x) + p = ggplot(gdf, aes(x = axis, y = eigenvalue)) + geom_bar(stat = "identity") + # Force the order to be same as original in x + p = p + scale_x_discrete(limits = names(x)) + # Orient the x-labels for space. + p = p + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) + # Optionally add a title to the plot + if (!is.null(title)) { + p <- p + ggtitle(title) + } + return(p) + } } -################################################################################ -# Define S3 generic extract_eigenvalue function; formerly S4 generic get_eigenvalue() -# Function is used by `plot_scree` to get the eigenvalue vector from different -# types of ordination objects. -# Used S3 generic in this case because many ordination objects, the input, are -# not formally-defined S4 classes, but vaguely-/un-defined S3. This throws -# warnings during package build if extract_eigenvalue were S4 generic method, -# because the ordination classes don't appear to have any definition in phyloseq -# or dependencies. +################################################################################ Define S3 generic extract_eigenvalue function; formerly S4 generic +################################################################################ get_eigenvalue() Function is used by `plot_scree` to get the eigenvalue vector +################################################################################ from different types of ordination objects. Used S3 generic in this case +################################################################################ because many ordination objects, the input, are not formally-defined S4 +################################################################################ classes, but vaguely-/un-defined S3. This throws warnings during package build +################################################################################ if extract_eigenvalue were S4 generic method, because the ordination classes +################################################################################ don't appear to have any definition in phyloseq or dependencies. #' @keywords internal extract_eigenvalue = function(ordination) UseMethod("extract_eigenvalue", ordination) -# Default is to return NULL (e.g. for NMDS, or non-supported ordinations/classes). +# Default is to return NULL (e.g. for NMDS, or non-supported +# ordinations/classes). extract_eigenvalue.default = function(ordination) NULL # for pcoa objects extract_eigenvalue.pcoa = function(ordination) ordination$values$Relative_eig @@ -1341,7 +1309,7 @@ extract_eigenvalue.rda = function(ordination) c(ordination$CCA$eig, ordination$C extract_eigenvalue.dpcoa = function(ordination) ordination$eig # for decorana (dca) objects extract_eigenvalue.decorana = function(ordination) ordination$evals -################################################################################ +################################################################################ #' Melt phyloseq data object into large data.frame #' #' The psmelt function is a specialized melt function for melting phyloseq objects @@ -1352,7 +1320,7 @@ extract_eigenvalue.decorana = function(ordination) ordination$evals #' have reserved the following variable names that should not be used #' as the names of \code{\link{sample_variables}} #' or taxonomic \code{\link{rank_names}}. -#' These reserved names are \code{c("Sample", "Abundance", "OTU")}. +#' These reserved names are \code{c('Sample', 'Abundance', 'OTU')}. #' Also, you should not have identical names for #' sample variables and taxonomic ranks. #' That is, the intersection of the output of the following two functions @@ -1396,116 +1364,107 @@ extract_eigenvalue.decorana = function(ordination) ordination$evals #' @export #' #' @examples -#' data("GlobalPatterns") -#' gp.ch = subset_taxa(GlobalPatterns, Phylum == "Chlamydiae") +#' data('GlobalPatterns') +#' gp.ch = subset_taxa(GlobalPatterns, Phylum == 'Chlamydiae') #' mdf = psmelt(gp.ch) #' nrow(mdf) #' ncol(mdf) #' colnames(mdf) #' head(rownames(mdf)) #' # Create a ggplot similar to -#' library("ggplot2") +#' library('ggplot2') #' p = ggplot(mdf, aes(x=SampleType, y=Abundance, fill=Genus)) -#' p = p + geom_bar(color="black", stat="identity", position="stack") +#' p = p + geom_bar(color='black', stat='identity', position='stack') #' print(p) -psmelt = function(physeq){ +psmelt = function(physeq) { # Access covariate names from object, if present - if(!inherits(physeq, "phyloseq")){ + if (!inherits(physeq, "phyloseq")) { rankNames = NULL sampleVars = NULL } else { # Still might be NULL, but attempt access rankNames = rank_names(physeq, FALSE) - sampleVars = sample_variables(physeq, FALSE) + sampleVars = sample_variables(physeq, FALSE) } # Define reserved names - reservedVarnames = c("Sample", "Abundance", "OTU") - # type-1a conflict: between sample_data - # and reserved psmelt variable names + reservedVarnames = c("Sample", "Abundance", "OTU") + # type-1a conflict: between sample_data and reserved psmelt variable names type1aconflict = intersect(reservedVarnames, sampleVars) - if(length(type1aconflict) > 0){ + if (length(type1aconflict) > 0) { wh1a = which(sampleVars %in% type1aconflict) new1a = paste0("sample_", sampleVars[wh1a]) # First warn about the change - warning("The sample variables: \n", - paste(sampleVars[wh1a], collapse=", "), - "\n have been renamed to: \n", - paste0(new1a, collapse=", "), "\n", - "to avoid conflicts with special phyloseq plot attribute names.") + warning("The sample variables: \n", paste(sampleVars[wh1a], collapse = ", "), + "\n have been renamed to: \n", paste0(new1a, collapse = ", "), "\n", + "to avoid conflicts with special phyloseq plot attribute names.") # Rename the sample variables. colnames(sample_data(physeq))[wh1a] <- new1a } - # type-1b conflict: between tax_table - # and reserved psmelt variable names + # type-1b conflict: between tax_table and reserved psmelt variable names type1bconflict = intersect(reservedVarnames, rankNames) - if(length(type1bconflict) > 0){ + if (length(type1bconflict) > 0) { wh1b = which(rankNames %in% type1bconflict) new1b = paste0("taxa_", rankNames[wh1b]) # First warn about the change - warning("The rank names: \n", - paste(rankNames[wh1b], collapse=", "), - "\n have been renamed to: \n", - paste0(new1b, collapse=", "), "\n", - "to avoid conflicts with special phyloseq plot attribute names.") + warning("The rank names: \n", paste(rankNames[wh1b], collapse = ", "), "\n have been renamed to: \n", + paste0(new1b, collapse = ", "), "\n", "to avoid conflicts with special phyloseq plot attribute names.") # Rename the conflicting taxonomic ranks colnames(tax_table(physeq))[wh1b] <- new1b } # type-2 conflict: internal between tax_table and sample_data type2conflict = intersect(sampleVars, rankNames) - if(length(type2conflict) > 0){ + if (length(type2conflict) > 0) { wh2 = which(sampleVars %in% type2conflict) new2 = paste0("sample_", sampleVars[wh2]) # First warn about the change - warning("The sample variables: \n", - paste0(sampleVars[wh2], collapse=", "), - "\n have been renamed to: \n", - paste0(new2, collapse=", "), "\n", - "to avoid conflicts with taxonomic rank names.") + warning("The sample variables: \n", paste0(sampleVars[wh2], collapse = ", "), + "\n have been renamed to: \n", paste0(new2, collapse = ", "), "\n", "to avoid conflicts with taxonomic rank names.") # Rename the sample variables colnames(sample_data(physeq))[wh2] <- new2 } - # Enforce OTU table orientation. Redundant-looking step - # supports "naked" otu_table as `physeq` input. + # Enforce OTU table orientation. Redundant-looking step supports 'naked' + # otu_table as `physeq` input. otutab = otu_table(physeq) - if(!taxa_are_rows(otutab)){otutab <- t(otutab)} + if (!taxa_are_rows(otutab)) { + otutab <- t(otutab) + } # Melt the OTU table: wide form to long form table mdf = reshape2::melt(as(otutab, "matrix")) colnames(mdf)[1] <- "OTU" colnames(mdf)[2] <- "Sample" colnames(mdf)[3] <- "Abundance" - # Row and Col names are coerced to integer or factor if possible. - # Do not want this. Coerce these to character. - # e.g. `OTU` should always be discrete, even if OTU ID values can be coerced to integer + # Row and Col names are coerced to integer or factor if possible. Do not want + # this. Coerce these to character. e.g. `OTU` should always be discrete, even if + # OTU ID values can be coerced to integer mdf$OTU <- as.character(mdf$OTU) mdf$Sample <- as.character(mdf$Sample) # Merge the sample data.frame if present - if(!is.null(sampleVars)){ - sdf = data.frame(sample_data(physeq), stringsAsFactors=FALSE) + if (!is.null(sampleVars)) { + sdf = data.frame(sample_data(physeq), stringsAsFactors = FALSE) sdf$Sample <- sample_names(physeq) # merge the sample-data and the melted otu table - mdf <- merge(mdf, sdf, by.x="Sample") + mdf <- merge(mdf, sdf, by.x = "Sample") } # Next merge taxonomy data, if present - if(!is.null(rankNames)){ + if (!is.null(rankNames)) { TT = access(physeq, "tax_table") # First, check for empty TT columns (all NA) keepTTcols <- colSums(is.na(TT)) < ntaxa(TT) # Protect against all-empty columns, or col-less matrix - if(length(which(keepTTcols)) > 0 & ncol(TT) > 0){ + if (length(which(keepTTcols)) > 0 & ncol(TT) > 0) { # Remove the empty columns TT <- TT[, keepTTcols] - # Add TT to the "psmelt" data.frame - tdf = data.frame(TT, OTU=taxa_names(physeq)) - # Now add to the "psmelt" output data.frame, `mdf` - mdf <- merge(mdf, tdf, by.x="OTU") + # Add TT to the 'psmelt' data.frame + tdf = data.frame(TT, OTU = taxa_names(physeq)) + # Now add to the 'psmelt' output data.frame, `mdf` + mdf <- merge(mdf, tdf, by.x = "OTU") } } # Sort the entries by abundance - mdf = mdf[order(mdf$Abundance, decreasing=TRUE), ] + mdf = mdf[order(mdf$Abundance, decreasing = TRUE), ] return(mdf) } -################################################################################ -################################################################################ +################################################################################ #' A flexible, informative barplot phyloseq data #' #' There are many useful examples of phyloseq barplot graphics in the @@ -1517,7 +1476,7 @@ psmelt = function(physeq){ #' summary graphics of the differences in taxa abundance between samples in #' an experiment. #' -#' @usage plot_bar(physeq, x="Sample", y="Abundance", fill=NULL, +#' @usage plot_bar(physeq, x='Sample', y='Abundance', fill=NULL, #' title=NULL, facet_grid=NULL) #' #' @param physeq (Required). An \code{\link{otu_table-class}} or @@ -1531,7 +1490,7 @@ psmelt = function(physeq){ #' #' @param y (Optional). A character string. #' The variable in the melted-data that should be mapped to the y-axis. -#' Typically this will be \code{"Abundance"}, in order to +#' Typically this will be \code{'Abundance'}, in order to #' quantitatively display the abundance values for each OTU/group. #' However, alternative variables could be used instead, #' producing a very different, though possibly still informative, plot. @@ -1567,46 +1526,42 @@ psmelt = function(physeq){ #' @export #' #' @examples -#' data("GlobalPatterns") -#' gp.ch = subset_taxa(GlobalPatterns, Phylum == "Chlamydiae") +#' data('GlobalPatterns') +#' gp.ch = subset_taxa(GlobalPatterns, Phylum == 'Chlamydiae') #' plot_bar(gp.ch) -#' plot_bar(gp.ch, fill="Genus") -#' plot_bar(gp.ch, x="SampleType", fill="Genus") -#' plot_bar(gp.ch, "SampleType", fill="Genus", facet_grid=~Family) +#' plot_bar(gp.ch, fill='Genus') +#' plot_bar(gp.ch, x='SampleType', fill='Genus') +#' plot_bar(gp.ch, 'SampleType', fill='Genus', facet_grid=~Family) #' # See additional examples in the plot_bar online tutorial. Link above. -plot_bar = function(physeq, x="Sample", y="Abundance", fill=NULL, - title=NULL, facet_grid=NULL){ - - # Start by melting the data in the "standard" way using psmelt. - mdf = psmelt(physeq) - - # Build the plot data structure - p = ggplot(mdf, aes_string(x=x, y=y, fill=fill)) - - # Add the bar geometric object. Creates a basic graphic. Basis for the rest. - # Test weather additional - p = p + geom_bar(stat="identity", position="stack", color="black") - - # By default, rotate the x-axis labels (they might be long) - p = p + theme(axis.text.x=element_text(angle=-90, hjust=0)) - - # Add faceting, if given - if( !is.null(facet_grid) ){ - p <- p + facet_grid(facet_grid) - } - - # Optionally add a title to the plot - if( !is.null(title) ){ - p <- p + ggtitle(title) - } - - return(p) +plot_bar = function(physeq, x = "Sample", y = "Abundance", fill = NULL, title = NULL, + facet_grid = NULL) { + + # Start by melting the data in the 'standard' way using psmelt. + mdf = psmelt(physeq) + + # Build the plot data structure + p = ggplot(mdf, aes_string(x = x, y = y, fill = fill)) + + # Add the bar geometric object. Creates a basic graphic. Basis for the rest. + # Test weather additional + p = p + geom_bar(stat = "identity", position = "stack", color = "black") + + # By default, rotate the x-axis labels (they might be long) + p = p + theme(axis.text.x = element_text(angle = -90, hjust = 0)) + + # Add faceting, if given + if (!is.null(facet_grid)) { + p <- p + facet_grid(facet_grid) + } + + # Optionally add a title to the plot + if (!is.null(title)) { + p <- p + ggtitle(title) + } + + return(p) } -################################################################################ -################################################################################ -# plot_tree section. -################################################################################ -################################################################################ +################################################################################ plot_tree section. #' Returns a data table defining the line segments of a phylogenetic tree. #' #' This function takes a \code{\link{phylo}} or \code{\link{phyloseq-class}} object @@ -1619,16 +1574,16 @@ plot_bar = function(physeq, x="Sample", y="Abundance", fill=NULL, #' suitable for plotting with \code{\link[ggplot2]{ggplot}}2. #' #' @param ladderize (Optional). Boolean or character string (either -#' \code{FALSE}, \code{TRUE}, or \code{"left"}). +#' \code{FALSE}, \code{TRUE}, or \code{'left'}). #' Default is \code{FALSE} (no ladderization). #' This parameter specifies whether or not to \code{\link[ape]{ladderize}} the tree #' (i.e., reorder nodes according to the depth of their enclosed #' subtrees) prior to plotting. #' This tends to make trees more aesthetically pleasing and legible in #' a graphical display. -#' When \code{TRUE} or \code{"right"}, ``right'' ladderization is used. +#' When \code{TRUE} or \code{'right'}, ``right'' ladderization is used. #' When set to \code{FALSE}, no ladderization is applied. -#' When set to \code{"left"}, the reverse direction +#' When set to \code{'left'}, the reverse direction #' (``left'' ladderization) is applied. #' #' @return @@ -1655,10 +1610,10 @@ plot_bar = function(physeq, x="Sample", y="Abundance", fill=NULL, #' @importFrom data.table setkey #' @export #' @examples -#' library("ggplot2") -#' data("esophagus") +#' library('ggplot2') +#' data('esophagus') #' phy = phy_tree(esophagus) -#' phy <- ape::root(phy, "65_2_5", resolve.root=TRUE) +#' phy <- ape::root(phy, '65_2_5', resolve.root=TRUE) #' treeSegs0 = tree_layout(phy) #' treeSegs1 = tree_layout(esophagus) #' edgeMap = aes(x=xleft, xend=xright, y=y, yend=y) @@ -1667,32 +1622,32 @@ plot_bar = function(physeq, x="Sample", y="Abundance", fill=NULL, #' p1 = ggplot(treeSegs1$edgeDT, edgeMap) + geom_segment() + geom_segment(vertMap, data=treeSegs1$vertDT) #' print(p0) #' print(p1) -#' plot_tree(esophagus, "treeonly") -#' plot_tree(esophagus, "treeonly", ladderize="left") -tree_layout = function(phy, ladderize=FALSE){ - if(inherits(phy, "phyloseq")){ +#' plot_tree(esophagus, 'treeonly') +#' plot_tree(esophagus, 'treeonly', ladderize='left') +tree_layout = function(phy, ladderize = FALSE) { + if (inherits(phy, "phyloseq")) { phy = phy_tree(phy) } - if(!inherits(phy, "phylo")){ + if (!inherits(phy, "phylo")) { stop("tree missing or invalid. Please check `phy` argument and try again.") } - if(is.null(phy$edge.length)){ + if (is.null(phy$edge.length)) { # If no edge lengths, set them all to value of 1 (dendrogram). - phy$edge.length <- rep(1L, times=nrow(phy$edge)) + phy$edge.length <- rep(1L, times = nrow(phy$edge)) } # Perform ladderizing, if requested - if(ladderize != FALSE){ - if(ladderize == "left"){ + if (ladderize != FALSE) { + if (ladderize == "left") { phy <- ladderize(phy, FALSE) - } else if(ladderize==TRUE | ladderize=="right"){ + } else if (ladderize == TRUE | ladderize == "right") { phy <- ladderize(phy, TRUE) } else { stop("You did not specify a supported option for argument `ladderize`.") } } - # 'z' is the tree in postorder order used in calls to .C - # Descending order of left-hand side of edge (the ancestor to the node) - z = reorder.phylo(phy, order="postorder") + # 'z' is the tree in postorder order used in calls to .C Descending order of + # left-hand side of edge (the ancestor to the node) + z = reorder.phylo(phy, order = "postorder") # Initialize some characteristics of the tree. Nedge = nrow(phy$edge)[1] Nnode = phy$Nnode @@ -1701,85 +1656,83 @@ tree_layout = function(phy, ladderize=FALSE){ TIPS = phy$edge[(phy$edge[, 2] <= Ntip), 2] NODES = (ROOT):(Ntip + Nnode) nodelabels = phy$node.label - # Call phyloseq-internal function that in-turn calls ape's internal - # horizontal position function, in C, using the re-ordered phylo object. + # Call phyloseq-internal function that in-turn calls ape's internal horizontal + # position function, in C, using the re-ordered phylo object. xx = ape_node_depth_edge_length(Ntip, Nnode, z$edge, Nedge, z$edge.length) # Initialize `yy`, before passing to ape internal function in C. yy <- numeric(Ntip + Nnode) yy[TIPS] <- 1:Ntip # Define the ape_node_height wrapping function - ape_node_height <- function(Ntip, Nnode, edge, Nedge, yy){ - .C(ape:::node_height, PACKAGE="ape", - as.integer(Ntip), as.integer(Nnode), - as.integer(edge[, 1]), as.integer(edge[, 2]), - as.integer(Nedge), as.double(yy))[[6]] - } - # The call in ape - #yy <- .nodeHeight(Ntip, Nnode, z$edge, Nedge, yy) + ape_node_height <- function(Ntip, Nnode, edge, Nedge, yy) { + .C(ape:::node_height, PACKAGE = "ape", as.integer(Ntip), as.integer(Nnode), + as.integer(edge[, 1]), as.integer(edge[, 2]), as.integer(Nedge), as.double(yy))[[6]] + } + # The call in ape yy <- .nodeHeight(Ntip, Nnode, z$edge, Nedge, yy) yy <- ape_node_height(Ntip, Nnode, z$edge, Nedge, yy) - # Initialize an edge data.table - # Don't set key, order matters - edgeDT = data.table(phy$edge, edge.length=phy$edge.length, OTU=NA_character_) + # Initialize an edge data.table Don't set key, order matters + edgeDT = data.table(phy$edge, edge.length = phy$edge.length, OTU = NA_character_) # Add tip.labels if present - if(!is.null(phy$tip.label)){ + if (!is.null(phy$tip.label)) { # Initialize OTU, set node (V2) as key, assign taxa_names as OTU label - edgeDT[, OTU:=NA_character_] + edgeDT[, `:=`(OTU, NA_character_)] setkey(edgeDT, V2) - edgeDT[V2 <= Ntip, OTU:=phy$tip.label] + edgeDT[V2 <= Ntip, `:=`(OTU, phy$tip.label)] } - # Add the mapping for each edge defined in `xx` and `yy` - edgeDT[, xleft:=xx[V1]] - edgeDT[, xright:=xx[V2]] - edgeDT[, y:=yy[V2]] + # Add the mapping for each edge defined in `xx` and `yy` + edgeDT[, `:=`(xleft, xx[V1])] + edgeDT[, `:=`(xright, xx[V2])] + edgeDT[, `:=`(y, yy[V2])] # Next define vertical segments - vertDT = edgeDT[, list(x=xleft[1], vmin=min(y), vmax=max(y)), by=V1, mult="last"] - if(!is.null(phy$node.label)){ + vertDT = edgeDT[, list(x = xleft[1], vmin = min(y), vmax = max(y)), by = V1, + mult = "last"] + if (!is.null(phy$node.label)) { # Add non-root node labels to edgeDT - edgeDT[V2 > ROOT, x:=xright] - edgeDT[V2 > ROOT, label:=phy$node.label[-1]] + edgeDT[V2 > ROOT, `:=`(x, xright)] + edgeDT[V2 > ROOT, `:=`(label, phy$node.label[-1])] # Add root label (first node label) to vertDT setkey(vertDT, V1) - vertDT[J(ROOT), y:=mean(c(vmin, vmax))] - vertDT[J(ROOT), label:=phy$node.label[1]] + vertDT[J(ROOT), `:=`(y, mean(c(vmin, vmax)))] + vertDT[J(ROOT), `:=`(label, phy$node.label[1])] } - return(list(edgeDT=edgeDT, vertDT=vertDT)) + return(list(edgeDT = edgeDT, vertDT = vertDT)) } -################################################################################ -# Define an internal function for determining what the text-size should be +################################################################################ Define an internal function for determining what the text-size should be #' @keywords internal -manytextsize <- function(n, mins=0.5, maxs=4, B=6, D=100){ - # empirically selected size-value calculator. - s <- B * exp(-n/D) - # enforce a floor. - s <- ifelse(s > mins, s, mins) - # enforce a max - s <- ifelse(s < maxs, s, maxs) - return(s) +manytextsize <- function(n, mins = 0.5, maxs = 4, B = 6, D = 100) { + # empirically selected size-value calculator. + s <- B * exp(-n/D) + # enforce a floor. + s <- ifelse(s > mins, s, mins) + # enforce a max + s <- ifelse(s < maxs, s, maxs) + return(s) } -################################################################################ -# Return TRUE if the nodes of the tree in the phyloseq object provided are unlabeled. +################################################################################ Return TRUE if the nodes of the tree in the phyloseq object provided are +################################################################################ unlabeled. #' @keywords internal -nodesnotlabeled = function(physeq){ - if(is.null(phy_tree(physeq, FALSE))){ - warning("There is no phylogenetic tree in the object you have provided. Try `phy_tree(physeq)` to see.") - return(TRUE) - } else { - return(is.null(phy_tree(physeq)$node.label) | length(phy_tree(physeq)$node.label)==0L) - } +nodesnotlabeled = function(physeq) { + if (is.null(phy_tree(physeq, FALSE))) { + warning("There is no phylogenetic tree in the object you have provided. Try `phy_tree(physeq)` to see.") + return(TRUE) + } else { + return(is.null(phy_tree(physeq)$node.label) | length(phy_tree(physeq)$node.label) == + 0L) + } } -# A quick test function to decide how nodes should be labeled by default, if at all. -# +# A quick test function to decide how nodes should be labeled by default, if at +# all. #' @keywords internal -howtolabnodes = function(physeq){ - if(!nodesnotlabeled(physeq)){ - # If the nodes are labeled, use a version of this function, taking into account `ntaxa`. - return(nodeplotdefault(manytextsize(ntaxa(physeq)))) - } else { +howtolabnodes = function(physeq) { + if (!nodesnotlabeled(physeq)) { + # If the nodes are labeled, use a version of this function, taking into account + # `ntaxa`. + return(nodeplotdefault(manytextsize(ntaxa(physeq)))) + } else { # Else, use `nodeplotblank`, which returns the ggplot object as-is. - return(nodeplotblank) - } + return(nodeplotblank) + } } -################################################################################ +################################################################################ #' Function to avoid plotting node labels #' #' Unlike, \code{\link{nodeplotdefault}} and \code{\link{nodeplotboot}}, @@ -1810,13 +1763,13 @@ howtolabnodes = function(physeq){ #' @import ggplot2 #' @export #' @examples -#' data("esophagus") +#' data('esophagus') #' plot_tree(esophagus) #' plot_tree(esophagus, nodelabf=nodeplotblank) -nodeplotblank = function(p, nodelabdf){ - return(p) +nodeplotblank = function(p, nodelabdf) { + return(p) } -################################################################################ +################################################################################ #' Generates a function for labeling bootstrap values on a phylogenetic tree. #' #' Is not a labeling function itself, but returns one. @@ -1871,36 +1824,36 @@ nodeplotblank = function(p, nodelabdf){ #' @examples #' nodeplotboot() #' nodeplotboot(3, -0.4) -nodeplotboot = function(highthresh=95L, lowcthresh=50L, size=2L, hjust=-0.2){ - function(p, nodelabdf){ - # For bootstrap, check that the node labels can be coerced to numeric - try(boot <- as(as(nodelabdf$label, "character"), "numeric"), TRUE) - # Want NAs/NaN to propagate, but still need to test remainder - goodboot = boot[complete.cases(boot)] - if( !is(goodboot, "numeric") & length(goodboot) > 0 ){ - stop("The node labels, phy_tree(physeq)$node.label, are not coercable to a numeric vector with any elements.") - } - # So they look even more like bootstraps and display well, - # force them to be between 0 and 100, rounded to 2 digits. - if( all( goodboot >= 0.0 & goodboot <= 1.0 ) ){ - boot = round(boot, 2)*100L - } - nodelabdf$boot = boot - boottop = subset(nodelabdf, boot >= highthresh) - bootmid = subset(nodelabdf, boot > lowcthresh & boot < highthresh) - # Label the high-confidence nodes with a point. - if( nrow(boottop)>0L ){ - p = p + geom_point(mapping=aes(x=x, y=y), data=boottop, na.rm=TRUE) - } - # Label the remaining bootstrap values as text at the nodes. - if( nrow(bootmid)>0L ){ - bootmid$label = bootmid$boot - p = nodeplotdefault(size, hjust)(p, bootmid) - } - return(p) - } +nodeplotboot = function(highthresh = 95L, lowcthresh = 50L, size = 2L, hjust = -0.2) { + function(p, nodelabdf) { + # For bootstrap, check that the node labels can be coerced to numeric + try(boot <- as(as(nodelabdf$label, "character"), "numeric"), TRUE) + # Want NAs/NaN to propagate, but still need to test remainder + goodboot = boot[complete.cases(boot)] + if (!is(goodboot, "numeric") & length(goodboot) > 0) { + stop("The node labels, phy_tree(physeq)$node.label, are not coercable to a numeric vector with any elements.") + } + # So they look even more like bootstraps and display well, force them to be + # between 0 and 100, rounded to 2 digits. + if (all(goodboot >= 0 & goodboot <= 1)) { + boot = round(boot, 2) * 100L + } + nodelabdf$boot = boot + boottop = subset(nodelabdf, boot >= highthresh) + bootmid = subset(nodelabdf, boot > lowcthresh & boot < highthresh) + # Label the high-confidence nodes with a point. + if (nrow(boottop) > 0L) { + p = p + geom_point(mapping = aes(x = x, y = y), data = boottop, na.rm = TRUE) + } + # Label the remaining bootstrap values as text at the nodes. + if (nrow(bootmid) > 0L) { + bootmid$label = bootmid$boot + p = nodeplotdefault(size, hjust)(p, bootmid) + } + return(p) + } } -################################################################################ +################################################################################ #' Generates a default node-label function #' #' Is not a labeling function itself, but returns one. @@ -1942,14 +1895,14 @@ nodeplotboot = function(highthresh=95L, lowcthresh=50L, size=2L, hjust=-0.2){ #' @examples #' nodeplotdefault() #' nodeplotdefault(3, -0.4) -nodeplotdefault = function(size=2L, hjust=-0.2){ - function(p, nodelabdf){ - p = p + geom_text(mapping=aes(x=x, y=y, label=label), data=nodelabdf, - size=size, hjust=hjust, na.rm=TRUE) - return(p) - } +nodeplotdefault = function(size = 2L, hjust = -0.2) { + function(p, nodelabdf) { + p = p + geom_text(mapping = aes(x = x, y = y, label = label), data = nodelabdf, + size = size, hjust = hjust, na.rm = TRUE) + return(p) + } } -################################################################################ +################################################################################ #' Plot a phylogenetic tree with optional annotations #' #' There are many useful examples of phyloseq tree graphics in the @@ -1982,11 +1935,11 @@ nodeplotdefault = function(size=2L, hjust=-0.2){ #' the \code{physeq} argument should also have a \code{\link{sample_data}} #' and/or \code{\link{tax_table}} component(s). #' -#' @param method (Optional). Character string. Default \code{"sampledodge"}. +#' @param method (Optional). Character string. Default \code{'sampledodge'}. #' The name of the annotation method to use. #' This will be expanded in future versions. -#' Currently only \code{"sampledodge"} and \code{"treeonly"} are supported. -#' The \code{"sampledodge"} option results in points +#' Currently only \code{'sampledodge'} and \code{'treeonly'} are supported. +#' The \code{'sampledodge'} option results in points #' drawn next to leaves if individuals from that taxa were observed, #' and a separate point is drawn for each sample. #' @@ -2013,7 +1966,7 @@ nodeplotdefault = function(size=2L, hjust=-0.2){ #' #' @param size (Optional). Character string. Default \code{NULL}. #' The name of the variable in \code{physeq} to map to point size. -#' A special argument \code{"abundance"} is reserved here and scales +#' A special argument \code{'abundance'} is reserved here and scales #' point size using abundance in each sample on a log scale. #' Supported options here also include the reserved special variables #' of \code{\link{psmelt}}. @@ -2027,7 +1980,7 @@ nodeplotdefault = function(size=2L, hjust=-0.2){ #' #' @param label.tips (Optional). Character string. Default is \code{NULL}, #' indicating that no tip labels will be printed. -#' If \code{"taxa_names"}, then the name of the taxa will be added +#' If \code{'taxa_names'}, then the name of the taxa will be added #' to the tree; either next to the leaves, or next to #' the set of points that label the leaves. Alternatively, #' if this is one of the rank names (from \code{rank_names(physeq)}), @@ -2059,16 +2012,16 @@ nodeplotdefault = function(size=2L, hjust=-0.2){ #' shrink this value. #' #' @param ladderize (Optional). Boolean or character string (either -#' \code{FALSE}, \code{TRUE}, or \code{"left"}). +#' \code{FALSE}, \code{TRUE}, or \code{'left'}). #' Default is \code{FALSE}. #' This parameter specifies whether or not to \code{\link[ape]{ladderize}} the tree #' (i.e., reorder nodes according to the depth of their enclosed #' subtrees) prior to plotting. #' This tends to make trees more aesthetically pleasing and legible in #' a graphical display. -#' When \code{TRUE} or \code{"right"}, ``right'' ladderization is used. +#' When \code{TRUE} or \code{'right'}, ``right'' ladderization is used. #' When set to \code{FALSE}, no ladderization is applied. -#' When set to \code{"left"}, the reverse direction +#' When set to \code{'left'}, the reverse direction #' (``left'' ladderization) is applied. #' This argument is passed on to \code{\link{tree_layout}}. #' @@ -2093,7 +2046,7 @@ nodeplotdefault = function(size=2L, hjust=-0.2){ #' #' @param justify (Optional). A character string indicating the #' type of justification to use on dodged points and tip labels. -#' A value of \code{"jagged"}, the default, results in +#' A value of \code{'jagged'}, the default, results in #' these tip-mapped elements being spaced as close to the tips as possible #' without gaps. #' Currently, any other value for \code{justify} results in @@ -2115,218 +2068,218 @@ nodeplotdefault = function(size=2L, hjust=-0.2){ #' @examples #' # # Using plot_tree() with the esophagus dataset. #' # # Please note that many more interesting examples are shown -#' # # in the online tutorials" +#' # # in the online tutorials' #' # # http://joey711.github.io/phyloseq/plot_tree-examples #' data(esophagus) #' # plot_tree(esophagus) -#' # plot_tree(esophagus, color="Sample") -#' # plot_tree(esophagus, size="Abundance") -#' # plot_tree(esophagus, size="Abundance", color="samples") -#' plot_tree(esophagus, size="Abundance", color="Sample", base.spacing=0.03) -################################################################################ -#' plot_tree(esophagus, size="abundance", color="samples", base.spacing=0.03) -plot_tree = function(physeq, method="sampledodge", nodelabf=NULL, - color=NULL, shape=NULL, size=NULL, - min.abundance=Inf, label.tips=NULL, text.size=NULL, - sizebase=5, base.spacing = 0.02, - ladderize=FALSE, plot.margin=0.2, title=NULL, - treetheme=NULL, justify="jagged"){ - ######################################## - # Support mis-capitalization of reserved variable names in color, shape, size - # This helps, for instance, with backward-compatibility where "abundance" - # was the reserved variable name for mapping OTU abundance entries - fix_reserved_vars = function(aesvar){ - aesvar <- gsub("^abundance[s]{0,}$", "Abundance", aesvar, ignore.case=TRUE) - aesvar <- gsub("^OTU[s]{0,}$", "OTU", aesvar, ignore.case=TRUE) - aesvar <- gsub("^taxa_name[s]{0,}$", "OTU", aesvar, ignore.case=TRUE) - aesvar <- gsub("^sample[s]{0,}$", "Sample", aesvar, ignore.case=TRUE) +#' # plot_tree(esophagus, color='Sample') +#' # plot_tree(esophagus, size='Abundance') +#' # plot_tree(esophagus, size='Abundance', color='samples') +#' plot_tree(esophagus, size='Abundance', color='Sample', base.spacing=0.03) +################################################################################ +#' plot_tree(esophagus, size='abundance', color='samples', base.spacing=0.03) +plot_tree = function(physeq, method = "sampledodge", nodelabf = NULL, color = NULL, + shape = NULL, size = NULL, min.abundance = Inf, label.tips = NULL, text.size = NULL, + sizebase = 5, base.spacing = 0.02, ladderize = FALSE, plot.margin = 0.2, title = NULL, + treetheme = NULL, justify = "jagged") { + ######################################## Support mis-capitalization of reserved variable names in color, shape, size + ######################################## This helps, for instance, with backward-compatibility where 'abundance' was the + ######################################## reserved variable name for mapping OTU abundance entries + fix_reserved_vars = function(aesvar) { + aesvar <- gsub("^abundance[s]{0,}$", "Abundance", aesvar, ignore.case = TRUE) + aesvar <- gsub("^OTU[s]{0,}$", "OTU", aesvar, ignore.case = TRUE) + aesvar <- gsub("^taxa_name[s]{0,}$", "OTU", aesvar, ignore.case = TRUE) + aesvar <- gsub("^sample[s]{0,}$", "Sample", aesvar, ignore.case = TRUE) return(aesvar) } - if(!is.null(label.tips)){label.tips <- fix_reserved_vars(label.tips)} - if(!is.null(color)){color <- fix_reserved_vars(color)} - if(!is.null(shape)){shape <- fix_reserved_vars(shape)} - if(!is.null(size) ){size <- fix_reserved_vars(size)} - ######################################## - if( is.null(phy_tree(physeq, FALSE)) ){ - stop("There is no phylogenetic tree in the object you have provided.\n", - "Try phy_tree(physeq) to see for yourself.") + if (!is.null(label.tips)) { + label.tips <- fix_reserved_vars(label.tips) + } + if (!is.null(color)) { + color <- fix_reserved_vars(color) + } + if (!is.null(shape)) { + shape <- fix_reserved_vars(shape) + } + if (!is.null(size)) { + size <- fix_reserved_vars(size) + } + ######################################## + if (is.null(phy_tree(physeq, FALSE))) { + stop("There is no phylogenetic tree in the object you have provided.\n", + "Try phy_tree(physeq) to see for yourself.") } - if(!inherits(physeq, "phyloseq")){ + if (!inherits(physeq, "phyloseq")) { # If only a phylogenetic tree, then only tree available to overlay. method <- "treeonly" } # Create the tree data.table - treeSegs <- tree_layout(phy_tree(physeq), ladderize=ladderize) - edgeMap = aes(x=xleft, xend=xright, y=y, yend=y) - vertMap = aes(x=x, xend=x, y=vmin, yend=vmax) - # Initialize phylogenetic tree. - # Naked, lines-only, unannotated tree as first layers. Edge (horiz) first, then vertical. - p = ggplot(data=treeSegs$edgeDT) + geom_segment(edgeMap) + - geom_segment(vertMap, data=treeSegs$vertDT) - # If no text.size given, calculate it from number of tips ("species", aka taxa) + treeSegs <- tree_layout(phy_tree(physeq), ladderize = ladderize) + edgeMap = aes(x = xleft, xend = xright, y = y, yend = y) + vertMap = aes(x = x, xend = x, y = vmin, yend = vmax) + # Initialize phylogenetic tree. Naked, lines-only, unannotated tree as first + # layers. Edge (horiz) first, then vertical. + p = ggplot(data = treeSegs$edgeDT) + geom_segment(edgeMap) + geom_segment(vertMap, + data = treeSegs$vertDT) + # If no text.size given, calculate it from number of tips ('species', aka taxa) # This is very fast. No need to worry about whether text is printed or not. - if(is.null(text.size)){ + if (is.null(text.size)) { text.size <- manytextsize(ntaxa(physeq)) } # Add the species labels to the right. - if(!is.null(label.tips) & method!="sampledodge"){ + if (!is.null(label.tips) & method != "sampledodge") { # If method is sampledodge, then labels are added to the right of points, later. # Add labels layer to plotting object. labelDT = treeSegs$edgeDT[!is.na(OTU), ] - if(!is.null(tax_table(object=physeq, errorIfNULL=FALSE))){ + if (!is.null(tax_table(object = physeq, errorIfNULL = FALSE))) { # If there is a taxonomy available, merge it with the label data.table - taxDT = data.table(tax_table(physeq), OTU=taxa_names(physeq), key="OTU") + taxDT = data.table(tax_table(physeq), OTU = taxa_names(physeq), key = "OTU") # Merge with taxonomy. - labelDT = merge(x=labelDT, y=taxDT, by="OTU") + labelDT = merge(x = labelDT, y = taxDT, by = "OTU") } - if(justify=="jagged"){ - # Tip label aesthetic mapping. - # Aesthetics can be NULL, and that aesthetic gets ignored. - labelMap <- aes_string(x="xright", y="y", label=label.tips, color=color) + if (justify == "jagged") { + # Tip label aesthetic mapping. Aesthetics can be NULL, and that aesthetic gets + # ignored. + labelMap <- aes_string(x = "xright", y = "y", label = label.tips, color = color) } else { # The left-justified version of tip-labels. - labelMap <- aes_string(x="max(xright, na.rm=TRUE)", y="y", label=label.tips, color=color) + labelMap <- aes_string(x = "max(xright, na.rm=TRUE)", y = "y", label = label.tips, + color = color) } - p <- p + geom_text(labelMap, data=labelDT, size=I(text.size), hjust=-0.1, na.rm=TRUE) - } - # Node label section. - # - # If no nodelabf ("node label function") given, ask internal function to pick one. - # Is NULL by default, meaning will dispatch to `howtolabnodes` to select function. - # For no node labels, the "dummy" function `nodeplotblank` will return tree plot - # object, p, as-is, unmodified. - if(is.null(nodelabf)){ + p <- p + geom_text(labelMap, data = labelDT, size = I(text.size), hjust = -0.1, + na.rm = TRUE) + } + # Node label section. If no nodelabf ('node label function') given, ask internal + # function to pick one. Is NULL by default, meaning will dispatch to + # `howtolabnodes` to select function. For no node labels, the 'dummy' function + # `nodeplotblank` will return tree plot object, p, as-is, unmodified. + if (is.null(nodelabf)) { nodelabf = howtolabnodes(physeq) } - #### set node `y` as the mean of the vertical segment - # Use the provided/inferred node label function to add the node labels layer(s) - # Non-root nodes first + #### set node `y` as the mean of the vertical segment Use the provided/inferred node + #### label function to add the node labels layer(s) Non-root nodes first p = nodelabf(p, treeSegs$edgeDT[!is.na(label), ]) # Add root label (if present) p = nodelabf(p, treeSegs$vertDT[!is.na(label), ]) # Theme specification - if(is.null(treetheme)){ + if (is.null(treetheme)) { # If NULL, then use the default tree theme. - treetheme <- theme(axis.ticks = element_blank(), - axis.title.x=element_blank(), axis.text.x=element_blank(), - axis.title.y=element_blank(), axis.text.y=element_blank(), - panel.background = element_blank(), - panel.grid.minor = element_blank(), - panel.grid.major = element_blank()) - } - if(inherits(treetheme, "theme")){ - # If a theme, add theme layer to plot. - # For all other cases, skip this, which will cause default theme to be used + treetheme <- theme(axis.ticks = element_blank(), axis.title.x = element_blank(), + axis.text.x = element_blank(), axis.title.y = element_blank(), axis.text.y = element_blank(), + panel.background = element_blank(), panel.grid.minor = element_blank(), + panel.grid.major = element_blank()) + } + if (inherits(treetheme, "theme")) { + # If a theme, add theme layer to plot. For all other cases, skip this, which + # will cause default theme to be used p <- p + treetheme } # Optionally add a title to the plot - if(!is.null(title)){ + if (!is.null(title)) { p <- p + ggtitle(title) - } - if(method!="sampledodge"){ + } + if (method != "sampledodge") { # If anything but a sampledodge tree, return now without further decorations. return(p) } - ######################################## - # Sample Dodge Section - # Special words, c("Sample", "Abundance", "OTU") - # See psmelt() - ######################################## - # Initialize the species/taxa/OTU data.table + ######################################## Sample Dodge Section Special words, c('Sample', 'Abundance', 'OTU') See + ######################################## psmelt() Initialize the species/taxa/OTU data.table dodgeDT = treeSegs$edgeDT[!is.na(OTU), ] # Merge with psmelt() result, to make all co-variables available - dodgeDT = merge(x=dodgeDT, y=data.table(psmelt(physeq), key="OTU"), by="OTU") - if(justify=="jagged"){ + dodgeDT = merge(x = dodgeDT, y = data.table(psmelt(physeq), key = "OTU"), by = "OTU") + if (justify == "jagged") { # Remove 0 Abundance value entries now, not later, for jagged. - dodgeDT <- dodgeDT[Abundance > 0, ] + dodgeDT <- dodgeDT[Abundance > 0, ] } # Set key. Changes `dodgeDT` in place. OTU is first key, always. - if( !is.null(color) | !is.null(shape) | !is.null(size) ){ + if (!is.null(color) | !is.null(shape) | !is.null(size)) { # If color, shape, or size is chosen, setkey by those as well - setkeyv(dodgeDT, cols=c("OTU", color, shape, size)) + setkeyv(dodgeDT, cols = c("OTU", color, shape, size)) } else { - # Else, set key by OTU and sample name. + # Else, set key by OTU and sample name. setkey(dodgeDT, OTU, Sample) } # Add sample-dodge horizontal adjustment index. In-place data.table assignment - dodgeDT[, h.adj.index := 1:length(xright), by=OTU] - # `base.spacing` is a user-input parameter. - # The sampledodge step size is based on this and the max `x` value - if(justify=="jagged"){ - dodgeDT[, xdodge:=(xright + h.adj.index * base.spacing * max(xright, na.rm=TRUE))] + dodgeDT[, `:=`(h.adj.index, 1:length(xright)), by = OTU] + # `base.spacing` is a user-input parameter. The sampledodge step size is based + # on this and the max `x` value + if (justify == "jagged") { + dodgeDT[, `:=`(xdodge, (xright + h.adj.index * base.spacing * max(xright, + na.rm = TRUE)))] } else { - # Left-justified version, `xdodge` always starts at the max of all `xright` values. - dodgeDT[, xdodge := max(xright, na.rm=TRUE) + h.adj.index * base.spacing * max(xright, na.rm=TRUE)] + # Left-justified version, `xdodge` always starts at the max of all `xright` + # values. + dodgeDT[, `:=`(xdodge, max(xright, na.rm = TRUE) + h.adj.index * base.spacing * + max(xright, na.rm = TRUE))] # zeroes removed only after all sample points have been mapped. dodgeDT <- dodgeDT[Abundance > 0, ] } - # The general tip-point map. Objects can be NULL, and that aesthetic gets ignored. - dodgeMap <- aes_string(x="xdodge", y="y", color=color, fill=color, - shape=shape, size=size) - p <- p + geom_point(dodgeMap, data=dodgeDT, na.rm=TRUE) + # The general tip-point map. Objects can be NULL, and that aesthetic gets + # ignored. + dodgeMap <- aes_string(x = "xdodge", y = "y", color = color, fill = color, shape = shape, + size = size) + p <- p + geom_point(dodgeMap, data = dodgeDT, na.rm = TRUE) # Adjust point size transform - if( !is.null(size) ){ - p <- p + scale_size_continuous(trans=log_trans(sizebase)) - } - # Optionally-add abundance value label to each point. - # User controls this by the `min.abundance` parameter. - # A value of `Inf` implies no labels. - if( any(dodgeDT$Abundance >= min.abundance[1]) ){ - pointlabdf = dodgeDT[Abundance>=min.abundance[1], ] - p <- p + geom_text(mapping=aes(xdodge, y, label=Abundance), - data=pointlabdf, size=text.size, na.rm=TRUE) + if (!is.null(size)) { + p <- p + scale_size_continuous(trans = log_trans(sizebase)) + } + # Optionally-add abundance value label to each point. User controls this by the + # `min.abundance` parameter. A value of `Inf` implies no labels. + if (any(dodgeDT$Abundance >= min.abundance[1])) { + pointlabdf = dodgeDT[Abundance >= min.abundance[1], ] + p <- p + geom_text(mapping = aes(xdodge, y, label = Abundance), data = pointlabdf, + size = text.size, na.rm = TRUE) } # If indicated, add the species labels to the right of dodged points. - if(!is.null(label.tips)){ - # `tiplabDT` has only one row per tip, the farthest horizontal - # adjusted position (one for each taxa) + if (!is.null(label.tips)) { + # `tiplabDT` has only one row per tip, the farthest horizontal adjusted position + # (one for each taxa) tiplabDT = dodgeDT - tiplabDT[, xfartiplab:=max(xdodge), by=OTU] - tiplabDT <- tiplabDT[h.adj.index==1, .SD, by=OTU] - if(!is.null(color)){ - if(color %in% sample_variables(physeq, errorIfNULL=FALSE)){ + tiplabDT[, `:=`(xfartiplab, max(xdodge)), by = OTU] + tiplabDT <- tiplabDT[h.adj.index == 1, .SD, by = OTU] + if (!is.null(color)) { + if (color %in% sample_variables(physeq, errorIfNULL = FALSE)) { color <- NULL } } labelMap <- NULL - if(justify=="jagged"){ - labelMap <- aes_string(x="xfartiplab", y="y", label=label.tips, color=color) + if (justify == "jagged") { + labelMap <- aes_string(x = "xfartiplab", y = "y", label = label.tips, + color = color) } else { - labelMap <- aes_string(x="max(xfartiplab, na.rm=TRUE)", y="y", label=label.tips, color=color) + labelMap <- aes_string(x = "max(xfartiplab, na.rm=TRUE)", y = "y", label = label.tips, + color = color) } # Add labels layer to plotting object. - p <- p + geom_text(labelMap, tiplabDT, size=I(text.size), hjust=-0.1, na.rm=TRUE) - } - # Plot margins. - # Adjust the tree graphic plot margins. - # Helps to manually ensure that graphic elements aren't clipped, - # especially when there are long tip labels. - min.x <- -0.01 # + dodgeDT[, min(c(xleft))] - max.x <- dodgeDT[, max(xright, na.rm=TRUE)] - if("xdodge" %in% names(dodgeDT)){ - max.x <- dodgeDT[, max(xright, xdodge, na.rm=TRUE)] - } - if(plot.margin > 0){ - max.x <- max.x * (1.0 + plot.margin) - } - p <- p + scale_x_continuous(limits=c(min.x, max.x)) + p <- p + geom_text(labelMap, tiplabDT, size = I(text.size), hjust = -0.1, + na.rm = TRUE) + } + # Plot margins. Adjust the tree graphic plot margins. Helps to manually ensure + # that graphic elements aren't clipped, especially when there are long tip + # labels. + min.x <- -0.01 # + dodgeDT[, min(c(xleft))] + max.x <- dodgeDT[, max(xright, na.rm = TRUE)] + if ("xdodge" %in% names(dodgeDT)) { + max.x <- dodgeDT[, max(xright, xdodge, na.rm = TRUE)] + } + if (plot.margin > 0) { + max.x <- max.x * (1 + plot.margin) + } + p <- p + scale_x_continuous(limits = c(min.x, max.x)) return(p) } -################################################################################ -################################################################################ -################################################################################ -# Adapted from NeatMap-package. -# Vectorized for speed and simplicity, also only calculates theta and not r. +################################################################################ Adapted from NeatMap-package. Vectorized for speed and simplicity, also only +################################################################################ calculates theta and not r. #' @keywords internal -RadialTheta <- function(pos){ - pos = as(pos, "matrix") - xc = mean(pos[, 1]) - yc = mean(pos[, 2]) - theta = atan2((pos[, 2] - yc), (pos[, 1] - xc)) - names(theta) <- rownames(pos) - return(theta) +RadialTheta <- function(pos) { + pos = as(pos, "matrix") + xc = mean(pos[, 1]) + yc = mean(pos[, 2]) + theta = atan2((pos[, 2] - yc), (pos[, 1] - xc)) + names(theta) <- rownames(pos) + return(theta) } -################################################################################ +################################################################################ #' Create an ecologically-organized heatmap using ggplot2 graphics #' #' There are many useful examples of phyloseq heatmap graphics in the @@ -2400,25 +2353,25 @@ RadialTheta <- function(pos){ #' @param low (Optional). A character string. An R color. #' See \code{?\link{colors}} for options support in R (there are lots). #' The color that represents the lowest non-zero value -#' in the heatmap. Default is a dark blue color, \code{"#000033"}. +#' in the heatmap. Default is a dark blue color, \code{'#000033'}. #' #' @param high (Optional). A character string. An R color. #' See \code{\link{colors}} for options support in R (there are lots). #' The color that will represent the highest -#' value in the heatmap. The default is \code{"#66CCFF"}. -#' Zero-values are treated as \code{NA}, and set to \code{"black"}, to represent +#' value in the heatmap. The default is \code{'#66CCFF'}. +#' Zero-values are treated as \code{NA}, and set to \code{'black'}, to represent #' a background color. #' #' @param na.value (Optional). A character string. An R color. #' See \code{\link{colors}} for options support in R (there are lots). #' The color to represent what is essentially the background of the plot, #' the non-observations that occur as \code{NA} or -#' \code{0} values in the abundance table. The default is \code{"black"}, which +#' \code{0} values in the abundance table. The default is \code{'black'}, which #' works well on computer-screen graphics devices, but may be a poor choice for -#' printers, in which case you might want this value to be \code{"white"}, and +#' printers, in which case you might want this value to be \code{'white'}, and #' reverse the values of \code{high} and \code{low}, above. #' -#' @param trans (Optional). \code{"trans"}-class transformer-definition object. +#' @param trans (Optional). \code{'trans'}-class transformer-definition object. #' A numerical transformer to use in #' the continuous color scale. See \code{\link[scales]{trans_new}} for details. #' The default is \code{\link{log_trans}(4)}. @@ -2493,209 +2446,203 @@ RadialTheta <- function(pos){ #' #' @export #' @examples -#' data("GlobalPatterns") -#' gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota") +#' data('GlobalPatterns') +#' gpac <- subset_taxa(GlobalPatterns, Phylum=='Crenarchaeota') #' # FYI, the base-R function uses a non-ecological ordering scheme, #' # but does add potentially useful hclust dendrogram to the sides... -#' gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota") +#' gpac <- subset_taxa(GlobalPatterns, Phylum=='Crenarchaeota') #' # Remove the nearly-empty samples (e.g. 10 reads or less) #' gpac = prune_samples(sample_sums(gpac) > 50, gpac) #' # Arbitrary order if method set to NULL -#' plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family") +#' plot_heatmap(gpac, method=NULL, sample.label='SampleType', taxa.label='Family') #' # Use ordination -#' plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family") +#' plot_heatmap(gpac, sample.label='SampleType', taxa.label='Family') #' # Use ordination for OTUs, but not sample-order -#' plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", sample.order="SampleType") +#' plot_heatmap(gpac, sample.label='SampleType', taxa.label='Family', sample.order='SampleType') #' # Specifying both orders omits any attempt to use ordination. The following should be the same. -#' p0 = plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", taxa.order="Phylum", sample.order="SampleType") -#' p1 = plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family", taxa.order="Phylum", sample.order="SampleType") +#' p0 = plot_heatmap(gpac, sample.label='SampleType', taxa.label='Family', taxa.order='Phylum', sample.order='SampleType') +#' p1 = plot_heatmap(gpac, method=NULL, sample.label='SampleType', taxa.label='Family', taxa.order='Phylum', sample.order='SampleType') #' #expect_equivalent(p0, p1) #' # Example: Order matters. Random ordering of OTU indices is difficult to interpret, even with structured sample order #' rando = sample(taxa_names(gpac), size=ntaxa(gpac), replace=FALSE) -#' plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family", taxa.order=rando, sample.order="SampleType") +#' plot_heatmap(gpac, method=NULL, sample.label='SampleType', taxa.label='Family', taxa.order=rando, sample.order='SampleType') #' # # Select the edges of each axis. #' # First, arbitrary edge, ordering #' plot_heatmap(gpac, method=NULL) #' # Second, biological-ordering (instead of default ordination-ordering), but arbitrary edge -#' plot_heatmap(gpac, taxa.order="Family", sample.order="SampleType") +#' plot_heatmap(gpac, taxa.order='Family', sample.order='SampleType') #' # Third, biological ordering, selected edges -#' plot_heatmap(gpac, taxa.order="Family", sample.order="SampleType", first.taxa="546313", first.sample="NP2") +#' plot_heatmap(gpac, taxa.order='Family', sample.order='SampleType', first.taxa='546313', first.sample='NP2') #' # Fourth, add meaningful labels -#' plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", taxa.order="Family", sample.order="SampleType", first.taxa="546313", first.sample="NP2") -plot_heatmap <- function(physeq, method="NMDS", distance="bray", - sample.label=NULL, taxa.label=NULL, - low="#000033", high="#66CCFF", na.value="black", trans=log_trans(4), - max.label=250, title=NULL, sample.order=NULL, taxa.order=NULL, - first.sample=NULL, first.taxa=NULL, ...){ - +#' plot_heatmap(gpac, sample.label='SampleType', taxa.label='Family', taxa.order='Family', sample.order='SampleType', first.taxa='546313', first.sample='NP2') +plot_heatmap <- function(physeq, method = "NMDS", distance = "bray", sample.label = NULL, + taxa.label = NULL, low = "#000033", high = "#66CCFF", na.value = "black", trans = log_trans(4), + max.label = 250, title = NULL, sample.order = NULL, taxa.order = NULL, first.sample = NULL, + first.taxa = NULL, ...) { + # User-override ordering - if( !is.null(taxa.order) & length(taxa.order)==1 ){ + if (!is.null(taxa.order) & length(taxa.order) == 1) { # Assume `taxa.order` is a tax_table variable. Use it for ordering. rankcol = which(rank_names(physeq) %in% taxa.order) taxmat = as(tax_table(physeq)[, 1:rankcol], "matrix") - taxa.order = apply(taxmat, 1, paste, sep="", collapse="") + taxa.order = apply(taxmat, 1, paste, sep = "", collapse = "") names(taxa.order) <- taxa_names(physeq) - taxa.order = names(sort(taxa.order, na.last=TRUE)) + taxa.order = names(sort(taxa.order, na.last = TRUE)) } - if( !is.null(sample.order) & length(sample.order)==1 ){ + if (!is.null(sample.order) & length(sample.order) == 1) { # Assume `sample.order` is a sample variable. Use it for ordering. sample.order = as.character(get_variable(physeq, sample.order)) names(sample.order) <- sample_names(physeq) - sample.order = names(sort(sample.order, na.last=TRUE)) + sample.order = names(sort(sample.order, na.last = TRUE)) } - if( !is.null(method) & (is.null(taxa.order) | is.null(sample.order)) ){ - # Only attempt NeatMap if method is non-NULL & at least one of - # taxa.order and sample.order is not-yet defined. - # If both axes orders pre-defined by user, no need to perform ordination... + if (!is.null(method) & (is.null(taxa.order) | is.null(sample.order))) { + # Only attempt NeatMap if method is non-NULL & at least one of taxa.order and + # sample.order is not-yet defined. If both axes orders pre-defined by user, no + # need to perform ordination... - # Copy the approach from NeatMap by doing ordination on samples, but use - # phyloseq-wrapped distance/ordination procedures. - # Reorder by the angle in radial coordinates on the 2-axis plane. + # Copy the approach from NeatMap by doing ordination on samples, but use + # phyloseq-wrapped distance/ordination procedures. Reorder by the angle in + # radial coordinates on the 2-axis plane. - # In case of NMDS iterations, capture the output so it isn't dumped on standard-out - junk = capture.output( ps.ord <- ordinate(physeq, method, distance, ...), file=NULL) - if( is.null(sample.order) ){ + # In case of NMDS iterations, capture the output so it isn't dumped on + # standard-out + junk = capture.output(ps.ord <- ordinate(physeq, method, distance, ...), + file = NULL) + if (is.null(sample.order)) { siteDF = NULL # Only define new ord-based sample order if user did not define one already - trash1 = try({siteDF <- scores(ps.ord, choices = c(1, 2), display="sites", physeq=physeq)}, - silent = TRUE) - if(inherits(trash1, "try-error")){ + trash1 = try({ + siteDF <- scores(ps.ord, choices = c(1, 2), display = "sites", physeq = physeq) + }, silent = TRUE) + if (inherits(trash1, "try-error")) { # warn that the attempt to get ordination coordinates for ordering failed. - warning("Attempt to access ordination coordinates for sample ordering failed.\n", - "Using default sample ordering.") + warning("Attempt to access ordination coordinates for sample ordering failed.\n", + "Using default sample ordering.") } - if(!is.null(siteDF)){ + if (!is.null(siteDF)) { # If the score accession seemed to work, go ahead and replace sample.order sample.order <- sample_names(physeq)[order(RadialTheta(siteDF))] } } - - if( is.null(taxa.order) ){ - # re-order species/taxa/OTUs, if possible, - # and only if user did not define an order already - specDF = NULL - trash2 = try({specDF <- scores(ps.ord, choices=c(1, 2), display="species", physeq=physeq)}, - silent = TRUE) - if(inherits(trash2, "try-error")){ - # warn that the attempt to get ordination coordinates for ordering failed. - warning("Attempt to access ordination coordinates for feature/species/taxa/OTU ordering failed.\n", - "Using default feature/species/taxa/OTU ordering.") - } - if(!is.null(specDF)){ - # If the score accession seemed to work, go ahead and replace sample.order - taxa.order = taxa_names(physeq)[order(RadialTheta(specDF))] - } - } - } + + if (is.null(taxa.order)) { + # re-order species/taxa/OTUs, if possible, and only if user did not define an + # order already + specDF = NULL + trash2 = try({ + specDF <- scores(ps.ord, choices = c(1, 2), display = "species", + physeq = physeq) + }, silent = TRUE) + if (inherits(trash2, "try-error")) { + # warn that the attempt to get ordination coordinates for ordering failed. + warning("Attempt to access ordination coordinates for feature/species/taxa/OTU ordering failed.\n", + "Using default feature/species/taxa/OTU ordering.") + } + if (!is.null(specDF)) { + # If the score accession seemed to work, go ahead and replace sample.order + taxa.order = taxa_names(physeq)[order(RadialTheta(specDF))] + } + } + } # Now that index orders are determined, check/assign edges of axes, if specified - if( !is.null(first.sample) ){ + if (!is.null(first.sample)) { sample.order = chunkReOrder(sample.order, first.sample) } - if( !is.null(first.taxa) ){ + if (!is.null(first.taxa)) { taxa.order = chunkReOrder(taxa.order, first.taxa) } - - # melt physeq with the standard user-accessible data melting function - # for creating plot-ready data.frames, psmelt. - # This is also used inside some of the other plot_* functions. - adf = psmelt(physeq) - # Coerce the main axis variables to character. - # Will reset them to factor if re-ordering is needed. - adf$OTU = as(adf$OTU, "character") - adf$Sample = as(adf$Sample, "character") - if( !is.null(sample.order) ){ - # If sample-order is available, coerce to factor with special level-order - adf$Sample = factor(adf$Sample, levels=sample.order) - } else { - # Make sure it is a factor, but with default order/levels - adf$Sample = factor(adf$Sample) - } - if( !is.null(taxa.order) ){ - # If OTU-order is available, coerce to factor with special level-order - adf$OTU = factor(adf$OTU, levels=taxa.order) - } else { - # Make sure it is a factor, but with default order/levels - adf$OTU = factor(adf$OTU) - } - - ## Now the plotting part - # Initialize p. - p = ggplot(adf, aes(x = Sample, y = OTU, fill=Abundance)) + - geom_raster() - - # # Don't render labels if more than max.label - # Samples - if( nsamples(physeq) <= max.label ){ - # Add resize layer for samples if there are fewer than max.label - p <- p + theme( - axis.text.x = element_text( - size=manytextsize(nsamples(physeq), 4, 30, 12), - angle=-90, vjust=0.5, hjust=0 - ) - ) - } else { - # Remove the labels from any rendering. - p = p + scale_x_discrete("Sample", labels="") - } - # OTUs - if( ntaxa(physeq) <= max.label ){ - # Add resize layer for OTUs if there are fewer than max.label - p <- p + theme( - axis.text.y = element_text( - size=manytextsize(ntaxa(physeq), 4, 30, 12) - ) - ) - } else { - # Remove the labels from any rendering. - p = p + scale_y_discrete("OTU", labels="") - } - - # # Axis Relabeling (Skipped if more than max.label): - # Re-write sample-labels to some sample variable... - if( !is.null(sample.label) & nsamples(physeq) <= max.label){ - # Make a sample-named char-vector of the values for sample.label - labvec = as(get_variable(physeq, sample.label), "character") - names(labvec) <- sample_names(physeq) - if( !is.null(sample.order) ){ - # Re-order according to sample.order - labvec = labvec[sample.order] - } - # Replace any NA (missing) values with "" instead. Avoid recycling labels. - labvec[is.na(labvec)] <- "" - # Add the sample.label re-labeling layer - p = p + scale_x_discrete(sample.label, labels=labvec) - } - if( !is.null(taxa.label) & ntaxa(physeq) <= max.label){ - # Make a OTU-named vector of the values for taxa.label - labvec <- as(tax_table(physeq)[, taxa.label], "character") - names(labvec) <- taxa_names(physeq) - if( !is.null(taxa.order) ){ - # Re-order according to taxa.order - labvec <- labvec[taxa.order] - } - # Replace any NA (missing) values with "" instead. Avoid recycling labels. - labvec[is.na(labvec)] <- "" - # Add the taxa.label re-labeling layer - p = p + scale_y_discrete(taxa.label, labels=labvec) - } - - # Color scale transformations - if( !is.null(trans) ){ - p = p + scale_fill_gradient(low=low, high=high, trans=trans, na.value=na.value) - } else { - p = p + scale_fill_gradient(low=low, high=high, na.value=na.value) - } - - # Optionally add a title to the plot - if( !is.null(title) ){ - p = p + ggtitle(title) - } - - return(p) + + # melt physeq with the standard user-accessible data melting function for + # creating plot-ready data.frames, psmelt. This is also used inside some of the + # other plot_* functions. + adf = psmelt(physeq) + # Coerce the main axis variables to character. Will reset them to factor if + # re-ordering is needed. + adf$OTU = as(adf$OTU, "character") + adf$Sample = as(adf$Sample, "character") + if (!is.null(sample.order)) { + # If sample-order is available, coerce to factor with special level-order + adf$Sample = factor(adf$Sample, levels = sample.order) + } else { + # Make sure it is a factor, but with default order/levels + adf$Sample = factor(adf$Sample) + } + if (!is.null(taxa.order)) { + # If OTU-order is available, coerce to factor with special level-order + adf$OTU = factor(adf$OTU, levels = taxa.order) + } else { + # Make sure it is a factor, but with default order/levels + adf$OTU = factor(adf$OTU) + } + + ## Now the plotting part Initialize p. + p = ggplot(adf, aes(x = Sample, y = OTU, fill = Abundance)) + geom_raster() + + # # Don't render labels if more than max.label Samples + if (nsamples(physeq) <= max.label) { + # Add resize layer for samples if there are fewer than max.label + p <- p + theme(axis.text.x = element_text(size = manytextsize(nsamples(physeq), + 4, 30, 12), angle = -90, vjust = 0.5, hjust = 0)) + } else { + # Remove the labels from any rendering. + p = p + scale_x_discrete("Sample", labels = "") + } + # OTUs + if (ntaxa(physeq) <= max.label) { + # Add resize layer for OTUs if there are fewer than max.label + p <- p + theme(axis.text.y = element_text(size = manytextsize(ntaxa(physeq), + 4, 30, 12))) + } else { + # Remove the labels from any rendering. + p = p + scale_y_discrete("OTU", labels = "") + } + + # # Axis Relabeling (Skipped if more than max.label): Re-write sample-labels to + # some sample variable... + if (!is.null(sample.label) & nsamples(physeq) <= max.label) { + # Make a sample-named char-vector of the values for sample.label + labvec = as(get_variable(physeq, sample.label), "character") + names(labvec) <- sample_names(physeq) + if (!is.null(sample.order)) { + # Re-order according to sample.order + labvec = labvec[sample.order] + } + # Replace any NA (missing) values with '' instead. Avoid recycling labels. + labvec[is.na(labvec)] <- "" + # Add the sample.label re-labeling layer + p = p + scale_x_discrete(sample.label, labels = labvec) + } + if (!is.null(taxa.label) & ntaxa(physeq) <= max.label) { + # Make a OTU-named vector of the values for taxa.label + labvec <- as(tax_table(physeq)[, taxa.label], "character") + names(labvec) <- taxa_names(physeq) + if (!is.null(taxa.order)) { + # Re-order according to taxa.order + labvec <- labvec[taxa.order] + } + # Replace any NA (missing) values with '' instead. Avoid recycling labels. + labvec[is.na(labvec)] <- "" + # Add the taxa.label re-labeling layer + p = p + scale_y_discrete(taxa.label, labels = labvec) + } + + # Color scale transformations + if (!is.null(trans)) { + p = p + scale_fill_gradient(low = low, high = high, trans = trans, na.value = na.value) + } else { + p = p + scale_fill_gradient(low = low, high = high, na.value = na.value) + } + + # Optionally add a title to the plot + if (!is.null(title)) { + p = p + ggtitle(title) + } + + return(p) } -################################################################################ +################################################################################ #' Chunk re-order a vector so that specified newstart is first. #' #' Different than relevel. @@ -2717,34 +2664,36 @@ plot_heatmap <- function(physeq, method="NMDS", distance="bray", #' # # This is also the default #' # all(chunkReOrder(10:25) == 10:25) #' # # An example with characters -#' # chunkReOrder(LETTERS, "G") -#' # chunkReOrder(LETTERS, "B") -#' # chunkReOrder(LETTERS, "Z") +#' # chunkReOrder(LETTERS, 'G') +#' # chunkReOrder(LETTERS, 'B') +#' # chunkReOrder(LETTERS, 'Z') #' # # What about when `newstart` is not in `x`? Return x as-is, throw warning. -#' # chunkReOrder(LETTERS, "g") -chunkReOrder = function(x, newstart = x[[1]]){ +#' # chunkReOrder(LETTERS, 'g') +chunkReOrder = function(x, newstart = x[[1]]) { pivot = match(newstart[1], x, nomatch = NA) # If pivot `is.na`, throw warning, return x as-is - if(is.na(pivot)){ + if (is.na(pivot)) { warning("The `newstart` argument was not in `x`. Returning `x` without reordering.") newx = x } else { - newx = c(tail(x, {length(x) - pivot + 1}), head(x, pivot - 1L)) + newx = c(tail(x, { + length(x) - pivot + 1 + }), head(x, pivot - 1L)) } return(newx) } -################################################################################ +################################################################################ #' Create a ggplot summary of gap statistic results #' #' @param clusgap (Required). -#' An object of S3 class \code{"clusGap"}, basically a list with components. +#' An object of S3 class \code{'clusGap'}, basically a list with components. #' See the \code{\link[cluster]{clusGap}} documentation for more details. #' In most cases this will be the output of \code{\link{gapstat_ord}}, #' or \code{\link[cluster]{clusGap}} if you called it directly. #' #' @param title (Optional). Character string. #' The main title for the graphic. -#' Default is \code{"Gap Statistic results"}. +#' Default is \code{'Gap Statistic results'}. #' #' @return #' A \code{\link[ggplot2]{ggplot}} plot object. @@ -2762,32 +2711,32 @@ chunkReOrder = function(x, newstart = x[[1]]){ #' @export #' @examples #' # Load and process data -#' data("soilrep") +#' data('soilrep') #' soilr = rarefy_even_depth(soilrep, rngseed=888) #' print(soilr) #' sample_variables(soilr) #' # Ordination -#' sord = ordinate(soilr, "DCA") +#' sord = ordinate(soilr, 'DCA') #' # Gap Statistic #' gs = gapstat_ord(sord, axes=1:4, verbose=FALSE) #' # Evaluate results with plots, etc. #' plot_scree(sord) -#' plot_ordination(soilr, sord, color="Treatment") +#' plot_ordination(soilr, sord, color='Treatment') #' plot_clusgap(gs) -#' print(gs, method="Tibs2001SEmax") +#' print(gs, method='Tibs2001SEmax') #' # Non-ordination example, use cluster::clusGap function directly -#' library("cluster") +#' library('cluster') #' pam1 = function(x, k){list(cluster = pam(x, k, cluster.only=TRUE))} #' gs.pam.RU = clusGap(ruspini, FUN = pam1, K.max = 8, B = 60) #' gs.pam.RU -#' plot(gs.pam.RU, main = "Gap statistic for the 'ruspini' data") -#' mtext("k = 4 is best .. and k = 5 pretty close") +#' plot(gs.pam.RU, main = 'Gap statistic for the 'ruspini' data') +#' mtext('k = 4 is best .. and k = 5 pretty close') #' plot_clusgap(gs.pam.RU) -plot_clusgap = function(clusgap, title="Gap Statistic results"){ - gstab = data.frame(clusgap$Tab, k = 1:nrow(clusgap$Tab)) - p = ggplot(gstab, aes(k, gap)) + geom_line() + geom_point(size = 5) - p = p + geom_errorbar(aes(ymax = gap + SE.sim, ymin = gap - SE.sim)) - p = p + ggtitle(title) - return(p) +plot_clusgap = function(clusgap, title = "Gap Statistic results") { + gstab = data.frame(clusgap$Tab, k = 1:nrow(clusgap$Tab)) + p = ggplot(gstab, aes(k, gap)) + geom_line() + geom_point(size = 5) + p = p + geom_errorbar(aes(ymax = gap + SE.sim, ymin = gap - SE.sim)) + p = p + ggtitle(title) + return(p) } -################################################################################ \ No newline at end of file +################################################################################ diff --git a/R/sampleData-class.R b/R/sampleData-class.R index 80c49eb8..b010fcb1 100644 --- a/R/sampleData-class.R +++ b/R/sampleData-class.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Build or access sample_data. #' #' This is the suggested method for both constructing and accessing a table @@ -40,29 +40,29 @@ #' @examples # #' data(soilrep) #' head(sample_data(soilrep)) -setGeneric("sample_data", function(object, errorIfNULL=TRUE) standardGeneric("sample_data")) +setGeneric("sample_data", function(object, errorIfNULL = TRUE) standardGeneric("sample_data")) #' @rdname sample_data-methods #' @aliases sample_data,ANY-method -setMethod("sample_data", "ANY", function(object, errorIfNULL=TRUE){ - access(object, "sam_data", errorIfNULL) +setMethod("sample_data", "ANY", function(object, errorIfNULL = TRUE) { + access(object, "sam_data", errorIfNULL) }) # constructor; for creating sample_data from a data.frame #' @rdname sample_data-methods #' @aliases sample_data,data.frame-method -setMethod("sample_data", "data.frame", function(object){ - # Make sure there are no phantom levels in categorical variables - object <- reconcile_categories(object) - - # instantiate first to check validity - SM <- new("sample_data", object) - - # Want dummy samples index names if missing - if( all(rownames(SM) == as.character(1:nrow(SM))) ){ - rownames(SM) <- paste("sa", 1:nrow(SM), sep="") - } - return(SM) +setMethod("sample_data", "data.frame", function(object) { + # Make sure there are no phantom levels in categorical variables + object <- reconcile_categories(object) + + # instantiate first to check validity + SM <- new("sample_data", object) + + # Want dummy samples index names if missing + if (all(rownames(SM) == as.character(1:nrow(SM)))) { + rownames(SM) <- paste("sa", 1:nrow(SM), sep = "") + } + return(SM) }) -################################################################################ +################################################################################ #' Cleans absent levels in sample_data/data.frame. #' #' This is used internally by the builder method, \code{\link{sample_data}}, to @@ -85,22 +85,22 @@ setMethod("sample_data", "data.frame", function(object){ #' # # # data(GlobalPatterns) #' # # # SM <- sample_data(GlobalPatterns) #' # # # DF <- data.frame(SM) -#' # # # DF <- data.frame(DF, col1=1:nrow(DF), col2=paste(1:nrow(DF), "t", sep="")) +#' # # # DF <- data.frame(DF, col1=1:nrow(DF), col2=paste(1:nrow(DF), 't', sep='')) #' # # # DF <- reconcile_categories(DF) #' # # # SM <- sample_data(reconcile_categories(SM)) #' # # # sapply(DF, class) #' # # # sapply(SM, class) -reconcile_categories <- function(DFSM){ - DF = as(DFSM, "data.frame") - #variable_classes <- sapply(DF, class) - #factor_cols <- names(variable_classes[variable_classes %in% c("factor", "character")]) - factor_cols = which(sapply(DF, inherits, what="factor")) - for( j in factor_cols){ - DF[, j] <- factor( as(DF[, j], "character") ) - } - return(DF) +reconcile_categories <- function(DFSM) { + DF = as(DFSM, "data.frame") + # variable_classes <- sapply(DF, class) factor_cols <- + # names(variable_classes[variable_classes %in% c('factor', 'character')]) + factor_cols = which(sapply(DF, inherits, what = "factor")) + for (j in factor_cols) { + DF[, j] <- factor(as(DF[, j], "character")) + } + return(DF) } -################################################################################ +################################################################################ #' Subset samples by sample_data expression #' #' This is a convenience wrapper around the \code{\link{subset}} function. @@ -136,20 +136,20 @@ reconcile_categories <- function(DFSM){ #' #' @examples #' # data(GlobalPatterns) -#' # subset_samples(GlobalPatterns, SampleType=="Ocean") -subset_samples <- function(physeq, ...){ - if( is.null(sample_data(physeq)) ){ - cat("Nothing subset. No sample_data in physeq.\n") - return(physeq) - } else { - oldDF <- as(sample_data(physeq), "data.frame") - newDF <- subset(oldDF, ...) - if( class(physeq) == "sample_data" ){ - return(sample_data(newDF)) - } else { - sample_data(physeq) <- sample_data(newDF) - return(physeq) - } - } +#' # subset_samples(GlobalPatterns, SampleType=='Ocean') +subset_samples <- function(physeq, ...) { + if (is.null(sample_data(physeq))) { + cat("Nothing subset. No sample_data in physeq.\n") + return(physeq) + } else { + oldDF <- as(sample_data(physeq), "data.frame") + newDF <- subset(oldDF, ...) + if (class(physeq) == "sample_data") { + return(sample_data(newDF)) + } else { + sample_data(physeq) <- sample_data(newDF) + return(physeq) + } + } } -################################################################################ +################################################################################ diff --git a/R/show-methods.R b/R/show-methods.R index d58bd9d7..26ed6c59 100644 --- a/R/show-methods.R +++ b/R/show-methods.R @@ -1,35 +1,31 @@ -############################################################################ +############################################################################ #' @rdname show-methods -setMethod("show", "otu_table", function(object){ - # print otu_table (always there). - cat(paste("OTU Table: [", ntaxa(object), " taxa and ", - nsamples(object), " samples]", sep = ""), fill = TRUE) - if( taxa_are_rows(object) ){ - cat(" taxa are rows", fill=TRUE) - } else { - cat(" taxa are columns", fill=TRUE) - } - show(as(object, "matrix")) +setMethod("show", "otu_table", function(object) { + # print otu_table (always there). + cat(paste("OTU Table: [", ntaxa(object), " taxa and ", nsamples(object), + " samples]", sep = ""), fill = TRUE) + if (taxa_are_rows(object)) { + cat(" taxa are rows", fill = TRUE) + } else { + cat(" taxa are columns", fill = TRUE) + } + show(as(object, "matrix")) }) -############################################################################ +############################################################################ #' @rdname show-methods -setMethod("show", "sample_data", function(object){ - cat(paste("Sample Data: [", dim(sample_data(object))[1], " samples by ", - dim(sample_data(object))[2], - " sample variables]:", sep = ""), - fill = TRUE) - show(as(object, "data.frame")) +setMethod("show", "sample_data", function(object) { + cat(paste("Sample Data: [", dim(sample_data(object))[1], " samples by ", + dim(sample_data(object))[2], " sample variables]:", sep = ""), fill = TRUE) + show(as(object, "data.frame")) }) -############################################################################ +############################################################################ #' @rdname show-methods -setMethod("show", "taxonomyTable", function(object){ - cat(paste("Taxonomy Table: [", dim(object)[1], " taxa by ", - dim(object)[2], - " taxonomic ranks]:", sep = ""), - fill = TRUE) - show(as(object, "matrix")) +setMethod("show", "taxonomyTable", function(object) { + cat(paste("Taxonomy Table: [", dim(object)[1], " taxa by ", dim(object)[2], + " taxonomic ranks]:", sep = ""), fill = TRUE) + show(as(object, "matrix")) }) -############################################################################ +############################################################################ #' method extensions to show for phyloseq objects. #' #' See the general documentation of \code{\link[methods]{show}} method for @@ -44,39 +40,38 @@ setMethod("show", "taxonomyTable", function(object){ #' # data(GlobalPatterns) #' # show(GlobalPatterns) #' # GlobalPatterns -setMethod("show", "phyloseq", function(object){ - cat("phyloseq-class experiment-level object", fill=TRUE) - # print otu_table (always there). - cat(paste("otu_table() OTU Table: [ ", ntaxa(otu_table(object)), " taxa and ", - nsamples(otu_table(object)), " samples ]", sep = ""), fill = TRUE) - - # print Sample Data if there - if(!is.null(sample_data(object, FALSE))){ - cat(paste("sample_data() Sample Data: [ ", dim(sample_data(object))[1], " samples by ", - dim(sample_data(object))[2], - " sample variables ]", sep = ""), fill = TRUE) - } - - # print tax Tab if there - if(!is.null(tax_table(object, FALSE))){ - cat(paste("tax_table() Taxonomy Table: [ ", dim(tax_table(object))[1], " taxa by ", - dim(tax_table(object))[2], - " taxonomic ranks ]", sep = ""), fill = TRUE) - } - - # print tree if there - if(!is.null(phy_tree(object, FALSE))){ - cat(paste("phy_tree() Phylogenetic Tree: [ ", ntaxa(phy_tree(object)), " tips and ", - phy_tree(object)$Nnode, - " internal nodes ]", sep = ""), - fill = TRUE - ) - } - - # print refseq summary if there - if(!is.null(refseq(object, FALSE))){ - cat(paste("refseq() ", class(refseq(object))[1], ": [ ", ntaxa(refseq(object)), " reference sequences ]", sep = ""), fill=TRUE) - } - +setMethod("show", "phyloseq", function(object) { + cat("phyloseq-class experiment-level object", fill = TRUE) + # print otu_table (always there). + cat(paste("otu_table() OTU Table: [ ", ntaxa(otu_table(object)), " taxa and ", + nsamples(otu_table(object)), " samples ]", sep = ""), fill = TRUE) + + # print Sample Data if there + if (!is.null(sample_data(object, FALSE))) { + cat(paste("sample_data() Sample Data: [ ", dim(sample_data(object))[1], + " samples by ", dim(sample_data(object))[2], " sample variables ]", sep = ""), + fill = TRUE) + } + + # print tax Tab if there + if (!is.null(tax_table(object, FALSE))) { + cat(paste("tax_table() Taxonomy Table: [ ", dim(tax_table(object))[1], + " taxa by ", dim(tax_table(object))[2], " taxonomic ranks ]", sep = ""), + fill = TRUE) + } + + # print tree if there + if (!is.null(phy_tree(object, FALSE))) { + cat(paste("phy_tree() Phylogenetic Tree: [ ", ntaxa(phy_tree(object)), + " tips and ", phy_tree(object)$Nnode, " internal nodes ]", sep = ""), + fill = TRUE) + } + + # print refseq summary if there + if (!is.null(refseq(object, FALSE))) { + cat(paste("refseq() ", class(refseq(object))[1], ": [ ", ntaxa(refseq(object)), + " reference sequences ]", sep = ""), fill = TRUE) + } + }) -############################################################################ +############################################################################ diff --git a/R/taxonomyTable-class.R b/R/taxonomyTable-class.R index ce77d01b..19f0cf7f 100644 --- a/R/taxonomyTable-class.R +++ b/R/taxonomyTable-class.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Build or access the taxonomyTable. #' #' This is the suggested method for both constructing and accessing a table of @@ -36,48 +36,45 @@ #' @export #' #' @examples # -#' # tax1 <- tax_table(matrix("abc", 30, 8)) +#' # tax1 <- tax_table(matrix('abc', 30, 8)) #' # data(GlobalPatterns) #' # tax_table(GlobalPatterns) -setGeneric("tax_table", function(object, errorIfNULL=TRUE) standardGeneric("tax_table")) +setGeneric("tax_table", function(object, errorIfNULL = TRUE) standardGeneric("tax_table")) #' @rdname tax_table-methods #' @aliases tax_table,ANY-method -setMethod("tax_table", "ANY", function(object, errorIfNULL=TRUE){ - access(object, "tax_table", errorIfNULL) +setMethod("tax_table", "ANY", function(object, errorIfNULL = TRUE) { + access(object, "tax_table", errorIfNULL) }) # Constructor; for creating taxonomyTable from a matrix. #' @rdname tax_table-methods #' @aliases tax_table,matrix-method -setMethod("tax_table", "matrix", function(object){ +setMethod("tax_table", "matrix", function(object) { # Want dummy species/taxa index names if missing - if(is.null(rownames(object))){ - rownames(object) <- paste("sp", 1:nrow(object), sep="") + if (is.null(rownames(object))) { + rownames(object) <- paste("sp", 1:nrow(object), sep = "") } - if(is.null(colnames(object))){ - colnames(object) <- paste("ta", 1:ncol(object), sep="") - } - # instantiate as taxonomyTable - return(new("taxonomyTable", object)) + if (is.null(colnames(object))) { + colnames(object) <- paste("ta", 1:ncol(object), sep = "") + } + # instantiate as taxonomyTable + return(new("taxonomyTable", object)) }) # Constructor; coerce to matrix, then pass on for creating taxonomyTable. #' @rdname tax_table-methods #' @aliases tax_table,data.frame-method -setMethod("tax_table", "data.frame", function(object){ - # Warn first +setMethod("tax_table", "data.frame", function(object) { + # Warn first text = "Coercing from data.frame class to character matrix \n" text = paste0(text, "prior to building taxonomyTable. \n") text = paste0(text, "This could introduce artifacts. \n") text = paste0(text, "Check your taxonomyTable, or coerce to matrix manually.") - warning(text) - # Coerce everything to a matrix, then char-vector, then back to matrix. - TT <- matrix(as(as(object, "matrix"), "character"), - nrow=nrow(object), - ncol=ncol(object) - ) - # Pass on to matrix-method. - tax_table(TT) + warning(text) + # Coerce everything to a matrix, then char-vector, then back to matrix. + TT <- matrix(as(as(object, "matrix"), "character"), nrow = nrow(object), ncol = ncol(object)) + # Pass on to matrix-method. + tax_table(TT) }) -################################################################################ +################################################################################ #' Subset species by taxonomic expression #' #' This is a convenience wrapper around the \code{\link{subset}} function. @@ -111,22 +108,22 @@ setMethod("tax_table", "data.frame", function(object){ #' @export #' #' @examples -#' ## ex3 <- subset_taxa(GlobalPatterns, Phylum=="Bacteroidetes") -subset_taxa <- function(physeq, ...){ - if( is.null(tax_table(physeq)) ){ - cat("Nothing subset. No taxonomyTable in physeq.\n") - return(physeq) - } else { - oldMA <- as(tax_table(physeq), "matrix") - oldDF <- data.frame(oldMA) - newDF <- subset(oldDF, ...) - newMA <- as(newDF, "matrix") - if( inherits(physeq, "taxonomyTable") ){ - return(tax_table(newMA)) - } else { - tax_table(physeq) <- tax_table(newMA) - return(physeq) - } - } +#' ## ex3 <- subset_taxa(GlobalPatterns, Phylum=='Bacteroidetes') +subset_taxa <- function(physeq, ...) { + if (is.null(tax_table(physeq))) { + cat("Nothing subset. No taxonomyTable in physeq.\n") + return(physeq) + } else { + oldMA <- as(tax_table(physeq), "matrix") + oldDF <- data.frame(oldMA) + newDF <- subset(oldDF, ...) + newMA <- as(newDF, "matrix") + if (inherits(physeq, "taxonomyTable")) { + return(tax_table(newMA)) + } else { + tax_table(physeq) <- tax_table(newMA) + return(physeq) + } + } } -################################################################################ +################################################################################ diff --git a/R/transform_filter-methods.R b/R/transform_filter-methods.R index 9b26ce58..b91a6757 100644 --- a/R/transform_filter-methods.R +++ b/R/transform_filter-methods.R @@ -1,9 +1,6 @@ -################################################################################ -# Function to create subsampled dataset -# in which each sample has same number of total observations/counts/reads -# Note that the subsampling is random, so some noise is introduced making the -# relative abundances slightly different -################################################################################ +################################################################################ Function to create subsampled dataset in which each sample has same number of +################################################################################ total observations/counts/reads Note that the subsampling is random, so some +################################################################################ noise is introduced making the relative abundances slightly different #' Resample an OTU table such that all samples have the same library size. #' #' Please note that the authors of phyloseq do not advocate using this @@ -107,137 +104,132 @@ #' #' @examples #' # Test with esophagus dataset -#' data("esophagus") +#' data('esophagus') #' esorepT = rarefy_even_depth(esophagus, replace=TRUE) #' esorepF = rarefy_even_depth(esophagus, replace=FALSE) #' sample_sums(esophagus) #' sample_sums(esorepT) #' sample_sums(esorepF) #' ## NRun Manually: Too slow! -#' # data("GlobalPatterns") +#' # data('GlobalPatterns') #' # GPrepT = rarefy_even_depth(GlobalPatterns, 1E5, replace=TRUE) #' ## Actually just this one is slow #' # system.time(GPrepF <- rarefy_even_depth(GlobalPatterns, 1E5, replace=FALSE)) -rarefy_even_depth <- function(physeq, sample.size=min(sample_sums(physeq)), - rngseed=FALSE, replace=TRUE, trimOTUs=TRUE, verbose=TRUE){ - - if( as(rngseed, "logical") ){ - # Now call the set.seed using the value expected in phyloseq - set.seed(rngseed) - if(verbose){ +rarefy_even_depth <- function(physeq, sample.size = min(sample_sums(physeq)), rngseed = FALSE, + replace = TRUE, trimOTUs = TRUE, verbose = TRUE) { + + if (as(rngseed, "logical")) { + # Now call the set.seed using the value expected in phyloseq + set.seed(rngseed) + if (verbose) { # Print to screen this value message("`set.seed(", rngseed, ")` was used to initialize repeatable random subsampling.") message("Please record this for your records so others can reproduce.") - message("Try `set.seed(", rngseed,"); .Random.seed` for the full vector", sep="") - message("...") + message("Try `set.seed(", rngseed, "); .Random.seed` for the full vector", + sep = "") + message("...") } - } else if(verbose){ - message("You set `rngseed` to FALSE. Make sure you've set & recorded\n", - " the random seed of your session for reproducibility.\n", - "See `?set.seed`\n") - message("...") - } - - # Make sure sample.size is of length 1. - if( length(sample.size) > 1 ){ - warning("`sample.size` had more than one value. ", - "Using only the first. \n ... \n") - sample.size <- sample.size[1] - } - - if( sample.size <= 0 ){ - stop("sample.size less than or equal to zero. ", - "Need positive sample size to work.") - } - - # Instead of warning, expected behavior now is to prune samples - # that have fewer reads than `sample.size` - if( min(sample_sums(physeq)) < sample.size ){ - rmsamples = sample_names(physeq)[sample_sums(physeq) < sample.size] - if(verbose){ - message(length(rmsamples), " samples removed", - "because they contained fewer reads than `sample.size`.") + } else if (verbose) { + message("You set `rngseed` to FALSE. Make sure you've set & recorded\n", + " the random seed of your session for reproducibility.\n", "See `?set.seed`\n") + message("...") + } + + # Make sure sample.size is of length 1. + if (length(sample.size) > 1) { + warning("`sample.size` had more than one value. ", "Using only the first. \n ... \n") + sample.size <- sample.size[1] + } + + if (sample.size <= 0) { + stop("sample.size less than or equal to zero. ", "Need positive sample size to work.") + } + + # Instead of warning, expected behavior now is to prune samples that have fewer + # reads than `sample.size` + if (min(sample_sums(physeq)) < sample.size) { + rmsamples = sample_names(physeq)[sample_sums(physeq) < sample.size] + if (verbose) { + message(length(rmsamples), " samples removed", "because they contained fewer reads than `sample.size`.") message("Up to first five removed samples are: \n") - message(rmsamples[1:min(5, length(rmsamples))], sep="\t") - message("...") + message(rmsamples[1:min(5, length(rmsamples))], sep = "\t") + message("...") } - # Now done with notifying user of pruning, actually prune. - physeq = prune_samples(setdiff(sample_names(physeq), rmsamples), physeq) - } - # initialize the subsamples phyloseq instance, newsub - newsub <- physeq - # enforce orientation as species-are-rows, for assignment - if(!taxa_are_rows(newsub)){newsub <- t(newsub)} - # apply through each sample, and replace - newotu <- apply(otu_table(newsub), 2, rarefaction_subsample, - sample.size=sample.size, replace=replace) - # Add OTU names to the row indices - rownames(newotu) <- taxa_names(physeq) - # replace the otu_table. - otu_table(newsub) <- otu_table(newotu, TRUE) - if(trimOTUs){ - # Check for and remove empty OTUs - # 1. Notify user of empty OTUs being cut. - # 2. Cut empty OTUs + # Now done with notifying user of pruning, actually prune. + physeq = prune_samples(setdiff(sample_names(physeq), rmsamples), physeq) + } + # initialize the subsamples phyloseq instance, newsub + newsub <- physeq + # enforce orientation as species-are-rows, for assignment + if (!taxa_are_rows(newsub)) { + newsub <- t(newsub) + } + # apply through each sample, and replace + newotu <- apply(otu_table(newsub), 2, rarefaction_subsample, sample.size = sample.size, + replace = replace) + # Add OTU names to the row indices + rownames(newotu) <- taxa_names(physeq) + # replace the otu_table. + otu_table(newsub) <- otu_table(newotu, TRUE) + if (trimOTUs) { + # Check for and remove empty OTUs 1. Notify user of empty OTUs being cut. 2. Cut + # empty OTUs rmtaxa = taxa_names(newsub)[taxa_sums(newsub) <= 0] - if( length(rmtaxa) > 0 ){ - if(verbose){ - message(length(rmtaxa), "OTUs were removed because they are no longer \n", - "present in any sample after random subsampling\n") + if (length(rmtaxa) > 0) { + if (verbose) { + message(length(rmtaxa), "OTUs were removed because they are no longer \n", + "present in any sample after random subsampling\n") message("...") } newsub = prune_taxa(setdiff(taxa_names(newsub), rmtaxa), newsub) } } - # If the OTU table was transposed before rarefaction, transpose it - # back to the way it was in the original physeq object. - if(!taxa_are_rows(physeq)){newsub <- t(newsub)} - return(newsub) + # If the OTU table was transposed before rarefaction, transpose it back to the + # way it was in the original physeq object. + if (!taxa_are_rows(physeq)) { + newsub <- t(newsub) + } + return(newsub) } -################################################################################ -# rarefaction subsample function, one sample -################################################################################ +################################################################################ rarefaction subsample function, one sample #' @keywords internal -rarefaction_subsample <- function(x, sample.size, replace=FALSE){ - # This is a test - # x = sample(10, 10) - # x = 1:10 - # sample.size = 50 - #system.time(obsvec <- foreach(OTUi=1:length(x), times=x, .combine=c) %do% {rep(OTUi, times)}) - # data("GlobalPatterns") - # sample.size = sample_sums(GlobalPatterns)[which.min(sample_sums(GlobalPatterns))] - # x = get_taxa(GlobalPatterns, which.max(sample_sums(GlobalPatterns))) - # Create replacement species vector - rarvec <- numeric(length(x)) - # Perform the sub-sampling. Suppress warnings due to old R compat issue. - # Also, make sure to avoid errors from x summing to zero, - # and there are no observations to sample. - # The initialization of rarvec above is already sufficient. - if(sum(x) <= 0){ - # Protect against, and quickly return an empty vector, - # if x is already an empty count vector - return(rarvec) - } - if(replace){ - # resample with replacement - suppressWarnings(subsample <- sample(1:length(x), sample.size, replace=TRUE, prob=x)) - } else { - # resample without replacement - obsvec <- apply(data.frame(OTUi=1:length(x), times=x), 1, function(x){ - rep_len(x["OTUi"], x["times"]) - }) - obsvec <- unlist(obsvec, use.names=FALSE) - # use `sample` for subsampling. Hope that obsvec doesn't overflow. - suppressWarnings(subsample <- sample(obsvec, sample.size, replace=FALSE)) - } - # Tabulate the results (these are already named by the order in `x`) - sstab <- table(subsample) - # Assign the tabulated random subsample values to the species vector - rarvec[as(names(sstab), "integer")] <- sstab - # Return abundance vector. Let replacement happen elsewhere. - return(rarvec) +rarefaction_subsample <- function(x, sample.size, replace = FALSE) { + # This is a test x = sample(10, 10) x = 1:10 sample.size = 50 system.time(obsvec + # <- foreach(OTUi=1:length(x), times=x, .combine=c) %do% {rep(OTUi, times)}) + # data('GlobalPatterns') sample.size = + # sample_sums(GlobalPatterns)[which.min(sample_sums(GlobalPatterns))] x = + # get_taxa(GlobalPatterns, which.max(sample_sums(GlobalPatterns))) Create + # replacement species vector + rarvec <- numeric(length(x)) + # Perform the sub-sampling. Suppress warnings due to old R compat issue. Also, + # make sure to avoid errors from x summing to zero, and there are no observations + # to sample. The initialization of rarvec above is already sufficient. + if (sum(x) <= 0) { + # Protect against, and quickly return an empty vector, if x is already an empty + # count vector + return(rarvec) + } + if (replace) { + # resample with replacement + suppressWarnings(subsample <- sample(1:length(x), sample.size, replace = TRUE, + prob = x)) + } else { + # resample without replacement + obsvec <- apply(data.frame(OTUi = 1:length(x), times = x), 1, function(x) { + rep_len(x["OTUi"], x["times"]) + }) + obsvec <- unlist(obsvec, use.names = FALSE) + # use `sample` for subsampling. Hope that obsvec doesn't overflow. + suppressWarnings(subsample <- sample(obsvec, sample.size, replace = FALSE)) + } + # Tabulate the results (these are already named by the order in `x`) + sstab <- table(subsample) + # Assign the tabulated random subsample values to the species vector + rarvec[as(names(sstab), "integer")] <- sstab + # Return abundance vector. Let replacement happen elsewhere. + return(rarvec) } -################################################################################ +################################################################################ #' Agglomerate closely-related taxa using single-linkage clustering. #' #' All tips of the tree separated by a cophenetic distance smaller than @@ -296,23 +288,24 @@ rarefaction_subsample <- function(x, sample.size, replace=FALSE){ #' @export #' #' @examples -#' data("esophagus") +#' data('esophagus') #' # for speed #' esophagus = prune_taxa(taxa_names(esophagus)[1:25], esophagus) -#' plot_tree(esophagus, label.tips="taxa_names", size="abundance", title="Before tip_glom()") -#' plot_tree(tip_glom(esophagus, h=0.2), label.tips="taxa_names", size="abundance", title="After tip_glom()") -tip_glom = function(physeq, h=0.2, hcfun=agnes, ...){ +#' plot_tree(esophagus, label.tips='taxa_names', size='abundance', title='Before tip_glom()') +#' plot_tree(tip_glom(esophagus, h=0.2), label.tips='taxa_names', size='abundance', title='After tip_glom()') +tip_glom = function(physeq, h = 0.2, hcfun = agnes, ...) { dd = as.dist(cophenetic.phylo(phy_tree(physeq))) - psclust = cutree(as.hclust(hcfun(dd, ...)), h=h) - cliques = levels(factor(psclust))[tapply(psclust, factor(psclust), function(x){length(x)>1})] + psclust = cutree(as.hclust(hcfun(dd, ...)), h = h) + cliques = levels(factor(psclust))[tapply(psclust, factor(psclust), function(x) { + length(x) > 1 + })] # For each clique, merge taxa in it... - for( i in cliques){ - physeq = merge_taxa(physeq, eqtaxa=names(psclust)[psclust == i]) + for (i in cliques) { + physeq = merge_taxa(physeq, eqtaxa = names(psclust)[psclust == i]) } return(physeq) } -################################################################################ -################################################################################ +################################################################################ #' Agglomerate taxa of the same type. #' #' This method merges species that have the same taxonomy at a certain @@ -325,7 +318,7 @@ tip_glom = function(physeq, h=0.2, hcfun=agnes, ...){ #' for agglomeration will be replaced with \code{NA}, #' because they should be meaningless following agglomeration. #' -#' @usage tax_glom(physeq, taxrank=rank_names(physeq)[1], NArm=TRUE, bad_empty=c(NA, "", " ", "\t")) +#' @usage tax_glom(physeq, taxrank=rank_names(physeq)[1], NArm=TRUE, bad_empty=c(NA, '', ' ', '\t')) #' #' @param physeq (Required). \code{\link{phyloseq-class}} or \code{\link{otu_table}}. #' @@ -347,7 +340,7 @@ tip_glom = function(physeq, h=0.2, hcfun=agnes, ...){ #' analysis, think about also trying the nomenclature-agnostic \code{\link{tip_glom}} #' method if you have a phylogenetic tree available. #' -#' @param bad_empty (Optional). Character vector. Default: \code{c(NA, "", " ", "\t")}. +#' @param bad_empty (Optional). Character vector. Default: \code{c(NA, '', ' ', '\t')}. #' Defines the bad/empty values #' that should be ignored and/or considered unknown. They will be removed #' from the internal agglomeration vector derived from the argument to \code{tax}, @@ -372,64 +365,65 @@ tip_glom = function(physeq, h=0.2, hcfun=agnes, ...){ #' # ## print the available taxonomic ranks #' # colnames(tax_table(GlobalPatterns)) #' # ## agglomerate at the Family taxonomic rank -#' # (x1 <- tax_glom(GlobalPatterns, taxrank="Family") ) +#' # (x1 <- tax_glom(GlobalPatterns, taxrank='Family') ) #' # ## How many taxa before/after agglomeration? #' # ntaxa(GlobalPatterns); ntaxa(x1) #' # ## Look at enterotype dataset... #' # data(enterotype) #' # ## print the available taxonomic ranks. Shows only 1 rank available, not useful for tax_glom #' # colnames(tax_table(enterotype)) -tax_glom <- function(physeq, taxrank=rank_names(physeq)[1], - NArm=TRUE, bad_empty=c(NA, "", " ", "\t")){ - - # Error if tax_table slot is empty - if( is.null(access(physeq, "tax_table")) ){ - stop("The tax_glom() function requires that physeq contain a taxonomyTable") - } - - # Error if bad taxrank - if( !taxrank[1] %in% rank_names(physeq) ){ - stop("Bad taxrank argument. Must be among the values of rank_names(physeq)") - } - - # Make a vector from the taxonomic data. - CN <- which( rank_names(physeq) %in% taxrank[1] ) - tax <- as(access(physeq, "tax_table"), "matrix")[, CN] - - # if NArm is TRUE, remove the empty, white-space, NA values from - if( NArm ){ - keep_species <- names(tax)[ !(tax %in% bad_empty) ] - physeq <- prune_taxa(keep_species, physeq) - } - - # Concatenate data up to the taxrank column, use this for agglomeration - tax <- as(access(physeq, "tax_table"), "matrix")[, 1:CN, drop=FALSE] - tax <- apply(tax, 1, function(i){paste(i, sep=";_;", collapse=";_;")}) - - # Remove NAs and useless from the vector/factor for looping. - # This does not remove the taxa that have an unknown (NA) - # taxonomic designation at this particular taxonomic rank. - tax <- tax[ !(tax %in% bad_empty) ] - - # Define the OTU cliques to loop through - spCliques <- tapply(names(tax), factor(tax), list) - - # Successively merge taxa in physeq. - for( i in names(spCliques)){ - physeq <- merge_taxa(physeq, spCliques[[i]]) - } - - # "Empty" the values to the right of the rank, using NA_character_. - if( CN < length(rank_names(physeq)) ){ - badcolumns <- (CN+1):length(rank_names(physeq)) - tax_table(physeq)[, badcolumns] <- NA_character_ - } - - # Return. - return(physeq) +tax_glom <- function(physeq, taxrank = rank_names(physeq)[1], NArm = TRUE, bad_empty = c(NA, + "", " ", "\t")) { + + # Error if tax_table slot is empty + if (is.null(access(physeq, "tax_table"))) { + stop("The tax_glom() function requires that physeq contain a taxonomyTable") + } + + # Error if bad taxrank + if (!taxrank[1] %in% rank_names(physeq)) { + stop("Bad taxrank argument. Must be among the values of rank_names(physeq)") + } + + # Make a vector from the taxonomic data. + CN <- which(rank_names(physeq) %in% taxrank[1]) + tax <- as(access(physeq, "tax_table"), "matrix")[, CN] + + # if NArm is TRUE, remove the empty, white-space, NA values from + if (NArm) { + keep_species <- names(tax)[!(tax %in% bad_empty)] + physeq <- prune_taxa(keep_species, physeq) + } + + # Concatenate data up to the taxrank column, use this for agglomeration + tax <- as(access(physeq, "tax_table"), "matrix")[, 1:CN, drop = FALSE] + tax <- apply(tax, 1, function(i) { + paste(i, sep = ";_;", collapse = ";_;") + }) + + # Remove NAs and useless from the vector/factor for looping. This does not + # remove the taxa that have an unknown (NA) taxonomic designation at this + # particular taxonomic rank. + tax <- tax[!(tax %in% bad_empty)] + + # Define the OTU cliques to loop through + spCliques <- tapply(names(tax), factor(tax), list) + + # Successively merge taxa in physeq. + for (i in names(spCliques)) { + physeq <- merge_taxa(physeq, spCliques[[i]]) + } + + # 'Empty' the values to the right of the rank, using NA_character_. + if (CN < length(rank_names(physeq))) { + badcolumns <- (CN + 1):length(rank_names(physeq)) + tax_table(physeq)[, badcolumns] <- NA_character_ + } + + # Return. + return(physeq) } -################################################################################ -################################################################################ +################################################################################ #' Prune unwanted OTUs / taxa from a phylogenetic object. #' #' An S4 Generic method for removing (pruning) unwanted OTUs/taxa from phylogenetic @@ -464,115 +458,113 @@ tax_glom <- function(physeq, taxrank=rank_names(physeq)[1], #' @rdname prune_taxa-methods #' @export #' @examples -#' data("esophagus") +#' data('esophagus') #' esophagus -#' plot(sort(taxa_sums(esophagus), TRUE), type="h", ylim=c(0, 50)) +#' plot(sort(taxa_sums(esophagus), TRUE), type='h', ylim=c(0, 50)) #' x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus) #' x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus) #' identical(x1, x2) setGeneric("prune_taxa", function(taxa, x) standardGeneric("prune_taxa")) #' @aliases prune_taxa,NULL,ANY-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("NULL", "ANY"), function(taxa, x){ - return(x) +setMethod("prune_taxa", signature("NULL", "ANY"), function(taxa, x) { + return(x) }) -# Any prune_taxa call w/ signature starting with a logical -# converts the logical to a character vector, and then dispatches -# to more specific method. +# Any prune_taxa call w/ signature starting with a logical converts the logical +# to a character vector, and then dispatches to more specific method. #' @aliases prune_taxa,logical,ANY-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("logical", "ANY"), function(taxa, x){ - # Check that logical has same length as ntaxa, stop if not. - if( !identical(length(taxa), ntaxa(x)) ){ - stop("logical argument to taxa is wrong length. Should equal ntaxa(x)") - } else { - # Pass on to names-based prune_taxa method - return( prune_taxa(taxa_names(x)[taxa], x) ) - } +setMethod("prune_taxa", signature("logical", "ANY"), function(taxa, x) { + # Check that logical has same length as ntaxa, stop if not. + if (!identical(length(taxa), ntaxa(x))) { + stop("logical argument to taxa is wrong length. Should equal ntaxa(x)") + } else { + # Pass on to names-based prune_taxa method + return(prune_taxa(taxa_names(x)[taxa], x)) + } }) #' @importFrom ape drop.tip #' @aliases prune_taxa,character,phylo-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "phylo"), function(taxa, x){ - if( length(taxa) <= 1 ){ - # Can't have a tree with 1 or fewer tips - warning("prune_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.") - return(NULL) - } else if( setequal(taxa, taxa_names(x)) ){ - return(x) - } else { - return( drop.tip(x, setdiff(taxa_names(x), taxa)) ) - } +setMethod("prune_taxa", signature("character", "phylo"), function(taxa, x) { + if (length(taxa) <= 1) { + # Can't have a tree with 1 or fewer tips + warning("prune_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.") + return(NULL) + } else if (setequal(taxa, taxa_names(x))) { + return(x) + } else { + return(drop.tip(x, setdiff(taxa_names(x), taxa))) + } }) #' @aliases prune_taxa,character,otu_table-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "otu_table"), function(taxa, x){ - if( setequal(taxa, taxa_names(x)) ){ - return(x) - } else { - taxa = intersect( taxa, taxa_names(x) ) - if( taxa_are_rows(x) ){ - return(x[taxa, , drop=FALSE]) - } else { - return(x[, taxa, drop=FALSE]) - } - } +setMethod("prune_taxa", signature("character", "otu_table"), function(taxa, x) { + if (setequal(taxa, taxa_names(x))) { + return(x) + } else { + taxa = intersect(taxa, taxa_names(x)) + if (taxa_are_rows(x)) { + return(x[taxa, , drop = FALSE]) + } else { + return(x[, taxa, drop = FALSE]) + } + } }) #' @aliases prune_taxa,character,sample_data-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "sample_data"), function(taxa, x){ - return(x) +setMethod("prune_taxa", signature("character", "sample_data"), function(taxa, x) { + return(x) }) #' @aliases prune_taxa,character,phyloseq-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "phyloseq"), function(taxa, x){ - # Re-define `taxa` as the intersection of OTU names for each component AND `taxa` - taxa = intersect(intersect_taxa(x), taxa) - # Now prune them all. - # All phyloseq objects have an otu_table slot, no need to test for existence. - x@otu_table = prune_taxa(taxa, otu_table(x)) - # Test if slot is present. If so, perform the component prune. - if( !is.null(x@tax_table) ){ - x@tax_table = prune_taxa(taxa, tax_table(x)) - } - if( !is.null(x@phy_tree) ){ - x@phy_tree = prune_taxa(taxa, phy_tree(x)) - } - if( !is.null(x@refseq) ){ - x@refseq = prune_taxa(taxa, refseq(x)) - } - # Force index order after pruning to be the same, - # according to the same rules as in the constructor, phyloseq() - x = index_reorder(x, index_type="taxa") - return(x) +setMethod("prune_taxa", signature("character", "phyloseq"), function(taxa, x) { + # Re-define `taxa` as the intersection of OTU names for each component AND `taxa` + taxa = intersect(intersect_taxa(x), taxa) + # Now prune them all. All phyloseq objects have an otu_table slot, no need to + # test for existence. + x@otu_table = prune_taxa(taxa, otu_table(x)) + # Test if slot is present. If so, perform the component prune. + if (!is.null(x@tax_table)) { + x@tax_table = prune_taxa(taxa, tax_table(x)) + } + if (!is.null(x@phy_tree)) { + x@phy_tree = prune_taxa(taxa, phy_tree(x)) + } + if (!is.null(x@refseq)) { + x@refseq = prune_taxa(taxa, refseq(x)) + } + # Force index order after pruning to be the same, according to the same rules as + # in the constructor, phyloseq() + x = index_reorder(x, index_type = "taxa") + return(x) }) #' @aliases prune_taxa,character,taxonomyTable-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "taxonomyTable"), function(taxa, x){ - if( setequal(taxa, taxa_names(x)) ){ - return(x) - } else { - taxa = intersect( taxa, taxa_names(x) ) - return( x[taxa, , drop=FALSE] ) - } +setMethod("prune_taxa", signature("character", "taxonomyTable"), function(taxa, x) { + if (setequal(taxa, taxa_names(x))) { + return(x) + } else { + taxa = intersect(taxa, taxa_names(x)) + return(x[taxa, , drop = FALSE]) + } }) #' @importClassesFrom Biostrings XStringSet #' @aliases prune_taxa,character,XStringSet-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x){ - if( setequal(taxa, taxa_names(x)) ){ - # Nothing to do, return x as-is. - return(x) - } else if( length(intersect(taxa, taxa_names(x))) == 0 ){ - # Informative error if intersection is zero. - stop("prune_taxa,XStringSet: taxa and taxa_names(x) do not overlap.") - } else { - # Pop the OTUs that are not in `taxa`, without reordering. - return(x[-which(!taxa_names(x) %in% taxa)]) - } +setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x) { + if (setequal(taxa, taxa_names(x))) { + # Nothing to do, return x as-is. + return(x) + } else if (length(intersect(taxa, taxa_names(x))) == 0) { + # Informative error if intersection is zero. + stop("prune_taxa,XStringSet: taxa and taxa_names(x) do not overlap.") + } else { + # Pop the OTUs that are not in `taxa`, without reordering. + return(x[-which(!taxa_names(x) %in% taxa)]) + } }) -################################################################################ -################################################################################ +################################################################################ #' Define a subset of samples to keep in a phyloseq object. #' #' An S4 Generic method for pruning/filtering unwanted samples @@ -599,67 +591,72 @@ setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x){ #' @examples #' data(GlobalPatterns) #' # Subset to just the Chlamydiae phylum. -#' GP.chl <- subset_taxa(GlobalPatterns, Phylum=="Chlamydiae") +#' GP.chl <- subset_taxa(GlobalPatterns, Phylum=='Chlamydiae') #' # Remove the samples that have less than 20 total reads from Chlamydiae #' GP.chl <- prune_samples(sample_sums(GP.chl)>=20, GP.chl) -#' # (p <- plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="abundance")) +#' # (p <- plot_tree(GP.chl, color='SampleType', shape='Family', label.tips='Genus', size='abundance')) setGeneric("prune_samples", function(samples, x) standardGeneric("prune_samples")) #' @aliases prune_samples,character,otu_table-method #' @rdname prune_samples-methods -setMethod("prune_samples", signature("character", "otu_table"), function(samples, x){ - if( setequal(samples, sample_names(x)) ){ - # If the sets of `samples` and sample_names are the same, return as-is. - return(x) - } else { - samples = intersect(samples, sample_names(x)) - if( taxa_are_rows(x) ){ - return( x[, samples] ) - } else { - return( x[samples, ] ) - } - } +setMethod("prune_samples", signature("character", "otu_table"), function(samples, + x) { + if (setequal(samples, sample_names(x))) { + # If the sets of `samples` and sample_names are the same, return as-is. + return(x) + } else { + samples = intersect(samples, sample_names(x)) + if (taxa_are_rows(x)) { + return(x[, samples]) + } else { + return(x[samples, ]) + } + } }) #' @aliases prune_samples,character,sample_data-method #' @rdname prune_samples-methods -setMethod("prune_samples", signature("character", "sample_data"), function(samples, x){ - if( setequal(samples, sample_names(x)) ){ - # If the sets of `samples` and sample_names are the same, return as-is. - return(x) - } else { - samples = intersect(samples, sample_names(x)) - return(x[samples, ]) - } +setMethod("prune_samples", signature("character", "sample_data"), function(samples, + x) { + if (setequal(samples, sample_names(x))) { + # If the sets of `samples` and sample_names are the same, return as-is. + return(x) + } else { + samples = intersect(samples, sample_names(x)) + return(x[samples, ]) + } }) #' @aliases prune_samples,character,phyloseq-method #' @rdname prune_samples-methods -setMethod("prune_samples", signature("character", "phyloseq"), function(samples, x){ - # Re-define `samples` as the intersection of samples names for each component AND `samples` - samples = intersect(intersect_samples(x), samples) - # Now prune each component. - # All phyloseq objects have an otu_table slot, no need to test for existence. - x@otu_table = prune_samples(samples, otu_table(x)) - if( !is.null(x@sam_data) ){ - # protect missing sample_data component. Don't need to prune if empty - x@sam_data = prune_samples(samples, sample_data(x)) - } - # Force sample index order after pruning to be the same, - # according to the same rules as in the constructor, phyloseq() - x = index_reorder(x, index_type="samples") - return(x) +setMethod("prune_samples", signature("character", "phyloseq"), function(samples, + x) { + # Re-define `samples` as the intersection of samples names for each component AND + # `samples` + samples = intersect(intersect_samples(x), samples) + # Now prune each component. All phyloseq objects have an otu_table slot, no need + # to test for existence. + x@otu_table = prune_samples(samples, otu_table(x)) + if (!is.null(x@sam_data)) { + # protect missing sample_data component. Don't need to prune if empty + x@sam_data = prune_samples(samples, sample_data(x)) + } + # Force sample index order after pruning to be the same, according to the same + # rules as in the constructor, phyloseq() + x = index_reorder(x, index_type = "samples") + return(x) }) -# A logical should specify the samples to keep, or not. Have same length as nsamples(x) +# A logical should specify the samples to keep, or not. Have same length as +# nsamples(x) #' @aliases prune_samples,logical,ANY-method #' @rdname prune_samples-methods -setMethod("prune_samples", signature("logical", "ANY"), function(samples, x){ - # Check that logical has same length as nsamples, stop if not. - if( !identical(length(samples), nsamples(x)) ){ - stop("logical argument to samples is wrong length. Should equal nsamples(x)") - } else { - # Pass on to names-based prune_samples method - return( prune_samples(sample_names(x)[samples], x) ) - } +setMethod("prune_samples", signature("logical", "ANY"), function(samples, x) { + # Check that logical has same length as nsamples, stop if not. + if (!identical(length(samples), nsamples(x))) { + stop("logical argument to samples is wrong length. Should equal nsamples(x)") + } else { + # Pass on to names-based prune_samples method + return(prune_samples(sample_names(x)[samples], x)) + } }) -################################################################################ +################################################################################ #' Thresholded rank transformation. #' #' The lowest \code{thresh} values in \code{x} all get the value 'thresh'. @@ -689,15 +686,19 @@ setMethod("prune_samples", signature("logical", "ANY"), function(samples, x){ #' identical(x1, x2) #' (x3 <- otu_table(apply(otu_table(GP), 2, threshrank, thresh=500), taxa_are_rows(GP)) ) #' identical(x1, x3) -threshrank <- function(x, thresh, keep0s=FALSE, ...){ - if( keep0s ){ index0 <- which(x == 0) } - x <- rank(x, ...) - thresh <- thresh[1] - x[x= A}, A) +setMethod("genefilter_sample", signature("matrix"), function(X, flist, A = 1) { + TFmat = apply(X, 2, flist) + apply(TFmat, 1, function(x, A) { + sum(x) >= A + }, A) }) #' @rdname genefilter_sample-methods #' @aliases genefilter_sample,otu_table-method -setMethod("genefilter_sample", signature("otu_table"), function(X, flist, A=1){ - if( taxa_are_rows(X) ){ - genefilter_sample( as(X, "matrix"), flist, A) - } else { - genefilter_sample( t(as(X, "matrix")), flist, A) - } +setMethod("genefilter_sample", signature("otu_table"), function(X, flist, A = 1) { + if (taxa_are_rows(X)) { + genefilter_sample(as(X, "matrix"), flist, A) + } else { + genefilter_sample(t(as(X, "matrix")), flist, A) + } }) #' @rdname genefilter_sample-methods #' @aliases genefilter_sample,phyloseq-method -setMethod("genefilter_sample", signature("phyloseq"), function(X, flist, A=1){ - genefilter_sample(otu_table(X), flist, A) +setMethod("genefilter_sample", signature("phyloseq"), function(X, flist, A = 1) { + genefilter_sample(otu_table(X), flist, A) }) -################################################################################ +################################################################################ #' A sample-wise filter function builder #' analogous to \code{\link[genefilter]{filterfun}}. #' @@ -947,23 +950,25 @@ setMethod("genefilter_sample", signature("phyloseq"), function(X, flist, A=1){ #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -filterfun_sample = function(...){ - flist <- list(...) - if( length(flist) == 1 && is.list(flist[[1]])) { flist <- flist[[1]] } - f = function(x){ - # initialize fval (a logical vector) - fun = flist[[1]] - fval = fun(x) - # check the remaining functions. Compare & logic, element-wise, each loop. - for(fun in flist[-1]){ - fval = fval & fun(x) - } - return(fval) - } - class(f) <- "filterfun" - return(f) +filterfun_sample = function(...) { + flist <- list(...) + if (length(flist) == 1 && is.list(flist[[1]])) { + flist <- flist[[1]] + } + f = function(x) { + # initialize fval (a logical vector) + fun = flist[[1]] + fval = fun(x) + # check the remaining functions. Compare & logic, element-wise, each loop. + for (fun in flist[-1]) { + fval = fval & fun(x) + } + return(fval) + } + class(f) <- "filterfun" + return(f) } -################################################################################ +################################################################################ #' Filter taxa based on across-sample OTU abundance criteria #' #' This function is directly analogous to the @@ -975,7 +980,7 @@ filterfun_sample = function(...){ #' It takes as input a phyloseq object, #' and returns a logical vector #' indicating whether or not each OTU passed the criteria. -#' Alternatively, if the \code{"prune"} option is set to \code{FALSE}, +#' Alternatively, if the \code{'prune'} option is set to \code{FALSE}, #' it returns the already-trimmed version of the phyloseq object. #' #' @usage filter_taxa(physeq, flist, prune=FALSE) @@ -1003,37 +1008,37 @@ filterfun_sample = function(...){ #' \code{\link{filterfun_sample}} #' #' @examples -#' data("enterotype") -#' require("genefilter") +#' data('enterotype') +#' require('genefilter') #' flist <- filterfun(kOverA(5, 2e-05)) #' ent.logi <- filter_taxa(enterotype, flist) #' ent.trim <- filter_taxa(enterotype, flist, TRUE) #' identical(ent.trim, prune_taxa(ent.logi, enterotype)) #' identical(sum(ent.logi), ntaxa(ent.trim)) #' filter_taxa(enterotype, flist, TRUE) -filter_taxa <- function(physeq, flist, prune=FALSE){ - # access OTU table - OTU <- access(physeq, "otu_table", TRUE) - # Enforce orientation (we are filtering taxa, not samples) - if(!taxa_are_rows(OTU)) { - OTU <- t(OTU) - } - # Coerce to vanilla matrix - OTU <- as(OTU, "matrix") - # Apply filtering function(s), get logical of length ntaxa(physeq) - ans <- apply(OTU, 1, flist) - # sanity check - if( ntaxa(physeq) != length(ans) ){ - stop("Logic error in applying function(s). Logical result not same length as ntaxa(physeq)") - } - # Now return logical or pruned phyloseq-class instance. - if( prune ){ - return( prune_taxa(ans, physeq) ) - } else { - return( ans ) - } +filter_taxa <- function(physeq, flist, prune = FALSE) { + # access OTU table + OTU <- access(physeq, "otu_table", TRUE) + # Enforce orientation (we are filtering taxa, not samples) + if (!taxa_are_rows(OTU)) { + OTU <- t(OTU) + } + # Coerce to vanilla matrix + OTU <- as(OTU, "matrix") + # Apply filtering function(s), get logical of length ntaxa(physeq) + ans <- apply(OTU, 1, flist) + # sanity check + if (ntaxa(physeq) != length(ans)) { + stop("Logic error in applying function(s). Logical result not same length as ntaxa(physeq)") + } + # Now return logical or pruned phyloseq-class instance. + if (prune) { + return(prune_taxa(ans, physeq)) + } else { + return(ans) + } } -################################################################################ +################################################################################ #' Make filter fun. the most abundant \code{k} taxa #' #' @usage topk(k, na.rm=TRUE) @@ -1059,13 +1064,15 @@ filter_taxa <- function(physeq, flist, prune=FALSE){ #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -topk = function(k, na.rm=TRUE){ - function(x){ - if(na.rm){x = x[!is.na(x)]} - x >= sort(x, decreasing=TRUE)[k] +topk = function(k, na.rm = TRUE) { + function(x) { + if (na.rm) { + x = x[!is.na(x)] } + x >= sort(x, decreasing = TRUE)[k] + } } -############################################################ +############################################################ #' Make filter fun. that returns the most abundant \code{p} fraction of taxa #' #' @usage topp(p, na.rm=TRUE) @@ -1093,13 +1100,15 @@ topk = function(k, na.rm=TRUE){ #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -topp <- function(p, na.rm=TRUE){ - function(x){ - if(na.rm){x = x[!is.na(x)]} - x >= sort(x, decreasing=TRUE)[ceiling(length(x)*p)] +topp <- function(p, na.rm = TRUE) { + function(x) { + if (na.rm) { + x = x[!is.na(x)] } + x >= sort(x, decreasing = TRUE)[ceiling(length(x) * p)] + } } -################################################################################ +################################################################################ #' Make filter fun. that returns the top f fraction of taxa in a sample. #' #' As opposed to \code{\link{topp}}, which gives the @@ -1122,7 +1131,7 @@ topp <- function(p, na.rm=TRUE){ #' @export #' #' @examples -#' t1 <- 1:10; names(t1)<-paste("t", 1:10, sep="") +#' t1 <- 1:10; names(t1)<-paste('t', 1:10, sep='') #' topf(0.6)(t1) #' ## Use simulated abundance matrix #' set.seed(711) @@ -1132,17 +1141,17 @@ topp <- function(p, na.rm=TRUE){ #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -topf <- function(f, na.rm=TRUE){ - function(x){ - if (na.rm){ - x = x[!is.na(x)] - } - y <- sort(x, decreasing = TRUE) - y <- cumsum(y)/sum(x) - return( (y <= f)[names(x)] ) +topf <- function(f, na.rm = TRUE) { + function(x) { + if (na.rm) { + x = x[!is.na(x)] } + y <- sort(x, decreasing = TRUE) + y <- cumsum(y)/sum(x) + return((y <= f)[names(x)]) + } } -################################################################################ +################################################################################ #' Set to FALSE any outlier species greater than f fractional abundance. #' #' This is for removing overly-abundant outlier taxa, not for trimming low-abundance @@ -1163,7 +1172,7 @@ topf <- function(f, na.rm=TRUE){ #' #' @export #' @examples -#' t1 <- 1:10; names(t1)<-paste("t", 1:10, sep="") +#' t1 <- 1:10; names(t1)<-paste('t', 1:10, sep='') #' rm_outlierf(0.15)(t1) #' ## Use simulated abundance matrix #' set.seed(711) @@ -1174,13 +1183,13 @@ topf <- function(f, na.rm=TRUE){ #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -rm_outlierf <- function(f, na.rm=TRUE){ - function(x){ - if(na.rm){ - x = x[!is.na(x)] - } - y <- x / sum(x) - return( y < f ) +rm_outlierf <- function(f, na.rm = TRUE) { + function(x) { + if (na.rm) { + x = x[!is.na(x)] } + y <- x/sum(x) + return(y < f) + } } -################################################################################ +################################################################################ diff --git a/R/validity-methods.R b/R/validity-methods.R index a29bdc90..26dbb12b 100644 --- a/R/validity-methods.R +++ b/R/validity-methods.R @@ -1,117 +1,89 @@ -################################################################################ -# Validity methods: -# -# These are delicate, because they are effectively at the S4 infrastructure -# level, in between "new" and the constructor. Some of the issues that might -# otherwise go here for a check are handled by the constructors. In many -# cases it desirable to let the constructor handle this, because it allows -# greater flexibility and transparency. These tests should be limited to -# conditions that are not fixed automatically by the constructors, and/or -# could not be because the deficiency/error is too fundamental. By design, -# we expect the validity errors to cause a fault before (nearly) any action -# by the constructor. -# -# This is a special case where the accessors are not-used, in favor of the -# S4 @tags. E.g. object@otu_table instead of otu_table(object). This is to avoid -# any complications with the accessors interacting with objects early on. -# Perhaps this is a mistake, but its a very limited case and won't be difficult -# to change. -# -# Also, for now these are not documented at all at the user-level, -# and are not expected to ever -# be at the "user-level", so formal documentation probably unnecessary. Lots -# of comments throughout this code will need to compensate. -################################################################################ -######################################## -# otu_table: -# # # * all values must be numeric (otu_table()-constructor should probably round values by default)) -# # # * all values must be >= 0 (no negative abundances) -######################################## -validotu_table <- function(object){ - # Both dimensions must have non-zero length. - if( any(dim(object)==0) ){ - return("\n OTU abundance data must have non-zero dimensions.") - } - # Verify that it is numeric matrix - if( !is.numeric(object@.Data[, 1]) ){ - text = "\n Non-numeric matrix provided as OTU table.\n" +################################################################################ Validity methods: These are delicate, because they are effectively at the S4 +################################################################################ infrastructure level, in between 'new' and the constructor. Some of the issues +################################################################################ that might otherwise go here for a check are handled by the constructors. In +################################################################################ many cases it desirable to let the constructor handle this, because it allows +################################################################################ greater flexibility and transparency. These tests should be limited to +################################################################################ conditions that are not fixed automatically by the constructors, and/or could +################################################################################ not be because the deficiency/error is too fundamental. By design, we expect +################################################################################ the validity errors to cause a fault before (nearly) any action by the +################################################################################ constructor. This is a special case where the accessors are not-used, in favor +################################################################################ of the S4 @tags. E.g. object@otu_table instead of otu_table(object). This is to +################################################################################ avoid any complications with the accessors interacting with objects early on. +################################################################################ Perhaps this is a mistake, but its a very limited case and won't be difficult +################################################################################ to change. Also, for now these are not documented at all at the user-level, +################################################################################ and are not expected to ever be at the 'user-level', so formal documentation +################################################################################ probably unnecessary. Lots of comments throughout this code will need to +################################################################################ compensate. otu_table: # # * all values must be numeric +################################################################################ (otu_table()-constructor should probably round values by default)) # # * all +################################################################################ values must be >= 0 (no negative abundances) +validotu_table <- function(object) { + # Both dimensions must have non-zero length. + if (any(dim(object) == 0)) { + return("\n OTU abundance data must have non-zero dimensions.") + } + # Verify that it is numeric matrix + if (!is.numeric(object@.Data[, 1])) { + text = "\n Non-numeric matrix provided as OTU table.\n" text = paste0(text, "Abundance is expected to be numeric.") - return(text) - } - return(TRUE) + return(text) + } + return(TRUE) } ## assign the function as the validity method for the otu_table class setValidity("otu_table", validotu_table) -######################################## -######################################## -# sample_data: -######################################## -validsample_data <- function(object){ - if( any(dim(object)==0) ){ - return("Sample Data must have non-zero dimensions.") - } - return(TRUE) +######################################## sample_data: +validsample_data <- function(object) { + if (any(dim(object) == 0)) { + return("Sample Data must have non-zero dimensions.") + } + return(TRUE) } ## assign the function as the validity method for the sample_data class setValidity("sample_data", validsample_data) -######################################## -######################################## -# taxonomyTable: -######################################## -# # # * all values must be a character -# # # * at least some non-NULL (or equiv) values -# taxonomyTable validity function -######################################## -validTaxonomyTable <- function(object){ - # Both dimensions must have non-zero length. - if( any(dim(object)==0) ){ - return("\n Taxonomy Table must have non-zero dimensions.") - } - # Verify that it is character matrix - if( !is.character(object@.Data[, 1]) ){ +######################################## taxonomyTable: # # * all values must be a character # # * at least some +######################################## non-NULL (or equiv) values taxonomyTable validity function +validTaxonomyTable <- function(object) { + # Both dimensions must have non-zero length. + if (any(dim(object) == 0)) { + return("\n Taxonomy Table must have non-zero dimensions.") + } + # Verify that it is character matrix + if (!is.character(object@.Data[, 1])) { text = "\n Non-character matrix provided as Taxonomy Table.\n" text = paste0(text, "Taxonomy is expected to be characters.") - return(text) - } - return(TRUE) + return(text) + } + return(TRUE) } ## assign the function as the validity method for the sample_data class setValidity("taxonomyTable", validTaxonomyTable) -######################################## -######################################## -# tree: -######################################## -# # (Any rules about trees appropriate in this context?) +######################################## tree: # (Any rules about trees appropriate in this context?) -######################################## -######################################## -# phyloseq-class: -######################################## -# Because data-index complete-matching is checked/enforced by the phyloseq() constructor, -# it should not be checked here, or the constructor will fail validity tests before -# it gets the chance to groom the objects. -# Instead, the validity test can check if there is any intersection of the species names -# and/or sample names, prior to any attempt by the constructor to prune (which would end) -# in a mysterious index error, anyway -######################################## -validphyloseq <- function(object){ - # There must be an otu_table - if( is.null(object@otu_table) ){ - return("\n An otu_table is required for most analysis / graphics in the phyloseq-package") - } - # intersection of species-names must have non-zero length - if( length(intersect_taxa(object)) <= 0 ){ - return(paste("\n Component taxa/OTU names do not match.\n", - " Taxa indices are critical to analysis.\n Try taxa_names()", sep="")) - } - # If there is sample data, check that sample-names overlap - if( !is.null(object@sam_data) ){ - if( length(intersect(sample_names(object@sam_data), sample_names(object@otu_table))) <= 0 ){ - return("\n Component sample names do not match.\n Try sample_names()") - } - } - return(TRUE) +######################################## phyloseq-class: Because data-index complete-matching is checked/enforced by the +######################################## phyloseq() constructor, it should not be checked here, or the constructor will +######################################## fail validity tests before it gets the chance to groom the objects. Instead, +######################################## the validity test can check if there is any intersection of the species names +######################################## and/or sample names, prior to any attempt by the constructor to prune (which +######################################## would end) in a mysterious index error, anyway +validphyloseq <- function(object) { + # There must be an otu_table + if (is.null(object@otu_table)) { + return("\n An otu_table is required for most analysis / graphics in the phyloseq-package") + } + # intersection of species-names must have non-zero length + if (length(intersect_taxa(object)) <= 0) { + return(paste("\n Component taxa/OTU names do not match.\n", " Taxa indices are critical to analysis.\n Try taxa_names()", + sep = "")) + } + # If there is sample data, check that sample-names overlap + if (!is.null(object@sam_data)) { + if (length(intersect(sample_names(object@sam_data), sample_names(object@otu_table))) <= + 0) { + return("\n Component sample names do not match.\n Try sample_names()") + } + } + return(TRUE) } ## assign the function as the validity method for the otu_table class setValidity("phyloseq", validphyloseq) -######################################## +######################################## From 202c274c55682de376848dd88be0fc5ca0faa68c Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Fri, 18 Sep 2015 11:51:27 +0200 Subject: [PATCH 03/11] Require the GUniFrac package --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 57316a1d..d437c71a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,9 +12,9 @@ Author: Paul J. McMurdie , contributions from Gregory Jordan and Scott Chamberlain License: AGPL-3 Imports: - BiocGenerics (>= 0.14.0), ade4 (>= 1.7.2), ape (>= 3.1.1), + BiocGenerics (>= 0.14.0), biom (>= 0.3.9), Biostrings (>= 2.28.0), cluster (>= 1.14.4), @@ -22,6 +22,7 @@ Imports: DESeq2 (>= 1.4.0), foreach (>= 1.4.2), ggplot2 (>= 1.0.0), + GUniFrac (>= 1.0), igraph (>= 0.7.0), methods (>= 3.1.0), multtest (>= 2.16.0), From 80aef4fd3f2100bf4998ed2aea7d8eaad85a7e3f Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Fri, 18 Sep 2015 12:07:48 +0200 Subject: [PATCH 04/11] Making room for gUniFrac method --- R/distance-methods.R | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/R/distance-methods.R b/R/distance-methods.R index a25aa0bc..483bc3e3 100644 --- a/R/distance-methods.R +++ b/R/distance-methods.R @@ -462,7 +462,7 @@ JSD <- function(physeq, parallel = FALSE) { #' # UniFrac(esophagus, TRUE) #' ################################################################################ setGeneric("UniFrac", function(physeq, weighted = FALSE, normalized = TRUE, parallel = FALSE, - fast = TRUE) { + fast = TRUE, generalized = FALSE, alpha = 0.5) { standardGeneric("UniFrac") }) ################################################################################ @@ -471,7 +471,7 @@ setGeneric("UniFrac", function(physeq, weighted = FALSE, normalized = TRUE, para #' @importFrom ape is.rooted #' @importFrom ape root setMethod("UniFrac", "phyloseq", function(physeq, weighted = FALSE, normalized = TRUE, - parallel = FALSE, fast = TRUE) { + parallel = FALSE, fast = TRUE, generalized = FALSE, alpha = 0.5) { if (is.null(phy_tree(physeq)$edge.length)) { stop("Tree has no branch lengths. See tree$edge.length. Cannot compute UniFrac without branch lengths") } @@ -485,11 +485,15 @@ setMethod("UniFrac", "phyloseq", function(physeq, weighted = FALSE, normalized = stop("Problem automatically rooting tree. Make sure your tree is rooted before attempting UniFrac calculation. See ?ape::root") } } - if (fast) { - fastUniFrac(physeq, weighted, normalized, parallel) + if (generalized) { + gUniFrac(physeq, alpha) } else { - warning("Option `fast=FALSE` is deprecated. Only 'fast' UniFrac is supported in phyloseq.") - fastUniFrac(physeq, weighted, normalized, parallel) + if (fast) { + fastUniFrac(physeq, weighted, normalized, parallel) + } else { + warning("Option `fast=FALSE` is deprecated. Only 'fast' UniFrac is supported in phyloseq.") + fastUniFrac(physeq, weighted, normalized, parallel) + } } }) ################################################################################ Fast UniFrac for R. Adapted from The ISME Journal (2010) 4, 17-27; @@ -622,4 +626,13 @@ fastUniFrac <- function(physeq, weighted = FALSE, normalized = TRUE, parallel = UniFracMat[matIndices] <- unlist(distlist) return(as.dist(UniFracMat)) } -################################################################################ + +################################################################################ Generalized UniFrac (provided by the GUniFrac R package) +################################################################################ Chen et al., Bioinformatics (2012) 28 (16): 2106-2113 +################################################################################ doi: 10.1093/bioinformatics/bts342 +################################################################################ http://bioinformatics.oxfordjournals.org/content/28/16/2106 +#' @importFrom GUniFrac GUniFrac +#' @keywords internal +gUniFrac <- function(physeq, alpha = 0.5) { + +} From 9de866eb45b161023e8508577942aaf097ed4f13 Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Fri, 18 Sep 2015 15:25:04 +0200 Subject: [PATCH 05/11] gunifrac distances added --- R/distance-methods.R | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/R/distance-methods.R b/R/distance-methods.R index 483bc3e3..5a09aefc 100644 --- a/R/distance-methods.R +++ b/R/distance-methods.R @@ -634,5 +634,18 @@ fastUniFrac <- function(physeq, weighted = FALSE, normalized = TRUE, parallel = #' @importFrom GUniFrac GUniFrac #' @keywords internal gUniFrac <- function(physeq, alpha = 0.5) { - + # The GUniFrac package is orphaned; it may be good to integrate it inside phyloseq eventually. + tree <- phy_tree(physeq) + OTU <- as.matrix(otu_table(physeq)) + if (taxa_are_rows(physeq)) { + OTU <- t(OTU) + } + unifracs <- GUniFrac::GUniFrac(OTU, tree=tree, alpha=alpha)$unifracs + #UniFracMat <- unifracs[, , "d_UW"] # Unweighted UniFrac + #UniFracMat <- unifracs[, , "d_0"] # GUniFrac with alpha 0 + #UniFracMat <- unifracs[, , "d_0.5"] # GUniFrac with alpha 0.5 + #UniFracMat <- unifracs[, , "d_1"] # Weighted UniFrac + #UniFracMat <- unifracs[, , "d_VAW"] # Variance adjusted weighted UniFrac + UniFracMat <- unifracs[, , paste("d", alpha, sep="_")] + return(as.dist(UniFracMat)) } From 4308d435cf48431bb33dfe3ae54b66247802cc87 Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Fri, 18 Sep 2015 15:35:40 +0200 Subject: [PATCH 06/11] Added link for 'gunifrac' method --- R/distance-methods.R | 61 ++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/R/distance-methods.R b/R/distance-methods.R index 5a09aefc..7ab2cd7e 100644 --- a/R/distance-methods.R +++ b/R/distance-methods.R @@ -23,7 +23,7 @@ #' \code{\link[vegan]{betadiver}}, #' \code{\link[vegan]{designdist}}, or #' \code{\link{dist}}. -#' +#' #' @param physeq (Required). A \code{\link{phyloseq-class}} or #' an \code{\link{otu_table-class}} object. The latter is only appropriate #' for methods that do not require any additional data (one-table). @@ -34,26 +34,28 @@ #' @param method (Optional). A character string. Default is \code{'unifrac'}. #' Provide one of the 45 currently supported options. #' To see a list of supported options, enter the following into the command line: -#' +#' #' \code{distance('list')} -#' +#' #' For further details and additional arguments, #' see the documentation for the supprting functions, linked below #' under ``See Also''. -#' +#' #' In particular, there are three methods included #' by the \code{\link{phyloseq-package}}, and accessed by the following #' \code{method} options: -#' +#' #' \code{'unifrac'}, for (unweighted) UniFrac distance, \code{\link{UniFrac}}; -#' +#' #' \code{'wunifrac'}, for weighted-UniFrac distance, \code{\link{UniFrac}}; #' +#' \code{'gunifrac'}, for generalized UniFrac distance, \code{\link{UniFrac}}; +#' #' \code{'dpcoa'}, sample-wise distance from Double Principle #' Coordinate Analysis, \code{\link{DPCoA}}; -#' +#' #' \code{'jsd'}, for Jensen-Shannon Divergence, \code{\link{JSD}}; -#' +#' #' and it is recommended that you see their documentation #' for details, references, background and examples for use. #' @@ -70,7 +72,7 @@ #' #' @return An object of class ``\code{\link{dist}}'' suitable for certain #' ordination methods and other distance-based analyses. -#' +#' #' @seealso #' \code{\link{plot_ordination}}, #' \code{\link{UniFrac}}, @@ -89,6 +91,7 @@ #' data(esophagus) #' distance(esophagus) # Unweighted UniFrac #' distance(esophagus, 'wunifrac') # weighted UniFrac +#' distance(esophagus, 'gunifrac') # generalized UniFrac #' distance(esophagus, 'jaccard') # vegdist jaccard #' distance(esophagus, 'gower') # vegdist option 'gower' #' distance(esophagus, 'g') # designdist method option 'g' @@ -103,9 +106,9 @@ distance <- function(physeq, method = "unifrac", type = "samples", ...) { stop("`distance` only accepts one method at a time. ", "You provided ", length(method), " methods. ") } - # # Can't do partial matching for all options, # because too many similar - # options. # Do partial matching for wunifrac/unifrac. # Determine if method - # argument matches any options exactly. # If not, call designdist + # Can't do partial matching for all options, because too many similar options. + # Do partial matching for unifrac/wunifrac/gunifrac. + # Determine if argument matches any options exactly. If not, call designdist. vegdist_methods <- c("manhattan", "euclidean", "canberra", "bray", "kulczynski", "jaccard", "gower", "altGower", "morisita", "horn", "mountford", "raup", "binomial", "chao", "cao") @@ -116,7 +119,7 @@ distance <- function(physeq, method = "unifrac", type = "samples", ...) { # The methods supported by vegan::betadiver function. betadiver_methods <- c("w", "-1", "c", "wb", "r", "I", "e", "t", "me", "j", "sor", "m", "-2", "co", "cc", "g", "-3", "l", "19", "hk", "rlb", "sim", "gl", "z") - method.list <- list(UniFrac = c("unifrac", "wunifrac"), DPCoA = "dpcoa", JSD = "jsd", + method.list <- list(UniFrac = c("unifrac", "wunifrac", "gunifrac"), DPCoA = "dpcoa", JSD = "jsd", vegdist = vegdist_methods, betadiver = betadiver_methods, dist = dist_methods, designdist = "ANY") # User support, and method options definition. @@ -133,9 +136,10 @@ distance <- function(physeq, method = "unifrac", type = "samples", ...) { return(c(method.list)) } } - # Regular Expression detect/convert unifrac/weighted-UniFrac args + # Regular Expression detect/convert unifrac/weighted/generalized-UniFrac args method <- gsub("^(u.*)*unifrac$", "unifrac", method, ignore.case = TRUE) method <- gsub("^w.*unifrac$", "wunifrac", method, ignore.case = TRUE) + method <- gsub("^g.*unifrac$", "gunifrac", method, ignore.case = TRUE) # Return distance, or define the function call to build/pass call if (method == "unifrac") { return(UniFrac(physeq, ...)) @@ -143,6 +147,9 @@ distance <- function(physeq, method = "unifrac", type = "samples", ...) { if (method == "wunifrac") { return(UniFrac(physeq, weighted = TRUE, ...)) } + if (method == "gunifrac") { + return(UniFrac(physeq, generalized = TRUE, ...)) + } if (method == "jsd") { return(JSD(physeq, ...)) } @@ -220,7 +227,7 @@ phyloseq_JSD_pair <- function(x, y) { #' the analysis of the \code{\link{enterotype}} dataset. #' #' @usage JSD(physeq, parallel=FALSE) -#' +#' #' @param physeq (Required). \code{\link{phyloseq-class}}. #' The phyloseq data on which to compute the #' pairwise sample distance matrix. @@ -234,12 +241,12 @@ phyloseq_JSD_pair <- function(x, y) { #' @return An object of class ``\code{\link{dist}}'' suitable for certain #' ordination methods and other distance-based analyses. #' See \code{\link{distance}}. -#' +#' #' @seealso #' \code{\link{distance}} -#' +#' #' \code{\link{enterotype}} -#' +#' #' \url{http://en.wikipedia.org/wiki/Jensen-Shannon_divergence} #' #' @references @@ -321,7 +328,7 @@ JSD <- function(physeq, parallel = FALSE) { #' helper packages. One of the simplest seems to be the \emph{doParallel} package. #' #' For more information, see the following links on registering the ``backend'': -#' +#' #' \emph{foreach} package manual: #' #' \url{http://cran.r-project.org/web/packages/foreach/index.html} @@ -329,7 +336,7 @@ JSD <- function(physeq, parallel = FALSE) { #' Notes on parallel computing in \code{R}. Skip to the section describing #' the \emph{foreach Framework}. It gives off-the-shelf examples for registering #' a parallel backend using the \emph{doMC}, \emph{doSNOW}, or \emph{doMPI} packages: -#' +#' #' \url{http://trg.apbionet.org/euasiagrid/docs/parallelR.notes.pdf} #' #' Furthermore, as of \code{R} version \code{2.14.0} and higher, a parallel package @@ -368,7 +375,7 @@ JSD <- function(physeq, parallel = FALSE) { #' has registered a parallel ``backend'' prior to calling this function. #' Default is \code{FALSE}. If FALSE, UniFrac will register a serial backend #' so that \code{foreach::\%dopar\%} does not throw a warning. -#' +#' #' @param fast (Optional). Logical. DEPRECATED. #' Do you want to use the ``Fast UniFrac'' #' algorithm? Implemented natively in the \code{phyloseq-package}. @@ -389,20 +396,20 @@ JSD <- function(physeq, parallel = FALSE) { #' in order to avoid causing unsupported-argument errors. #' #' @return a sample-by-sample distance matrix, suitable for NMDS, etc. -#' +#' #' @seealso -#' +#' #' \code{\link{distance}} -#' +#' #' \code{unifrac} in the picante package. #' #' @references -#' +#' #' \url{http://bmf.colorado.edu/unifrac/} #' #' The main implementation (Fast UniFrac) is adapted from the algorithm's #' description in: -#' +#' #' Hamady, Lozupone, and Knight, #' ``\href{http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html}{Fast UniFrac:} #' facilitating high-throughput phylogenetic analyses of @@ -420,7 +427,7 @@ JSD <- function(physeq, parallel = FALSE) { #' #' Lozupone C, Knight R. ``UniFrac: a new phylogenetic method for comparing microbial #' communities.'' Appl Environ Microbiol. 2005 71 (12):8228-35. -#' +#' #' @docType methods #' @export #' @import foreach From f68102cf73b08d28341ebfd00d08f24694c63d0c Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Fri, 18 Sep 2015 16:02:34 +0200 Subject: [PATCH 07/11] Added documentation for generalized UniFrac --- R/distance-methods.R | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/R/distance-methods.R b/R/distance-methods.R index 7ab2cd7e..0557d4b0 100644 --- a/R/distance-methods.R +++ b/R/distance-methods.R @@ -41,7 +41,7 @@ #' see the documentation for the supprting functions, linked below #' under ``See Also''. #' -#' In particular, there are three methods included +#' In particular, there are four methods included #' by the \code{\link{phyloseq-package}}, and accessed by the following #' \code{method} options: #' @@ -91,7 +91,7 @@ #' data(esophagus) #' distance(esophagus) # Unweighted UniFrac #' distance(esophagus, 'wunifrac') # weighted UniFrac -#' distance(esophagus, 'gunifrac') # generalized UniFrac +#' distance(esophagus, 'gunifrac', alpha=0.5) # generalized UniFrac #' distance(esophagus, 'jaccard') # vegdist jaccard #' distance(esophagus, 'gower') # vegdist option 'gower' #' distance(esophagus, 'g') # designdist method option 'g' @@ -303,7 +303,8 @@ JSD <- function(physeq, parallel = FALSE) { return(as.dist(DistMat)) } ############################################################################## -#' Calculate weighted or unweighted (Fast) UniFrac distance for all sample pairs. +#' Calculate weighted, unweighted or generalized UniFrac distance for all +#' all sample pairs. #' #' This function calculates the (Fast) UniFrac distance for all sample-pairs #' in a \code{\link{phyloseq-class}} object. @@ -358,19 +359,28 @@ JSD <- function(physeq, parallel = FALSE) { #' contingency table (\code{\link{otu_table-class}}). See #' examples below for coercions that might be necessary. #' -#' @param weighted (Optional). Logical. Should use weighted-UniFrac calculation? +#' @param weighted (Optional). Logical. Should weighted-UniFrac be calculated? #' Weighted-UniFrac takes into account the relative abundance of species/taxa #' shared between samples, whereas unweighted-UniFrac only considers #' presence/absence. Default is \code{FALSE}, meaning the unweighted-UniFrac #' distance is calculated for all pairs of samples. #' -#' @param normalized (Optional). Logical. Should the output be normalized such that values +#' @param normalized (Optional). Logical. Should the output be normalized such that values #' range from 0 to 1 independent of branch length values? Default is \code{TRUE}. -#' Note that (unweighted) \code{UniFrac} is always normalized by total branch-length, -#' and so this value is ignored when \code{weighted == FALSE}. +#' Note that (unweighted) UniFrac is always normalized by total branch-length, +#' and so this value is ignored when \code{weighted = FALSE}. #' -#' @param parallel (Optional). Logical. Should execute calculation in parallel, -#' using multiple CPU cores simultaneously? This can dramatically hasten the +#' @param generalized (Optional). Logical. Should weighted-UniFrac be calculated? +#' This uses the \code{GUniFrac} R package. Default is \code{FALSE}. +#' +#' @param alpha (Optional). The alpha value, i.e. a number between 0 and 1 used when +#' calculatin generalized UniFrac. Alpha limits the weight on abundant taxam so that +#' they do not dominate the distance. A value of 1 gives the same results as weighted +#' UniFrac. The default is \code{0.5}, which has the best power. +#' +#' @param parallel (Optional). Logical. Should calculation be executed in parallel, +#' using multiple CPU cores simultaneously? This is available for weighted and +#' unweighted UniFrac only (not generalized UniFrac) and can dramatically hasten the #' computation time for this function. However, it also requires that the user #' has registered a parallel ``backend'' prior to calling this function. #' Default is \code{FALSE}. If FALSE, UniFrac will register a serial backend @@ -403,6 +413,8 @@ JSD <- function(physeq, parallel = FALSE) { #' #' \code{unifrac} in the picante package. #' +#' \code{GUniFrac} in the GUniFrac package. +#' #' @references #' #' \url{http://bmf.colorado.edu/unifrac/} From 9e705139a7f04a82a705a2163d377c584c8b7f29 Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Fri, 18 Sep 2015 16:09:28 +0200 Subject: [PATCH 08/11] Updated the bibliography --- R/distance-methods.R | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/R/distance-methods.R b/R/distance-methods.R index 0557d4b0..60c4a6f8 100644 --- a/R/distance-methods.R +++ b/R/distance-methods.R @@ -426,19 +426,25 @@ JSD <- function(physeq, parallel = FALSE) { #' ``\href{http://www.nature.com/ismej/journal/v4/n1/full/ismej200997a.html}{Fast UniFrac:} #' facilitating high-throughput phylogenetic analyses of #' microbial communities including analysis of pyrosequencing and PhyloChip data.'' -#' The ISME Journal (2010) 4, 17--27. +#' The ISME Journal (2010) 4, 17:27. #' #' See also additional descriptions of UniFrac in the following articles: #' #' Lozupone, Hamady and Knight, ``UniFrac - An Online Tool for Comparing Microbial -#' Community Diversity in a Phylogenetic Context.'', BMC Bioinformatics 2006, 7:371 +#' Community Diversity in a Phylogenetic Context.'', BMC Bioinformatics (2006), 7:371. #' #' Lozupone, Hamady, Kelley and Knight, ``Quantitative and qualitative (beta) #' diversity measures lead to different insights into factors that structure -#' microbial communities.'' Appl Environ Microbiol. 2007 +#' microbial communities.'' Appl Environ Microbiol (2007). #' #' Lozupone C, Knight R. ``UniFrac: a new phylogenetic method for comparing microbial -#' communities.'' Appl Environ Microbiol. 2005 71 (12):8228-35. +#' communities.'' Appl Environ Microbiol (2005) 71 (12):8228-35. +#' +#' The generalized UniFrac algorithm is described in: +#' +#' Chen, Bittinger, Charlson, Hoffmann et al., ``Associating microbiome composition +#' with environmental covariates using generalized UniFrac distances'', Bioinformatics +#' (2012) 28(16):2106-2113. #' #' @docType methods #' @export From beb68b1346a0a1f780255370a6d4075fafc1e52b Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Fri, 18 Sep 2015 17:10:50 +0200 Subject: [PATCH 09/11] Tests for generalized UniFrac --- R/distance-methods.R | 5 +++-- inst/extdata/gp500-guf.csv | 28 ++++++++++++++++++++++++++++ tests/testthat/test-distance.R | 23 ++++++++++++++++++++--- 3 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 inst/extdata/gp500-guf.csv diff --git a/R/distance-methods.R b/R/distance-methods.R index 60c4a6f8..5a1c4bcf 100644 --- a/R/distance-methods.R +++ b/R/distance-methods.R @@ -304,7 +304,7 @@ JSD <- function(physeq, parallel = FALSE) { } ############################################################################## #' Calculate weighted, unweighted or generalized UniFrac distance for all -#' all sample pairs. +#' sample pairs. #' #' This function calculates the (Fast) UniFrac distance for all sample-pairs #' in a \code{\link{phyloseq-class}} object. @@ -368,7 +368,8 @@ JSD <- function(physeq, parallel = FALSE) { #' @param normalized (Optional). Logical. Should the output be normalized such that values #' range from 0 to 1 independent of branch length values? Default is \code{TRUE}. #' Note that (unweighted) UniFrac is always normalized by total branch-length, -#' and so this value is ignored when \code{weighted = FALSE}. +#' and so this value is ignored when \code{weighted = FALSE}. It is also ignored +#' for generalized UniFrac. #' #' @param generalized (Optional). Logical. Should weighted-UniFrac be calculated? #' This uses the \code{GUniFrac} R package. Default is \code{FALSE}. diff --git a/inst/extdata/gp500-guf.csv b/inst/extdata/gp500-guf.csv new file mode 100644 index 00000000..9450371f --- /dev/null +++ b/inst/extdata/gp500-guf.csv @@ -0,0 +1,28 @@ +0,0.3804616273,0.3830571906,0.6637519315,0.6865794815,0.725964,0.7433793482,0.7382029447,0.7921199577,0.7244134467,0.7542063389,0.7143153856,0.8001207988,0.6938876241,0.6429624482,0.827193471,0.702390246,0.7429821365,0.7105324391,0.742266685,0.7183935877,0.7240318484,0.7528080014,0.6652628585,0.7669332161,0.7709290303,0.8290004091,0.8308328491 +0.3804616273,0,0.1354369361,0.6680093081,0.6859142302,0.7017444999,0.7119574853,0.751396895,0.8021077185,0.7569202124,0.7697870797,0.7590261077,0.8212007081,0.7335821636,0.7074287749,0.8442963909,0.7089457792,0.7579638038,0.7373412266,0.7710496785,0.7129069754,0.7664634011,0.7551603337,0.7103151345,0.7448385018,0.7843076831,0.8453396979,0.8446441876 +0.3830571906,0.1354369361,0,0.6617540176,0.6851892774,0.702037767,0.7086920069,0.7587201535,0.8038332138,0.7451374122,0.7522178118,0.7249157024,0.820180356,0.7224858342,0.691660461,0.8438746067,0.6969024886,0.7444997796,0.7415708542,0.77576477,0.7129082655,0.7368096379,0.7339293072,0.7014970071,0.7424700806,0.7864117261,0.843818449,0.8434201531 +0.6637519315,0.6680093081,0.6617540176,0,0.4136550349,0.7838276366,0.8004589339,0.7785956259,0.8328527238,0.6755534353,0.7772855876,0.7704188514,0.8265549723,0.6662008109,0.7516023319,0.8485381661,0.7418284524,0.7670059381,0.7437683765,0.7649882472,0.7742785791,0.7694254959,0.5933785062,0.7188612901,0.7414192195,0.8057415857,0.8510026618,0.8482228905 +0.6865794815,0.6859142302,0.6851892774,0.4136550349,0,0.7696945895,0.7804856378,0.7713058489,0.8257372421,0.7099356889,0.7593428451,0.7548251369,0.8194947922,0.6638354159,0.7388867463,0.8209388714,0.7445404319,0.7472480446,0.7630128673,0.7655481977,0.7860071633,0.755279311,0.6435063957,0.7385017876,0.7465386677,0.7948041806,0.8288542198,0.8254160315 +0.725964,0.7017444999,0.702037767,0.7838276366,0.7696945895,0,0.1573022376,0.3095301214,0.6429478569,0.7231561733,0.7198054455,0.7597353509,0.6183786507,0.720377398,0.637074574,0.7026503122,0.6698454879,0.7137296189,0.738726782,0.7843535765,0.7117215353,0.770592863,0.8146199332,0.7745294052,0.789815629,0.7651564695,0.6802926249,0.6865714267 +0.7433793482,0.7119574853,0.7086920069,0.8004589339,0.7804856378,0.1573022376,0,0.2779354515,0.6274153175,0.7117116799,0.6944725681,0.7461089128,0.6045538739,0.7115302606,0.6403079628,0.6880147575,0.6505744101,0.6876431839,0.7410033471,0.7759045597,0.7046615489,0.7577564173,0.8216674402,0.7811247116,0.7955279864,0.779340985,0.6601244947,0.6676828694 +0.7382029447,0.751396895,0.7587201535,0.7785956259,0.7713058489,0.3095301214,0.2779354515,0,0.5717563863,0.6821694429,0.6926011347,0.7471547579,0.5350105209,0.6911197028,0.6081875431,0.6469492961,0.6544572316,0.6714302156,0.730829535,0.7648026699,0.7744636525,0.7565759466,0.8189371149,0.7655148239,0.7892056363,0.7710128248,0.6247172668,0.6304576629 +0.7921199577,0.8021077185,0.8038332138,0.8328527238,0.8257372421,0.6429478569,0.6274153175,0.5717563863,0,0.7691024324,0.7600778713,0.7690728884,0.2898139487,0.7717995534,0.7122651949,0.3697270666,0.6826629177,0.7635496543,0.7648984687,0.7778716384,0.7756183827,0.7700878788,0.8676902783,0.8321794888,0.850433521,0.7814672189,0.3455955822,0.358595636 +0.7244134467,0.7569202124,0.7451374122,0.6755534353,0.7099356889,0.7231561733,0.7117116799,0.6821694429,0.7691024324,0,0.5926597956,0.6780620617,0.7806320756,0.412584158,0.5364403326,0.8100200261,0.45800013,0.5889066438,0.7145654428,0.7431610068,0.7579125623,0.6811645979,0.7172681435,0.6715362733,0.7102364649,0.8027571111,0.8228917002,0.8223144274 +0.7542063389,0.7697870797,0.7522178118,0.7772855876,0.7593428451,0.7198054455,0.6944725681,0.6926011347,0.7600778713,0.5926597956,0,0.4065180617,0.7446551856,0.6081464217,0.4309417589,0.7453940586,0.4542887723,0.1199049269,0.6759943453,0.6646016928,0.7143835305,0.4197132435,0.7784071136,0.7654565208,0.7568112181,0.8382098108,0.7292623488,0.728376247 +0.7143153856,0.7590261077,0.7249157024,0.7704188514,0.7548251369,0.7597353509,0.7461089128,0.7471547579,0.7690728884,0.6780620617,0.4065180617,0,0.7531524655,0.6292985677,0.4522306984,0.7564648524,0.5855598324,0.4362499067,0.6701709247,0.662859777,0.7064434315,0.1345184261,0.7683717988,0.7702982923,0.7625235576,0.8413743502,0.7362789162,0.7303072472 +0.8001207988,0.8212007081,0.820180356,0.8265549723,0.8194947922,0.6183786507,0.6045538739,0.5350105209,0.2898139487,0.7806320756,0.7446551856,0.7531524655,0,0.7543639697,0.7330025471,0.3138145502,0.7408736614,0.7450612957,0.7605529635,0.7602300699,0.7865414805,0.7561578348,0.8605479112,0.8282424921,0.8448134177,0.7716152205,0.2578824695,0.3034827326 +0.6938876241,0.7335821636,0.7224858342,0.6662008109,0.6638354159,0.720377398,0.7115302606,0.6911197028,0.7717995534,0.412584158,0.6081464217,0.6292985677,0.7543639697,0,0.5607753847,0.7723588734,0.4933970587,0.5939177555,0.7163767372,0.7141092181,0.7509482545,0.6339218515,0.6722847527,0.6650366417,0.7191716363,0.7661102601,0.7976728771,0.7943088535 +0.6429624482,0.7074287749,0.691660461,0.7516023319,0.7388867463,0.637074574,0.6403079628,0.6081875431,0.7122651949,0.5364403326,0.4309417589,0.4522306984,0.7330025471,0.5607753847,0,0.7747247613,0.4479804482,0.4153087685,0.6380431993,0.6825066168,0.7195558551,0.4563110236,0.777106465,0.7015572586,0.7672721029,0.8072860821,0.787412764,0.7826235802 +0.827193471,0.8442963909,0.8438746067,0.8485381661,0.8209388714,0.7026503122,0.6880147575,0.6469492961,0.3697270666,0.8100200261,0.7453940586,0.7564648524,0.3138145502,0.7723588734,0.7747247613,0,0.7774826747,0.7441713547,0.7944286257,0.7799557911,0.8117047172,0.748057282,0.8584775889,0.8443660802,0.8498787046,0.7751085333,0.2168858346,0.2180008707 +0.702390246,0.7089457792,0.6969024886,0.7418284524,0.7445404319,0.6698454879,0.6505744101,0.6544572316,0.6826629177,0.45800013,0.4542887723,0.5855598324,0.7408736614,0.4933970587,0.4479804482,0.7774826747,0,0.4545031127,0.6926833468,0.7075122635,0.7040520475,0.5895115069,0.7755772421,0.7390955724,0.7776935793,0.821125134,0.7861907068,0.7815295191 +0.7429821365,0.7579638038,0.7444997796,0.7670059381,0.7472480446,0.7137296189,0.6876431839,0.6714302156,0.7635496543,0.5889066438,0.1199049269,0.4362499067,0.7450612957,0.5939177555,0.4153087685,0.7441713547,0.4545031127,0,0.6726302605,0.6492714184,0.7083771994,0.4394318115,0.7635721324,0.7534587803,0.7388305499,0.824403782,0.7276519336,0.7279817912 +0.7105324391,0.7373412266,0.7415708542,0.7437683765,0.7630128673,0.738726782,0.7410033471,0.730829535,0.7648984687,0.7145654428,0.6759943453,0.6701709247,0.7605529635,0.7163767372,0.6380431993,0.7944286257,0.6926833468,0.6726302605,0,0.2262404476,0.4897585628,0.6795990974,0.8100080694,0.7126868178,0.748059365,0.8249091352,0.7956090229,0.7884813748 +0.742266685,0.7710496785,0.77576477,0.7649882472,0.7655481977,0.7843535765,0.7759045597,0.7648026699,0.7778716384,0.7431610068,0.6646016928,0.662859777,0.7602300699,0.7141092181,0.6825066168,0.7799557911,0.7075122635,0.6492714184,0.2262404476,0,0.522212326,0.6584659414,0.8117333393,0.7341678101,0.7638887748,0.8413858648,0.7709486793,0.7706917412 +0.7183935877,0.7129069754,0.7129082655,0.7742785791,0.7860071633,0.7117215353,0.7046615489,0.7744636525,0.7756183827,0.7579125623,0.7143835305,0.7064434315,0.7865414805,0.7509482545,0.7195558551,0.8117047172,0.7040520475,0.7083771994,0.4897585628,0.522212326,0,0.7152280853,0.8222039746,0.7282743764,0.7788334318,0.8443468545,0.807691729,0.8044434467 +0.7240318484,0.7664634011,0.7368096379,0.7694254959,0.755279311,0.770592863,0.7577564173,0.7565759466,0.7700878788,0.6811645979,0.4197132435,0.1345184261,0.7561578348,0.6339218515,0.4563110236,0.748057282,0.5895115069,0.4394318115,0.6795990974,0.6584659414,0.7152280853,0,0.7604938393,0.7678337114,0.7668958014,0.8423660976,0.7283048348,0.7231652086 +0.7528080014,0.7551603337,0.7339293072,0.5933785062,0.6435063957,0.8146199332,0.8216674402,0.8189371149,0.8676902783,0.7172681435,0.7784071136,0.7683717988,0.8605479112,0.6722847527,0.777106465,0.8584775889,0.7755772421,0.7635721324,0.8100080694,0.8117333393,0.8222039746,0.7604938393,0,0.735902155,0.694139168,0.6785968339,0.8638113276,0.8606356411 +0.6652628585,0.7103151345,0.7014970071,0.7188612901,0.7385017876,0.7745294052,0.7811247116,0.7655148239,0.8321794888,0.6715362733,0.7654565208,0.7702982923,0.8282424921,0.6650366417,0.7015572586,0.8443660802,0.7390955724,0.7534587803,0.7126868178,0.7341678101,0.7282743764,0.7678337114,0.735902155,0,0.4973227313,0.5945122102,0.8529627699,0.8519008857 +0.7669332161,0.7448385018,0.7424700806,0.7414192195,0.7465386677,0.789815629,0.7955279864,0.7892056363,0.850433521,0.7102364649,0.7568112181,0.7625235576,0.8448134177,0.7191716363,0.7672721029,0.8498787046,0.7776935793,0.7388305499,0.748059365,0.7638887748,0.7788334318,0.7668958014,0.694139168,0.4973227313,0,0.4796612994,0.8461339128,0.8405945232 +0.7709290303,0.7843076831,0.7864117261,0.8057415857,0.7948041806,0.7651564695,0.779340985,0.7710128248,0.7814672189,0.8027571111,0.8382098108,0.8413743502,0.7716152205,0.7661102601,0.8072860821,0.7751085333,0.821125134,0.824403782,0.8249091352,0.8413858648,0.8443468545,0.8423660976,0.6785968339,0.5945122102,0.4796612994,0,0.7916880328,0.7884665954 +0.8290004091,0.8453396979,0.843818449,0.8510026618,0.8288542198,0.6802926249,0.6601244947,0.6247172668,0.3455955822,0.8228917002,0.7292623488,0.7362789162,0.2578824695,0.7976728771,0.787412764,0.2168858346,0.7861907068,0.7276519336,0.7956090229,0.7709486793,0.807691729,0.7283048348,0.8638113276,0.8529627699,0.8461339128,0.7916880328,0,0.1250947677 +0.8308328491,0.8446441876,0.8434201531,0.8482228905,0.8254160315,0.6865714267,0.6676828694,0.6304576629,0.358595636,0.8223144274,0.728376247,0.7303072472,0.3034827326,0.7943088535,0.7826235802,0.2180008707,0.7815295191,0.7279817912,0.7884813748,0.7706917412,0.8044434467,0.7231652086,0.8606356411,0.8519008857,0.8405945232,0.7884665954,0.1250947677,0 diff --git a/tests/testthat/test-distance.R b/tests/testthat/test-distance.R index 4fac0792..20b102b1 100644 --- a/tests/testthat/test-distance.R +++ b/tests/testthat/test-distance.R @@ -17,12 +17,17 @@ GP500 = import_qiime(GP500File, treefilename = treeFile) gp500_uuf = read.csv(system.file("extdata", "gp500-uuf.csv", package = "phyloseq"), header = FALSE, fill = TRUE) gp500_wuf = read.csv(system.file("extdata", "gp500-wuf.csv", package = "phyloseq"), header = FALSE, fill = TRUE) gp500_wufu = read.csv(system.file("extdata", "gp500-wufu.csv", package = "phyloseq"), header = FALSE, fill = TRUE) +gp500_guf = read.csv(system.file("extdata", "gp500-guf.csv", package = "phyloseq"), header = FALSE, fill = TRUE) # Add the sample names -colnames(gp500_uuf) <- rownames(gp500_uuf) <- colnames(gp500_wuf) <- rownames(gp500_wuf) <- colnames(gp500_wufu) <- rownames(gp500_wufu) <- sample_names(GP500) +colnames(gp500_guf) <- rownames(gp500_guf) <- sample_names(GP500) +colnames(gp500_uuf) <- rownames(gp500_uuf) <- sample_names(GP500) +colnames(gp500_wuf) <- rownames(gp500_wuf) <- sample_names(GP500) +colnames(gp500_wufu) <- rownames(gp500_wufu) <- sample_names(GP500) # Coerce to Distance Matrices for comparison `"dist"` class gp500_wufu <- as.dist(gp500_wufu) gp500_wuf <- as.dist(gp500_wuf) gp500_uuf <- as.dist(gp500_uuf) +gp500_guf <- as.dist(gp500_guf) # Define numerical tolerance tol = 0.00000001 test_that("UniFrac produces correct values on an example subset from Global Patterns. 'Correct' values are results from pyCogent", { @@ -33,13 +38,21 @@ test_that("UniFrac produces correct values on an example subset from Global Patt label = "`UniFrac`: Weighted, normalized UniFrac results did not match reference answer.") expect_equal(gp500_uuf, UniFrac(GP500, weighted = FALSE), check.attributes = FALSE, tolerance = tol, label = "`UniFrac`: Unweighted UniFrac results did not match reference answer.") + expect_equal(gp500_guf, UniFrac(GP500, generalized = TRUE, alpha = 0.5), check.attributes = FALSE, tolerance = tol, + label = "`UniFrac`: Generalized UniFrac results (a=0.5) did not match reference answer.") + expect_equal(gp500_wuf, UniFrac(GP500, generalized = TRUE, alpha = 1.0), check.attributes = FALSE, tolerance = tol, + label = "`UniFrac`: Generalized UniFrac results (a=1.0) did not match reference answer.") # Using the `distance` wrapper expect_equal(gp500_wufu, distance(GP500, "unifrac", weighted = TRUE, normalized = FALSE), check.attributes = FALSE, tolerance = tol, label = "`distance`: Weighted but Un-normalized UniFrac results did not match reference answer.") expect_equal(gp500_wuf, distance(GP500, "unifrac", weighted = TRUE), check.attributes = FALSE, tolerance = tol, label = "`distance`: Weighted, normalized UniFrac results did not match reference answer.") expect_equal(gp500_uuf, distance(GP500, "unifrac", weighted = FALSE), check.attributes = FALSE, tolerance = tol, - label = "`distance`: Unweighted UniFrac results did not match reference answer.") + label = "`distance`: Unweighted UniFrac results did not match reference answer.") + expect_equal(gp500_guf, distance(GP500, "unifrac", generalized = TRUE, alpha = 0.5), check.attributes = FALSE, tolerance = tol, + label = "`distance`: Generalized UniFrac results (a=0.5) did not match reference answer.") + expect_equal(gp500_wuf, distance(GP500, "unifrac", generalized = TRUE, alpha = 1.0), check.attributes = FALSE, tolerance = tol, + label = "`distance`: Generalized UniFrac results (a=1.0) did not match reference answer.") # Make sure reference files are different (at the very least) expect_false({isTRUE(all.equal(gp500_uuf, gp500_wuf, check.attributes = FALSE, tolerance = 0.01))}, label = "The reference matrices for UniFrac testing should be different, but were not. uuf/wuf") @@ -47,12 +60,16 @@ test_that("UniFrac produces correct values on an example subset from Global Patt label = "The reference matrices for UniFrac testing should be different, but were not. uuf/wufu") expect_identical(distance(GP500, "wunifrac"), distance(GP500, "unifrac", weighted = TRUE), label = "wunifrac output is not identical to unifrac with weighted=T flag") + expect_identical(distance(GP500, "gunifrac"), distance(GP500, "unifrac", generalized = TRUE), + label = "gunifrac output is not identical to unifrac with generalized=T flag") }) test_that("Check that regular-expression matching for unifrac method flag is working", { expect_identical(distance(GP500, "w-UniFrac"), distance(GP500, "unifrac", weighted = TRUE)) expect_identical(distance(GP500, "weighted-UniFrac"), distance(GP500, "unifrac", weighted = TRUE)) - expect_identical(distance(GP500, "unweighted-UniFrac"), distance(GP500, "unifrac")) expect_identical(distance(GP500, "u-UniFrac"), distance(GP500, "unifrac")) + expect_identical(distance(GP500, "unweighted-UniFrac"), distance(GP500, "unifrac")) + expect_identical(distance(GP500, "g-UniFrac"), distance(GP500, "unifrac", generalized = TRUE)) + expect_identical(distance(GP500, "generalized-UniFrac"), distance(GP500, "unifrac", generalized = TRUE)) }) ################################################################################ ################################################################################ From 49a964a19a9ee3318a5c4417257e2f77d2c071c0 Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Sat, 19 Sep 2015 15:32:29 +0200 Subject: [PATCH 10/11] Making GUniFrac an optional dependency --- DESCRIPTION | 6 +++--- R/distance-methods.R | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d437c71a..8ed21b55 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,7 +22,6 @@ Imports: DESeq2 (>= 1.4.0), foreach (>= 1.4.2), ggplot2 (>= 1.0.0), - GUniFrac (>= 1.0), igraph (>= 0.7.0), methods (>= 3.1.0), multtest (>= 2.16.0), @@ -34,8 +33,9 @@ Depends: R (>= 3.1.0) Suggests: genefilter (>= 1.42.0), - testthat (>= 0.8), - knitr (>= 1.3) + GUniFrac (>= 1.0), + knitr (>= 1.3), + testthat (>= 0.8) VignetteBuilder: knitr Enhances: doParallel (>= 1.0.1) biocViews: Sequencing, Microbiome, Metagenomics, diff --git a/R/distance-methods.R b/R/distance-methods.R index 5a1c4bcf..3d85bc6c 100644 --- a/R/distance-methods.R +++ b/R/distance-methods.R @@ -661,6 +661,10 @@ fastUniFrac <- function(physeq, weighted = FALSE, normalized = TRUE, parallel = #' @keywords internal gUniFrac <- function(physeq, alpha = 0.5) { # The GUniFrac package is orphaned; it may be good to integrate it inside phyloseq eventually. + if (!requireNamespace("GUniFrac", quietly = TRUE)) { + stop("R package 'GUniFrac' needed to calculate generalized UniFrac distances. Please install it.", + call. = FALSE) + } tree <- phy_tree(physeq) OTU <- as.matrix(otu_table(physeq)) if (taxa_are_rows(physeq)) { From eee8e93b7dd8018629322f3a757a9f10b1416518 Mon Sep 17 00:00:00 2001 From: Florent Angly Date: Sat, 19 Sep 2015 15:40:36 +0200 Subject: [PATCH 11/11] Reverting formatR cleanup for all files except for distance calculation --- R/IO-methods.R | 1741 ++++++++++++++-------------- R/allClasses.R | 80 +- R/allData.R | 39 +- R/allPackage.R | 4 +- R/almostAllAccessors.R | 257 ++--- R/as-methods.R | 48 +- R/assignment-methods.R | 203 ++-- R/deprecated_functions.R | 204 +--- R/extend_DESeq2.R | 20 +- R/extend_vegan.R | 279 ++--- R/extract-methods.R | 56 +- R/merge-methods.R | 714 ++++++------ R/multtest-wrapper.R | 170 +-- R/network-methods.R | 183 ++- R/ordination-methods.R | 432 +++---- R/otuTable-class.R | 96 +- R/phylo-class.R | 25 +- R/phyloseq-class.R | 416 +++---- R/plot-methods.R | 2087 +++++++++++++++++----------------- R/sampleData-class.R | 88 +- R/show-methods.R | 117 +- R/taxonomyTable-class.R | 81 +- R/transform_filter-methods.R | 881 +++++++------- R/validity-methods.R | 174 +-- 24 files changed, 4195 insertions(+), 4200 deletions(-) diff --git a/R/IO-methods.R b/R/IO-methods.R index 63a299ac..5a34a8f3 100644 --- a/R/IO-methods.R +++ b/R/IO-methods.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Universal import method (wrapper) for phyloseq-package #' #' A user must still understand the additional arguments required for each @@ -15,7 +15,7 @@ #' @param pipelineName (Required). Character string. The name of the #' analysis tool / pipeline / package #' that created the OTU-cluster data or other data that you now want to import. -#' Current options are \code{c('mothur', 'pyrotagger', 'QIIME', 'RDP')}, and +#' Current options are \code{c("mothur", "pyrotagger", "QIIME", "RDP")}, and #' only the first letter is necessary. #' #' @param ... (Required). Additional named arguments providing file paths, and possible @@ -66,32 +66,33 @@ #' @export #' @examples #' ## See documentation of a specific import function -import <- function(pipelineName, ...) { - # Reduce pipelineName to just its first letter, as all are different - pipelineName <- substr(pipelineName, 1, 1) - - # Test that it is in the set - if (!(pipelineName %in% c("B", "b", "M", "m", "P", "p", "Q", "q", "R", "r"))) { - stop("You need to select among available importer types:\n", "\"BIOM\", \"mothur\", \"pyrotagger\", \"QIIME\", \"RDP\" \n See ?import for details") - } - - if (pipelineName %in% c("B", "b")) { - return(import_biom(...)) - } - if (pipelineName %in% c("M", "m")) { - return(import_mothur(...)) - } - if (pipelineName %in% c("P", "p")) { - return(import_pyrotagger_tab(...)) - } - if (pipelineName %in% c("Q", "q")) { - return(import_qiime(...)) - } - if (pipelineName %in% c("R", "r")) { - return(import_RDP_cluster(...)) - } +import <- function(pipelineName, ...){ + # Reduce pipelineName to just its first letter, as all are different + pipelineName <- substr(pipelineName, 1, 1) + + # Test that it is in the set + if( !(pipelineName %in% c("B", "b", "M", "m", "P", "p", "Q", "q", "R", "r")) ){ + stop("You need to select among available importer types:\n", + "\"BIOM\", \"mothur\", \"pyrotagger\", \"QIIME\", \"RDP\" \n See ?import for details") + } + + if( pipelineName %in% c("B", "b") ){ + return( import_biom(...) ) + } + if( pipelineName %in% c("M", "m") ){ + return( import_mothur(...) ) + } + if( pipelineName %in% c("P", "p") ){ + return( import_pyrotagger_tab(...) ) + } + if( pipelineName %in% c("Q", "q") ){ + return( import_qiime(...) ) + } + if( pipelineName %in% c("R", "r") ){ + return( import_RDP_cluster(...) ) + } } -################################################################################ +################################################################################ #' Import function to read the now legacy-format QIIME OTU table. #' #' QIIME produces several files that can be directly imported by @@ -114,7 +115,7 @@ import <- function(pipelineName, ...) { #' phylogenetic tree with a tip for each OTU, which can also be imported #' specified here or imported separately using \code{\link{read_tree}}. #' -#' See \url{'http://www.qiime.org/'} for details on using QIIME. While there are +#' See \url{"http://www.qiime.org/"} for details on using QIIME. While there are #' many complex dependencies, QIIME can be downloaded as a pre-installed #' linux virtual machine that runs ``off the shelf''. #' @@ -193,7 +194,7 @@ import <- function(pipelineName, ...) { #' @param parseFunction (Optional). An optional custom function for parsing the #' character string that contains the taxonomic assignment of each OTU. #' The default parsing function is \code{\link{parse_taxonomy_qiime}}, -#' specialized for splitting the \code{';'}-delimited strings and also +#' specialized for splitting the \code{";"}-delimited strings and also #' attempting to interpret greengenes prefixes, if any, as that is a common #' format of the taxonomy string produced by QIIME. #' @@ -232,77 +233,76 @@ import <- function(pipelineName, ...) { #' @importFrom Biostrings readDNAStringSet #' @export #' @examples -#' otufile <- system.file('extdata', 'GP_otu_table_rand_short.txt.gz', package='phyloseq') -#' mapfile <- system.file('extdata', 'master_map.txt', package='phyloseq') -#' trefile <- system.file('extdata', 'GP_tree_rand_short.newick.gz', package='phyloseq') +#' otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq") +#' mapfile <- system.file("extdata", "master_map.txt", package="phyloseq") +#' trefile <- system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq") #' import_qiime(otufile, mapfile, trefile) -import_qiime <- function(otufilename = NULL, mapfilename = NULL, treefilename = NULL, - refseqfilename = NULL, refseqFunction = readDNAStringSet, refseqArgs = NULL, - parseFunction = parse_taxonomy_qiime, verbose = TRUE, ...) { - - # initialize the argument-list for phyloseq. Start empty. - argumentlist <- list() - - if (!is.null(mapfilename)) { - if (verbose) { - cat("Processing map file...", fill = TRUE) - } - QiimeMap <- import_qiime_sample_data(mapfilename) - argumentlist <- c(argumentlist, list(QiimeMap)) - } - - if (!is.null(otufilename)) { - if (verbose) { - cat("Processing otu/tax file...", fill = TRUE) - } - otutax <- import_qiime_otu_tax(otufilename, parseFunction, verbose = verbose) - otutab <- otu_table(otutax$otutab, TRUE) - taxtab <- tax_table(otutax$taxtab) - argumentlist <- c(argumentlist, list(otutab), list(taxtab)) - } - - if (!is.null(treefilename)) { - if (verbose) { - cat("Processing phylogenetic tree...\n", treefilename, "...\n") - } - if (inherits(treefilename, "phylo")) { - # If argument is already a tree, don't read, just assign. - tree = treefilename - } else { - # If it is not a tree, assume file and attempt to import. NULL is silently - # returned if tree is not read properly. - tree <- read_tree(treefilename, ...) - } - # Add to argument list or warn - if (is.null(tree)) { - warning("treefilename failed import. It will not be included.") - } else { - argumentlist <- c(argumentlist, list(tree)) - } - } - - if (!is.null(refseqfilename)) { - if (verbose) { - cat("Processing Reference Sequences...", fill = TRUE) - } - if (inherits(refseqfilename, "XStringSet")) { - # If argument is already a XStringSet, don't read, just assign. - refseq = refseqfilename - } else { - # call refseqFunction and read refseqfilename, either with or without additional - # args - if (!is.null(refseqArgs)) { - refseq = do.call("refseqFunction", c(list(refseqfilename), refseqArgs)) - } else { - refseq = refseqFunction(refseqfilename) - } - } - argumentlist <- c(argumentlist, list(refseq)) - } - - do.call("phyloseq", argumentlist) +import_qiime <- function(otufilename=NULL, mapfilename=NULL, + treefilename=NULL, refseqfilename=NULL, + refseqFunction=readDNAStringSet, refseqArgs=NULL, + parseFunction=parse_taxonomy_qiime, verbose=TRUE, ...){ + + # initialize the argument-list for phyloseq. Start empty. + argumentlist <- list() + + if( !is.null(mapfilename) ){ + if( verbose ){ + cat("Processing map file...", fill=TRUE) + } + QiimeMap <- import_qiime_sample_data(mapfilename) + argumentlist <- c(argumentlist, list(QiimeMap)) + } + + if( !is.null(otufilename) ){ + if( verbose ){ + cat("Processing otu/tax file...", fill=TRUE) + } + otutax <- import_qiime_otu_tax(otufilename, parseFunction, verbose=verbose) + otutab <- otu_table(otutax$otutab, TRUE) + taxtab <- tax_table(otutax$taxtab) + argumentlist <- c(argumentlist, list(otutab), list(taxtab) ) + } + + if( !is.null(treefilename) ){ + if(verbose){cat("Processing phylogenetic tree...\n", treefilename, "...\n")} + if(inherits(treefilename, "phylo")){ + # If argument is already a tree, don't read, just assign. + tree = treefilename + } else { + # If it is not a tree, assume file and attempt to import. + # NULL is silently returned if tree is not read properly. + tree <- read_tree(treefilename, ...) + } + # Add to argument list or warn + if( is.null(tree) ){ + warning("treefilename failed import. It will not be included.") + } else { + argumentlist <- c(argumentlist, list(tree) ) + } + } + + if( !is.null(refseqfilename) ){ + if( verbose ){ + cat("Processing Reference Sequences...", fill=TRUE) + } + if( inherits(refseqfilename, "XStringSet") ){ + # If argument is already a XStringSet, don't read, just assign. + refseq = refseqfilename + } else { + # call refseqFunction and read refseqfilename, + # either with or without additional args + if( !is.null(refseqArgs) ){ + refseq = do.call("refseqFunction", c(list(refseqfilename), refseqArgs)) + } else { + refseq = refseqFunction(refseqfilename) + } + } + argumentlist <- c(argumentlist, list(refseq) ) + } + + do.call("phyloseq", argumentlist) } -################################################################################ +################################################################################ #' Somewhat flexible tree-import function #' #' This function is a convenience wrapper around the @@ -346,33 +346,32 @@ import_qiime <- function(otufilename = NULL, mapfilename = NULL, treefilename = #' @importFrom ape read.tree #' @export #' @examples -#' read_tree(system.file('extdata', 'esophagus.tree.gz', package='phyloseq')) -#' read_tree(system.file('extdata', 'GP_tree_rand_short.newick.gz', package='phyloseq')) -read_tree <- function(treefile, errorIfNULL = FALSE, ...) { - # 'phylo' object provided directly - if (class(treefile)[1] %in% c("phylo")) { - tree <- treefile - } else { - # file path to tree file provided. Try Nexus first, protected, then newick if it - # fails - tree <- NULL - try(tree <- read.nexus(treefile, ...), TRUE) - # Try Newick if nexus didn't work. - if (is.null(tree)) - try(tree <- read.tree(treefile, ...), TRUE) - } - # If neither tree-import worked (still NULL), report warning - if (errorIfNULL & is.null(tree)) { - stop("tree file could not be read.\nPlease retry with valid tree.") - } - if (!is.null(tree)) { - # Perform any standard phyloseq checks/fixes E.g. Replace any NA branch-length - # values in the tree with zero. - tree = fix_phylo(tree) +#' read_tree(system.file("extdata", "esophagus.tree.gz", package="phyloseq")) +#' read_tree(system.file("extdata", "GP_tree_rand_short.newick.gz", package="phyloseq")) +read_tree <- function(treefile, errorIfNULL=FALSE, ...){ + # "phylo" object provided directly + if( class(treefile)[1] %in% c("phylo") ){ + tree <- treefile + } else { + # file path to tree file provided. + # Try Nexus first, protected, then newick if it fails + tree <- NULL + try(tree <- read.nexus(treefile, ...), TRUE) + # Try Newick if nexus didn't work. + if(is.null(tree)) try(tree <- read.tree(treefile, ...), TRUE) + } + # If neither tree-import worked (still NULL), report warning + if( errorIfNULL & is.null(tree) ){ + stop("tree file could not be read.\nPlease retry with valid tree.") + } + if( !is.null(tree) ){ + # Perform any standard phyloseq checks/fixes + # E.g. Replace any NA branch-length values in the tree with zero. + tree = fix_phylo(tree) } - return(tree) + return(tree) } -################################################################################ +################################################################################ #' Read GreenGenes tree released in annotated newick format #' #' In principal, this is a standard newick format, that can be imported @@ -411,7 +410,7 @@ read_tree <- function(treefile, errorIfNULL = FALSE, ...) { #' @examples #' # Read the May 2013, 73% similarity official tree, #' # included as extra data in phyloseq. -#' treefile = system.file('extdata', 'gg13-5-73.tree.gz', package='phyloseq') +#' treefile = system.file("extdata", "gg13-5-73.tree.gz", package="phyloseq") #' x = read_tree_greengenes(treefile) #' x #' class(x) @@ -419,29 +418,29 @@ read_tree <- function(treefile, errorIfNULL = FALSE, ...) { #' y #' class(y) #' ## Not run, causes an error: -#' # library('ape') +#' # library("ape") #' # read.tree(treefile) -read_tree_greengenes = function(treefile) { - alines = readLines(treefile, warn = FALSE) +read_tree_greengenes = function(treefile){ + alines = readLines(treefile, warn=FALSE) # Collapse to one line, in case it isn't already. - alines = paste0(alines, collapse = "") - # replace all semicolons with something weird that isn't already a special newick - # character. + alines = paste0(alines, collapse="") + # replace all semicolons with something weird + # that isn't already a special newick character. newdelim = "><-><" clines = gsub("\\;", newdelim, alines) # reinstate the final character as a semicolon clines = gsub(paste0(newdelim, "$"), ";", clines) # Convert your newick string into a phylo-class tree. - tree = read.tree("", text = clines) - # Now that it is phylo-class, reinstate semicolon as the delimiter in the node - # labels + tree = read.tree("", text=clines) + # Now that it is phylo-class, reinstate semicolon + # as the delimiter in the node labels gsub(newdelim, ";", tree$node.label) # Also get rid of those extra quotes gsub("'", "", tree$node.label) # Return the cleaned-up tree return(tree) } -################################################################################ +################################################################################ #' Import now legacy-format QIIME OTU table as a list of two matrices. #' #' Now a legacy-format, older versions of QIIME @@ -473,7 +472,7 @@ read_tree_greengenes = function(treefile) { #' @param parseFunction (Optional). An optional custom function for parsing the #' character string that contains the taxonomic assignment of each OTU. #' The default parsing function is \code{\link{parse_taxonomy_qiime}}, -#' specialized for splitting the \code{';'}-delimited strings and also +#' specialized for splitting the \code{";"}-delimited strings and also #' attempting to interpret greengenes prefixes, if any, as that is a common #' format of the taxonomy string produced by QIIME. #' @@ -514,55 +513,44 @@ read_tree_greengenes = function(treefile) { #' #' @export #' @examples -#' otufile <- system.file('extdata', 'GP_otu_table_rand_short.txt.gz', package='phyloseq') +#' otufile <- system.file("extdata", "GP_otu_table_rand_short.txt.gz", package="phyloseq") #' import_qiime_otu_tax(otufile) -import_qiime_otu_tax <- function(file, parseFunction = parse_taxonomy_qiime, verbose = TRUE, - parallel = FALSE) { - if (verbose) { - cat("Reading file into memory prior to parsing...\n") - } +import_qiime_otu_tax <- function(file, parseFunction=parse_taxonomy_qiime, + verbose=TRUE, parallel=FALSE){ + if(verbose){cat("Reading file into memory prior to parsing...\n")} x = readLines(file) - if (verbose) { - cat("Detecting first header line...\n") - } - # Check for commented lines, starting with line 1. The deepest commented line - # (largest n) is assumed to have header information. - skipLines = max(which(substr(x[1:25L], 1, 1) == "#")) - 1L - if (verbose) { - cat("Header is on line", (skipLines + 1L), " \n") - } - if (verbose) { - cat("Converting input file to a table...\n") - } - x = fread(input = paste0(x, collapse = "\n"), sep = "\t", header = TRUE, skip = skipLines) - if (verbose) { - cat("Defining OTU table... \n") - } + if(verbose){cat("Detecting first header line...\n")} + # Check for commented lines, starting with line 1. + # The deepest commented line (largest n) is assumed to have header information. + skipLines = max(which(substr(x[1:25L], 1, 1)=="#"))-1L + if(verbose){cat("Header is on line", (skipLines + 1L), " \n")} + if(verbose){cat("Converting input file to a table...\n")} + x = fread(input=paste0(x, collapse="\n"), sep="\t", header=TRUE, skip=skipLines) + if(verbose){cat("Defining OTU table... \n")} taxstring = x$`Consensus Lineage` # This pops the taxonomy (Consensus Lineage) column, in-place statement - x[, `:=`(`Consensus Lineage`, NULL)] + x[, `Consensus Lineage`:=NULL] # Store the OTU names, you will pop the column OTUnames = x$`#OTU ID` # This pops the OTUID column, in-place statement - x[, `:=`(`#OTU ID`, NULL)] + x[, `#OTU ID`:=NULL] x <- as(x, "matrix") rownames(x) <- OTUnames rm(OTUnames) - if (verbose) { - cat("Parsing taxonomy table...\n") - } - # Split into 'jagged' list (vectors of different lengths) - taxlist = llply(taxstring, parseFunction, .parallel = parallel) + if(verbose){cat("Parsing taxonomy table...\n")} + # Split into "jagged" list (vectors of different lengths) + taxlist = llply(taxstring, parseFunction, .parallel=parallel) # Add OTU names to list element names names(taxlist) <- rownames(x) - # Build the tax table from the jagged list. + # Build the tax table from the jagged list. taxtab <- build_tax_table(taxlist) - # Call garbage collection one more time. Lots of unneeded stuff. - garbage.collection <- gc(FALSE) + # Call garbage collection one more time. Lots of unneeded stuff. + garbage.collection <- gc(FALSE) # Return the named list - return(list(otutab = x, taxtab = taxtab)) + return(list(otutab=x, taxtab=taxtab)) } -################################################################################ +################################################################################ +################################################################################ #' Import just \code{sample_data} file from QIIME pipeline. #' #' QIIME produces several files that can be analyzed in the phyloseq-package, @@ -602,15 +590,16 @@ import_qiime_otu_tax <- function(file, parseFunction = parse_taxonomy_qiime, ver #' #' @export #' @examples -#' mapfile <- system.file('extdata', 'master_map.txt', package = 'phyloseq') +#' mapfile <- system.file("extdata", "master_map.txt", package = "phyloseq") #' import_qiime_sample_data(mapfile) -import_qiime_sample_data <- function(mapfilename) { - # Process mapfile. Name rows as samples. - QiimeMap <- read.table(file = mapfilename, header = TRUE, sep = "\t", comment.char = "") - rownames(QiimeMap) <- as.character(QiimeMap[, 1]) - return(sample_data(QiimeMap)) +import_qiime_sample_data <- function(mapfilename){ + # Process mapfile. Name rows as samples. + QiimeMap <- read.table(file=mapfilename, header=TRUE, + sep="\t", comment.char="") + rownames(QiimeMap) <- as.character(QiimeMap[,1]) + return( sample_data(QiimeMap) ) } -################################################################################ +################################################################################ #' Read a UniFrac-formatted ENV file. #' #' Convenience wrapper function to read the environment-file, as formatted for @@ -619,7 +608,7 @@ import_qiime_sample_data <- function(mapfilename) { #' each row specifies (in order) the sequence name, source sample, and (optionally) #' the number of times the sequence was observed. #' -#' @usage import_env_file(envfilename, tree=NULL, sep='\t', ...) +#' @usage import_env_file(envfilename, tree=NULL, sep="\t", ...) #' #' @param envfilename (Required). A charater string of the ENV filename (relative or absolute) #' @@ -627,7 +616,7 @@ import_qiime_sample_data <- function(mapfilename) { #' the output otu_table. #' #' @param sep A character string indicating the delimiter used in the file. -#' The default is \code{'\t'}. +#' The default is \code{"\t"}. #' #' @param ... Additional parameters passed on to \code{\link{read.table}}. #' @@ -643,19 +632,18 @@ import_qiime_sample_data <- function(mapfilename) { #' @export #' @examples #' # import_env_file(myEnvFile, myTree) -import_env_file <- function(envfilename, tree = NULL, sep = "\t", ...) { - tipSampleTable <- read.table(envfilename, sep = sep, ...) - # Convert to otu_table-class table (trivial table) - physeq <- envHash2otu_table(tipSampleTable) - # If tree is provided, combine it with the OTU Table - if (class(tree) == "phylo") { - # Create phyloseq-class with a tree and OTU Table (will perform any needed - # pruning) - physeq <- phyloseq(physeq, tree) - } - return(physeq) +import_env_file <- function(envfilename, tree=NULL, sep="\t", ...){ + tipSampleTable <- read.table(envfilename, sep=sep, ...) + # Convert to otu_table-class table (trivial table) + physeq <- envHash2otu_table(tipSampleTable) + # If tree is provided, combine it with the OTU Table + if( class(tree) == "phylo" ){ + # Create phyloseq-class with a tree and OTU Table (will perform any needed pruning) + physeq <- phyloseq(physeq, tree) + } + return(physeq) } -################################################################################ +################################################################################ #' Convert a sequence-sample hash (like ENV file) into an OTU table. #' #' Parses an ENV-file into a sparse matrix of species-by-sample, where @@ -685,33 +673,34 @@ import_env_file <- function(envfilename, tree = NULL, sep = "\t", ...) { #' #' @keywords internal #' @examples # -#' ## fakeSeqNameVec <- paste('seq_', 1:8, sep='') -#' ## fakeSamNameVec <- c(rep('A', 4), rep('B', 4)) +#' ## fakeSeqNameVec <- paste("seq_", 1:8, sep="") +#' ## fakeSamNameVec <- c(rep("A", 4), rep("B", 4)) #' ## fakeSeqAbunVec <- sample(1:50, 8, TRUE) #' ## test <- cbind(fakeSeqNameVec, fakeSamNameVec, fakeSeqAbunVec) #' ## testotu <- envHash2otu_table( test ) #' ## test <- cbind(fakeSeqNameVec, fakeSamNameVec) #' ## testotu <- envHash2otu_table( test ) -envHash2otu_table <- function(tipSampleTable) { - if (ncol(tipSampleTable) > 2) { - tst <- tipSampleTable - trivialOTU <- matrix(0, nrow = nrow(tst), ncol = length(unique(tst[, 2]))) - colnames(trivialOTU) <- unique(tst[, 2]) - rownames(trivialOTU) <- tst[, 1] - for (i in 1:nrow(tst)) { - trivialOTU[tst[i, 1], tst[i, 2]] <- as.integer(tst[i, 3]) - } - } else { - trivialOTU <- table(as.data.frame(tipSampleTable)) - trivialOTU <- as(trivialOTU, "matrix") - } - return(otu_table(trivialOTU, taxa_are_rows = TRUE)) +envHash2otu_table <- function(tipSampleTable){ + if( ncol(tipSampleTable) > 2 ){ + tst <- tipSampleTable + trivialOTU <- matrix(0, nrow=nrow(tst), ncol=length(unique(tst[,2]))) + colnames(trivialOTU) <- unique(tst[,2]) + rownames(trivialOTU) <- tst[,1] + for( i in 1:nrow(tst) ){ + trivialOTU[tst[i, 1], tst[i, 2]] <- as.integer(tst[i, 3]) + } + } else { + trivialOTU <- table(as.data.frame(tipSampleTable)) + trivialOTU <- as(trivialOTU, "matrix") + } + return( otu_table(trivialOTU, taxa_are_rows=TRUE) ) } -################################################################################ +################################################################################ +################################################################################ #' Import RDP cluster file and return otu_table (abundance table). #' #' The RDP cluster pipeline (specifically, the output of the complete linkage clustering step) -#' has no formal documentation for the \code{'.clust'} +#' has no formal documentation for the \code{".clust"} #' file or its apparent sequence naming convention. #' #' \code{http://pyro.cme.msu.edu/index.jsp} @@ -720,113 +709,109 @@ envHash2otu_table <- function(tipSampleTable) { #' the names of all sequences contained in input alignment. If the upstream #' barcode and aligment processing steps are also done with the RDP pipeline, #' then the sequence names follow a predictable naming convention wherein each -#' sequence is named by its sample and sequence ID, separated by a \code{'_'} as +#' sequence is named by its sample and sequence ID, separated by a \code{"_"} as #' delimiter: #' -#' \code{'sampleName_sequenceIDnumber'} +#' \code{"sampleName_sequenceIDnumber"} #' #' This import function assumes that the sequence names in the cluster file follow -#' this convention, and that the sample name does not contain any \code{'_'}. It +#' this convention, and that the sample name does not contain any \code{"_"}. It #' is unlikely to work if this is not the case. It is likely to work if you used #' the upstream steps in the RDP pipeline to process your raw (barcoded, untrimmed) #' fasta/fastq data. #' -#' This function first loops through the \code{'.clust'} file and collects all -#' of the sample names that appear. It secondly loops through each OTU (\code{'cluster'}; +#' This function first loops through the \code{".clust"} file and collects all +#' of the sample names that appear. It secondly loops through each OTU (\code{"cluster"}; #' each row of the cluster file) and sums the number of sequences (reads) from #' each sample. The resulting abundance table of OTU-by-sample is trivially #' coerced to an \code{\link{otu_table}} object, and returned. #' #' @usage import_RDP_cluster(RDP_cluster_file) #' -#' @param RDP_cluster_file A character string. The name of the \code{'.clust'} +#' @param RDP_cluster_file A character string. The name of the \code{".clust"} #' file produced by the #' the complete linkage clustering step of the RDP pipeline. #' -#' @return An \code{\link{otu_table}} object parsed from the \code{'.clust'} file. +#' @return An \code{\link{otu_table}} object parsed from the \code{".clust"} file. #' #' @references \url{http://pyro.cme.msu.edu/index.jsp} #' #' @export #' -import_RDP_cluster <- function(RDP_cluster_file) { - - # Read file and pop the header lines - RDP_raw_otu_lines_only <- readLines(RDP_cluster_file)[-(1:5)] - - # internal function: - make_verbose_sample_list <- function(RDP_raw_otu_lines_only) { - # Each OTU line has a 3 element 'line header' that indicates the OTUID, the name - # of the file, and the number of sequences that are included in this cluster. - # From each line, remove the header elements - get_sample_names_from_one_line <- function(otuline) { - # first split the line on tabs '\t' - splittabs <- strsplit(otuline, "\t")[[1]] - - # next, remove the header by keeping on the 4th element. - seqIDonly <- splittabs[4] - - # Finally, split on white space - seqIDonly <- strsplit(seqIDonly, "[[:space:]]+")[[1]] - - # For each element in seqIDonly, split on the underscore delimiter - splitseqnames <- strsplit(seqIDonly, "_", fixed = TRUE) - - # Return the sample names from the first element (assumes no '_' in sample names) - return(sapply(splitseqnames, function(i) { - i[1] - })) - } - return(sapply(RDP_raw_otu_lines_only, get_sample_names_from_one_line)) - } - - ## Get the verbose sample name list, and then shrink to the unique sample names in - ## the entire dataset. Need this unique list for initializing the OTU abundance - ## matrix - RDPsamplenameslist <- make_verbose_sample_list(RDP_raw_otu_lines_only) - RDPsamplenames <- unique(unlist(RDPsamplenameslist)) - - # remove NAs - RDPsamplenames <- RDPsamplenames[!is.na(RDPsamplenames)] - - # Initialize otu abundance matrix. - otumat <- matrix(0, nrow = length(RDP_raw_otu_lines_only), ncol = length(RDPsamplenames)) - rownames(otumat) <- paste("OTUID_", 1:length(RDP_raw_otu_lines_only)) - colnames(otumat) <- RDPsamplenames - - # Now re-loop through the cluster file (by OTU) and sum the abundance of - # sequences from each sample - for (i in 1:length(RDP_raw_otu_lines_only)) { - # i = 1 - - # first split the line on tabs '\t' - splittabs <- strsplit(RDP_raw_otu_lines_only[i], "\t")[[1]] - - # next, remove the header by keeping on the 4th element. - seqIDonly <- splittabs[4] - - # Finally, split on white space - seqIDonly <- strsplit(seqIDonly, "[[:space:]]+")[[1]] - - # For each element in seqIDonly, split on the underscore delimiter - splitseqnames <- strsplit(seqIDonly, "_", fixed = TRUE) - - # make the verbose vector - verbosesamplenamesi <- sapply(splitseqnames, function(i) { - i[1] - }) - - # sum the reads from each sample with tapply - OTUi <- tapply(verbosesamplenamesi, factor(verbosesamplenamesi), length) - - # store results of this OTU in abundance matrix - otumat[i, names(OTUi)] <- OTUi - } - - # Return the abundance table. - return(otu_table(otumat, taxa_are_rows = TRUE)) +import_RDP_cluster <- function(RDP_cluster_file){ + + # Read file and pop the header lines + RDP_raw_otu_lines_only <- readLines(RDP_cluster_file)[-(1:5)] + + # internal function: + make_verbose_sample_list <- function(RDP_raw_otu_lines_only){ + # Each OTU line has a 3 element "line header" that indicates the OTUID, the name of the file, + # and the number of sequences that are included in this cluster. + # From each line, remove the header elements + get_sample_names_from_one_line <- function(otuline){ + # first split the line on tabs "\t" + splittabs <- strsplit(otuline, "\t")[[1]] + + # next, remove the header by keeping on the 4th element. + seqIDonly <- splittabs[4] + + # Finally, split on white space + seqIDonly <- strsplit(seqIDonly, "[[:space:]]+")[[1]] + + # For each element in seqIDonly, split on the underscore delimiter + splitseqnames <- strsplit(seqIDonly, "_", fixed=TRUE) + + # Return the sample names from the first element (assumes no "_" in sample names) + return( sapply(splitseqnames, function(i){i[1]}) ) + } + return( sapply(RDP_raw_otu_lines_only, get_sample_names_from_one_line) ) + } + + ## Get the verbose sample name list, and then shrink to the + ## unique sample names in the entire dataset. + ## Need this unique list for initializing the OTU abundance matrix + RDPsamplenameslist <- make_verbose_sample_list(RDP_raw_otu_lines_only) + RDPsamplenames <- unique(unlist(RDPsamplenameslist)) + + # remove NAs + RDPsamplenames <- RDPsamplenames[!is.na(RDPsamplenames)] + + # Initialize otu abundance matrix. + otumat <- matrix(0, nrow=length(RDP_raw_otu_lines_only), ncol=length(RDPsamplenames)) + rownames(otumat) <- paste("OTUID_", 1:length(RDP_raw_otu_lines_only)) + colnames(otumat) <- RDPsamplenames + + # Now re-loop through the cluster file (by OTU) and sum the + # abundance of sequences from each sample + for( i in 1:length(RDP_raw_otu_lines_only) ){ + # i = 1 + + # first split the line on tabs "\t" + splittabs <- strsplit(RDP_raw_otu_lines_only[i], "\t")[[1]] + + # next, remove the header by keeping on the 4th element. + seqIDonly <- splittabs[4] + + # Finally, split on white space + seqIDonly <- strsplit(seqIDonly, "[[:space:]]+")[[1]] + + # For each element in seqIDonly, split on the underscore delimiter + splitseqnames <- strsplit(seqIDonly, "_", fixed=TRUE) + + # make the verbose vector + verbosesamplenamesi <- sapply(splitseqnames, function(i){i[1]}) + + # sum the reads from each sample with tapply + OTUi <- tapply(verbosesamplenamesi, factor(verbosesamplenamesi), length) + + # store results of this OTU in abundance matrix + otumat[i, names(OTUi)] <- OTUi + } + + # Return the abundance table. + return( otu_table(otumat, taxa_are_rows=TRUE) ) } -################################################################################ +################################################################################ #' Import new RDP OTU-table format #' #' Recently updated tools on RDP Pyro site make it easier to import Pyrosequencing output @@ -834,7 +819,7 @@ import_RDP_cluster <- function(RDP_cluster_file) { #' (generated from RDP Clustering tools) to create a community data matrix file #' for distance cutoff range you are interested in. The resulting output file #' is a tab-delimited file containing the number of sequences for each sample -#' for each OTU. The OTU header naming convention is \code{'OTU_'} followed by the OTU +#' for each OTU. The OTU header naming convention is \code{"OTU_"} followed by the OTU #' number in the cluster file. It pads ``0''s to make the OTU header easy to sort. #' The OTU numbers are not necessarily in order. #' @@ -855,7 +840,7 @@ import_RDP_cluster <- function(RDP_cluster_file) { #' #' @export #' @examples -#' otufile <- system.file('extdata', 'rformat_dist_0.03.txt.gz', package='phyloseq') +#' otufile <- system.file("extdata", "rformat_dist_0.03.txt.gz", package="phyloseq") #' ### the gzipped file is automatically recognized, and read using R-connections #' ex_otu <- import_RDP_otu(otufile) #' class(ex_otu) @@ -863,24 +848,28 @@ import_RDP_cluster <- function(RDP_cluster_file) { #' nsamples(ex_otu) #' sample_sums(ex_otu) #' head(t(ex_otu)) -import_RDP_otu <- function(otufile) { - otumat <- read.table(otufile, TRUE, sep = "\t", row.names = 1) - return(otu_table(otumat, FALSE)) +import_RDP_otu <- function(otufile){ + otumat <- read.table(otufile, TRUE, sep="\t", row.names=1) + return(otu_table(otumat, FALSE)) } -################################################################################ +################################################################################ +################################################################################ +################################################################################ +################################################################################ +################################################################################ #' Imports a tab-delimited version of the pyrotagger output file. #' #' PyroTagger is a web-server that takes raw, barcoded 16S rRNA amplicon sequences -#' and returns an excel spreadsheet (\code{'.xls'}) with both abundance and +#' and returns an excel spreadsheet (\code{".xls"}) with both abundance and #' taxonomy data. It also includes some confidence information related to the #' taxonomic assignment. #' #' PyroTagger is created and maintained by the Joint Genome Institute -#' at \code{'http://pyrotagger.jgi-psf.org/'} +#' at \code{"http://pyrotagger.jgi-psf.org/"} #' -#' The typical output form PyroTagger is a spreadsheet format \code{'.xls'}, which poses +#' The typical output form PyroTagger is a spreadsheet format \code{".xls"}, which poses #' additional import challenges. However, virtually all spreadsheet applications -#' support the \code{'.xls'} format, and can further export this file in a +#' support the \code{".xls"} format, and can further export this file in a #' tab-delimited format. It is recommended that you convert the xls-file without #' any modification (as tempting as it might be once you have loaded it) into a #' tab-delimited text file. Deselect any options to encapsulate fields in quotes, @@ -891,7 +880,7 @@ import_RDP_otu <- function(otufile) { #' #' A highly-functional and free spreadsheet application can be obtained as part #' of the cross-platform \code{OpenOffice} suite. It works for the above -#' required conversion. Go to \code{'http://www.openoffice.org/'}. +#' required conversion. Go to \code{"http://www.openoffice.org/"}. #' #' It is regrettable that this importer does not take the xls-file directly #' as input. However, because of the moving-target nature of spreadsheet @@ -903,7 +892,7 @@ import_RDP_otu <- function(otufile) { #' long-run. #' #' @usage import_pyrotagger_tab(pyrotagger_tab_file, -#'\tstrict_taxonomy=FALSE, keep_potential_chimeras=FALSE) +#' strict_taxonomy=FALSE, keep_potential_chimeras=FALSE) #' #' @param pyrotagger_tab_file (Required). A character string. The name of the tab-delimited #' pyrotagger output table. @@ -926,78 +915,87 @@ import_RDP_otu <- function(otufile) { #' #' @examples #' ## New_otuTaxObject <- import_pyrotagger_tab(pyrotagger_tab_file) -import_pyrotagger_tab <- function(pyrotagger_tab_file, strict_taxonomy = FALSE, keep_potential_chimeras = FALSE) { - - x <- readLines(pyrotagger_tab_file, warn = FALSE) - # Get the header - pyro_header <- strsplit(x[1], "\t", TRUE)[[1]] - # Pop the first (header) line from the list. - x <- x[-1] - - ######################################## There are 'Potential chimeras' listed in the typical output, separated by 2 - ######################################## completely blank lines after the last confidently-good OTU. - chimera_line <- grep("Potential chimeras", x, fixed = TRUE) - if (keep_potential_chimeras) { - # Pop just the blank lines that delimit the chimeras at the bottom of the table - x <- x[-((chimera_line - 2):chimera_line)] - } else { - x <- x[-((chimera_line - 2):length(x))] - } - - ######################################## The tab-split character list, z - z <- strsplit(x, "\t", TRUE) - names(z) <- sapply(z, function(z) { - z[1] - }) - - # The table switches from abundance to taxonomy at the '% Identity' column - taxonomy_table_column_index <- which(pyro_header == "% identity") - - ######################################## Initialize the two matrices (otu_table and taxonomyTable) Initialize abundance - ######################################## matrix, a - a <- matrix(0, nrow = length(x), ncol = (taxonomy_table_column_index - 2)) - colnames(a) <- pyro_header[2:(taxonomy_table_column_index - 1)] - rownames(a) <- names(z) - - ###### Initialize the raw pyrotagger taxonomy matrix, w - ntax_tablecols <- (max(sapply(z, length)) - taxonomy_table_column_index + 1) - w <- matrix("", nrow = length(x), ncol = ntax_tablecols) - rownames(w) <- names(z) - colnamesw <- pyro_header[-(1:(taxonomy_table_column_index - 1))] - colnamesw <- colnamesw[1:which(colnamesw == "Taxonomy")] - colnamesw <- c(colnamesw, paste("col", (which(colnamesw == "Taxonomy") + 1):ntax_tablecols, - sep = "")) - colnames(w) <- colnamesw - - # Rename the taxonomy columns - biotaxonomy <- c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species", - "Strain") - colnames(w)[which(colnames(w) == "Taxonomy"):length(colnames(w))][1:length(biotaxonomy)] <- biotaxonomy - - # Loop through each line and add to appropriate matrix. - for (i in rownames(a)) { - # i <- rownames(a)[[1]] cut out just the abundance part, and convert to integer - y <- as.integer(z[[i]][2:(taxonomy_table_column_index - 1)]) - y[is.na(y)] <- 0 - a[i, ] <- y - - # Taxonomy data is jagged - taxi <- z[[i]][-(1:(taxonomy_table_column_index - 1))] - w[i, 1:length(taxi)] <- taxi - } - - # Create the component objects - OTU <- otu_table(a, taxa_are_rows = TRUE) - if (strict_taxonomy) { - TAX <- tax_table[, biotaxonomy] - } else { - TAX <- tax_table(w) - } - - return(phyloseq(OTU, TAX)) - +import_pyrotagger_tab <- function(pyrotagger_tab_file, + strict_taxonomy=FALSE, keep_potential_chimeras=FALSE){ + + x <- readLines(pyrotagger_tab_file, warn=FALSE) + # Get the header + pyro_header <- strsplit(x[1], "\t", TRUE)[[1]] + # Pop the first (header) line from the list. + x <- x[-1] + + ######################################## + ### There are "Potential chimeras" + ### listed in the typical output, separated by 2 completely blank lines + ### after the last confidently-good OTU. + ######################################## + chimera_line <- grep("Potential chimeras", x, fixed=TRUE) + if( keep_potential_chimeras ){ + # Pop just the blank lines that delimit the chimeras + # at the bottom of the table + x <- x[-((chimera_line-2):chimera_line)] + } else { + x <- x[-((chimera_line-2):length(x))] + } + + ######################################## + # The tab-split character list, z + ######################################## + z <- strsplit(x, "\t", TRUE) + names(z) <- sapply(z, function(z){z[1]}) + + # The table switches from abundance to taxonomy at the "% Identity" column + taxonomy_table_column_index <- which( pyro_header == "% identity" ) + + ######################################## + # Initialize the two matrices + # (otu_table and taxonomyTable) + ######################################## + ### Initialize abundance matrix, a + a <- matrix(0, nrow=length(x), ncol=(taxonomy_table_column_index-2)) + colnames(a) <- pyro_header[2:(taxonomy_table_column_index-1)] + rownames(a) <- names(z) + + ###### Initialize the raw pyrotagger taxonomy matrix, w + ntax_tablecols <- (max(sapply(z, length)) - taxonomy_table_column_index + 1) + w <- matrix("", nrow=length(x), ncol=ntax_tablecols) + rownames(w) <- names(z) + colnamesw <- pyro_header[-(1:(taxonomy_table_column_index-1))] + colnamesw <- colnamesw[1:which(colnamesw=="Taxonomy")] + colnamesw <- c(colnamesw, paste("col", (which(colnamesw=="Taxonomy")+1):ntax_tablecols, sep="") ) + colnames(w) <- colnamesw + + # Rename the taxonomy columns + biotaxonomy <- c("Domain", "Phylum", "Class", "Order", + "Family", "Genus", "Species", "Strain") + colnames(w)[which(colnames(w)=="Taxonomy"):length(colnames(w))][1:length(biotaxonomy)] <- biotaxonomy + + # Loop through each line and add to appropriate matrix. + for( i in rownames(a) ){ + # i <- rownames(a)[[1]] + # cut out just the abundance part, and convert to integer + y <- as.integer(z[[i]][2:(taxonomy_table_column_index-1)]) + y[is.na(y)] <- 0 + a[i, ] <- y + + # Taxonomy data is jagged + taxi <- z[[i]][-(1:(taxonomy_table_column_index-1))] + w[i, 1:length(taxi)] <- taxi + } + + # Create the component objects + OTU <- otu_table(a, taxa_are_rows=TRUE) + if( strict_taxonomy ){ + TAX <- tax_table[, biotaxonomy] + } else { + TAX <- tax_table(w) + } + + return( phyloseq(OTU, TAX) ) + } -################################################################################ +################################################################################ +################################################################################ #' Show cutoff values available in a mothur file. #' #' This is a helper function to report back to the user the different cutoff @@ -1018,17 +1016,17 @@ import_pyrotagger_tab <- function(pyrotagger_tab_file, strict_taxonomy = FALSE, #' #' @seealso \code{\link{import_mothur}} #' -show_mothur_cutoffs <- function(mothur_list_file) { - unique(scan(mothur_list_file, "character", comment.char = "\t", quiet = TRUE)) +show_mothur_cutoffs <- function(mothur_list_file){ + unique(scan(mothur_list_file, "character", comment.char="\t", quiet=TRUE)) } -################################################################################ +################################################################################ #' Import mothur list file and return as list object in R. #' #' This is a user-available module of a more comprehensive function for importing #' OTU clustering/abundance data using the \emph{mothur} package. The list object #' returned by this function is not immediately useable by other \emph{phyloseq} #' functions, and must be first parsed in conjunction with a separate \emph{mothur} -#' \code{'group'} file. This function is made accessible to \emph{phyloseq} users +#' \code{"group"} file. This function is made accessible to \emph{phyloseq} users #' for troubleshooting and inspection, but the \code{link{import_mothur()}} function #' is suggested if the goal is to import the OTU clustering results from \emph{mothur} #' into \emph{phyloseq}. @@ -1037,7 +1035,7 @@ show_mothur_cutoffs <- function(mothur_list_file) { #' #' @param mothur_list_file The list file name and/or location as produced by \emph{mothur}. #' -#' @param cutoff A character string indicating the cutoff value, (or \code{'unique'}), +#' @param cutoff A character string indicating the cutoff value, (or \code{"unique"}), #' that matches one of the cutoff-values used to produce the OTU clustering #' results contained within the list-file created by \emph{mothur}. The default #' is to take the largest value among the cutoff values contained in the list @@ -1056,61 +1054,59 @@ show_mothur_cutoffs <- function(mothur_list_file) { #' @seealso \code{\link{show_mothur_cutoffs}}, \code{\link{import_mothur}} #' @keywords internal #' -import_mothur_otulist <- function(mothur_list_file, cutoff = NULL) { - # mothur_list_file = system.file('extdata', 'esophagus.fn.list.gz', - # package='phyloseq') cutoff = 0.04 +import_mothur_otulist <- function(mothur_list_file, cutoff=NULL){ + # mothur_list_file = system.file("extdata", "esophagus.fn.list.gz", package="phyloseq") + # cutoff = 0.04 cutoffs = show_mothur_cutoffs(mothur_list_file) cutoff = select_mothur_cutoff(cutoff, cutoffs) - # Read only the line corresponding to that cutoff + # Read only the line corresponding to that cutoff inputline = which(cutoffs == cutoff) - rawlines = scan(mothur_list_file, "character", sep = "\t", skip = (inputline - - 1), nlines = 1, na.strings = "", quiet = TRUE) + rawlines = scan(mothur_list_file, "character", sep="\t", skip=(inputline-1), nlines=1, na.strings="", quiet=TRUE) rawlines = rawlines[!is.na(rawlines)] - # The first two elements are the cutoff and the number of OTUs. skip, and read to - # first comma for OTUnames - OTUnames = scan(text = rawlines, what = "character", comment.char = ",", quiet = TRUE)[3:as.integer(rawlines[2])] + # The first two elements are the cutoff and the number of OTUs. skip, and read to first comma for OTUnames + OTUnames = scan(text=rawlines, what="character", comment.char=",", quiet=TRUE)[3:as.integer(rawlines[2])] # split each element on commas - OTUs <- strsplit(rawlines[3:as.integer(rawlines[2])], ",", fixed = TRUE) - # Name each OTU (currently as the first seq name in each cluster), and return the - # list + OTUs <- strsplit(rawlines[3:as.integer(rawlines[2])], ",", fixed=TRUE) + # Name each OTU (currently as the first seq name in each cluster), and return the list names(OTUs) <- OTUnames # return as-is return(OTUs) } -################################################################################ Need to select a cutoff if none was provided by user. Take the largest -################################################################################ non-'unique' cutoff possible, if 'unique' is the only cutoff included in the -################################################################################ list file, use that. Multiple cutoffs are provided in both `.shared` and -################################################################################ `.list` files. This function consolidates the heuristic for selecting/checking -################################################################################ a specified cutoff. +################################################################################ +# Need to select a cutoff if none was provided by user. +# Take the largest non-"unique" cutoff possible, +# if "unique" is the only cutoff included in the list file, use that. +# Multiple cutoffs are provided in both `.shared` and `.list` files. +# This function consolidates the heuristic for selecting/checking a specified cutoff. #' @keywords internal -select_mothur_cutoff = function(cutoff, cutoffs) { - if (is.null(cutoff)) { +select_mothur_cutoff = function(cutoff, cutoffs){ + if( is.null(cutoff) ){ # cutoff was NULL, need to select one. - if (length(cutoffs) > 1) { - # Select the largest value, avoiding the 'unique' option. + if( length(cutoffs) > 1 ){ + # Select the largest value, avoiding the "unique" option. selectCutoffs <- as(cutoffs[cutoffs != "unique"], "numeric") cutoff <- as.character(max(selectCutoffs)) } else { - # There is only one cutoff value, so use it. Don't have to specify a cutoff, in - # this case + # There is only one cutoff value, so use it. + # Don't have to specify a cutoff, in this case cutoff <- cutoffs } } else { # Provided by user, non-null. Coerce to character for indexing cutoff <- as.character(cutoff) # Check that it is in set of available cutoffs. - if (!cutoff %in% cutoffs) { + if( !cutoff %in% cutoffs ){ stop("The cutoff value you provided is not among those available. Try show_mothur_cutoffs()") } } } -################################################################################ +################################################################################ #' Parse mothur group file into a simple hash table. #' #' The data.frame object #' returned by this function is not immediately useable by other \emph{phyloseq} #' functions, and must be first parsed in conjunction with a separate \emph{mothur} -#' \code{'list'} file. This function is made accessible to \emph{phyloseq} users +#' \code{"list"} file. This function is made accessible to \emph{phyloseq} users #' for troubleshooting and inspection, but the \code{link{import_mothur()}} function #' is suggested if the goal is to import the OTU clustering results from \emph{mothur} #' into \emph{phyloseq}. You will need both a group file and a list file for that end. @@ -1128,11 +1124,10 @@ select_mothur_cutoff = function(cutoff, cutoffs) { #' @seealso \code{\link{import_mothur}} #' @keywords internal #' -import_mothur_groups <- function(mothur_group_file) { - read.table(mothur_group_file, sep = "\t", as.is = TRUE, stringsAsFactors = FALSE, - colClasses = "character", row.names = 1) +import_mothur_groups <- function(mothur_group_file){ + read.table(mothur_group_file, sep="\t", as.is=TRUE, stringsAsFactors=FALSE, colClasses="character", row.names=1) } -################################################################################ +################################################################################ #' Import mothur list and group files and return an otu_table #' #' @usage import_mothur_otu_table(mothur_list_file, mothur_group_file, cutoff=NULL) @@ -1145,7 +1140,7 @@ import_mothur_groups <- function(mothur_group_file) { #' species/taxa abundance table (\code{otu_table}). See #' \code{http://www.mothur.org/wiki/Make.group} #' -#' @param cutoff A character string indicating the cutoff value, (or \code{'unique'}), +#' @param cutoff A character string indicating the cutoff value, (or \code{"unique"}), #' that matches one of the cutoff-values used to produce the OTU clustering #' results contained within the list-file created by \emph{mothur} (and specified #' by the \code{mothur_list_file} argument). @@ -1164,34 +1159,32 @@ import_mothur_groups <- function(mothur_group_file) { #' @keywords internal #' @importFrom plyr ldply #' @importFrom plyr ddply -import_mothur_otu_table <- function(mothur_list_file, mothur_group_file, cutoff = NULL) { - otulist <- import_mothur_otulist(mothur_list_file, cutoff) - mothur_groups <- import_mothur_groups(mothur_group_file) - # Initialize abundance matrix with zeros for sparse assignment +import_mothur_otu_table <- function(mothur_list_file, mothur_group_file, cutoff=NULL){ + otulist <- import_mothur_otulist(mothur_list_file, cutoff) + mothur_groups <- import_mothur_groups(mothur_group_file) + # Initialize abundance matrix with zeros for sparse assignment samplenames = unique(mothur_groups[, 1]) - mothur_otu_table <- matrix(0, nrow = length(otulist), ncol = length(samplenames)) - colnames(mothur_otu_table) <- samplenames - rownames(mothur_otu_table) <- names(otulist) - - # Write a sparse versino of the abundance table - df = ldply(otulist, function(x) { - data.frame(read = x, stringsAsFactors = FALSE) - }) - colnames(df)[1] <- "OTU" - df = data.frame(df, sample = mothur_groups[df[, "read"], 1], stringsAsFactors = FALSE) - adf = ddply(df, c("OTU", "sample"), function(x) { - # x = subset(df, OTU=='59_3_17' & sample=='C') - data.frame(x[1, c("OTU", "sample"), drop = FALSE], abundance = nrow(x)) - }) - - # Vectorized for speed using matrix indexing. See help('Extract') for details - # about matrix indexing. Diff than 2-vec index. - mothur_otu_table[as(adf[, c("OTU", "sample")], "matrix")] <- adf[, "abundance"] - - # Finally, return the otu_table as a phyloseq otu_table object. - return(otu_table(mothur_otu_table, taxa_are_rows = TRUE)) + mothur_otu_table <- matrix(0, nrow=length(otulist), ncol=length(samplenames)) + colnames(mothur_otu_table) <- samplenames + rownames(mothur_otu_table) <- names(otulist) + + # Write a sparse versino of the abundance table + df = ldply(otulist, function(x){data.frame(read=x, stringsAsFactors=FALSE)}) + colnames(df)[1] <- "OTU" + df = data.frame(df, sample=mothur_groups[df[, "read"], 1], stringsAsFactors=FALSE) + adf = ddply(df, c("OTU", "sample"), function(x){ + # x = subset(df, OTU=="59_3_17" & sample=="C") + data.frame(x[1, c("OTU", "sample"), drop=FALSE], abundance=nrow(x)) + }) + + # Vectorized for speed using matrix indexing. + # See help("Extract") for details about matrix indexing. Diff than 2-vec index. + mothur_otu_table[as(adf[, c("OTU", "sample")], "matrix")] <- adf[, "abundance"] + + # Finally, return the otu_table as a phyloseq otu_table object. + return(otu_table(mothur_otu_table, taxa_are_rows=TRUE)) } -################################################################################ +################################################################################ #' Import mothur shared file and return an otu_table #' #' @param mothur_shared_file (Required). A @@ -1202,19 +1195,18 @@ import_mothur_otu_table <- function(mothur_list_file, mothur_group_file, cutoff #' #' @seealso \code{\link{import_mothur}} #' @keywords internal -import_mothur_shared = function(mothur_shared_file, cutoff = NULL) { - # mothur_shared_file = '~/github/phyloseq/inst/extdata/esophagus.fn.shared.gz' +import_mothur_shared = function(mothur_shared_file, cutoff=NULL){ + #mothur_shared_file = "~/github/phyloseq/inst/extdata/esophagus.fn.shared.gz" # Check that cutoff is in cutoffs, or select a cutoff if none given. cutoffs = show_mothur_cutoffs(mothur_shared_file) cutoffs = cutoffs[!cutoffs %in% "label"] cutoff = select_mothur_cutoff(cutoff, cutoffs) x = readLines(mothur_shared_file) - rawtab = read.table(text = x[grep(paste0("^", cutoff), x)], header = FALSE, row.names = 2, - stringsAsFactors = FALSE)[, -(1:2)] - colnames(rawtab) <- strsplit(x[1], "\t")[[1]][4:(ncol(rawtab) + 3)] - return(otu_table(t(as.matrix(rawtab)), taxa_are_rows = TRUE)) + rawtab = read.table(text=x[grep(paste0("^", cutoff), x)], header=FALSE, row.names=2, stringsAsFactors=FALSE)[, -(1:2)] + colnames(rawtab) <- strsplit(x[1], "\t")[[1]][4:(ncol(rawtab)+3)] + return(otu_table(t(as.matrix(rawtab)), taxa_are_rows=TRUE)) } -################################################################################ +################################################################################ #' Import mothur constaxonomy file and return a taxonomyTable #' #' @param mothur_constaxonomy_file (Required). A @@ -1237,20 +1229,19 @@ import_mothur_shared = function(mothur_shared_file, cutoff = NULL) { #' \code{\link{phyloseq}} #' #' @keywords internal -import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction = parse_taxonomy_default) { +import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction=parse_taxonomy_default){ read.table(mothur_constaxonomy_file) - rawtab = read.table(mothur_constaxonomy_file, header = TRUE, row.names = 1, stringsAsFactors = FALSE)[, - "Taxonomy", drop = FALSE] - if (identical(parseFunction, parse_taxonomy_default)) { - # Proceed with default parsing stuff. Remove the confidence strings inside the - # parentheses, if present + rawtab = read.table(mothur_constaxonomy_file, header=TRUE, row.names=1, stringsAsFactors=FALSE)[, "Taxonomy", drop=FALSE] + if( identical(parseFunction, parse_taxonomy_default) ){ + # Proceed with default parsing stuff. + # Remove the confidence strings inside the parentheses, if present rawtab[, "Taxonomy"] = gsub("\\([[:digit:]]+\\)", "", rawtab[, "Taxonomy"]) # Remove the quotation marks, if present rawtab[, "Taxonomy"] = gsub("\"", "", rawtab[, "Taxonomy"]) # Remove trailing semicolon rawtab[, "Taxonomy"] = gsub(";$", "", rawtab[, "Taxonomy"]) # Split on semicolon - taxlist = strsplit(rawtab[, "Taxonomy"], ";", fixed = TRUE) + taxlist = strsplit(rawtab[, "Taxonomy"], ";", fixed=TRUE) taxlist = lapply(taxlist, parseFunction) } else { taxlist = lapply(rawtab[, "Taxonomy"], parseFunction) @@ -1258,7 +1249,7 @@ import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction = names(taxlist) <- rownames(rawtab) return(build_tax_table(taxlist)) } -################################################################################ +################################################################################ #' General function for importing mothur data files into phyloseq. #' #' Technically all parameters are optional, @@ -1286,9 +1277,9 @@ import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction = #' @param mothur_tree_file (Optional). #' A tree file, presumably produced by \emph{mothur}, #' and readable by \code{\link{read_tree}}. -#' The file probably has extension \code{'.tree'}. +#' The file probably has extension \code{".tree"}. #' -#' @param cutoff (Optional). A character string indicating the cutoff value, (or \code{'unique'}), +#' @param cutoff (Optional). A character string indicating the cutoff value, (or \code{"unique"}), #' that matches one of the cutoff-values used to produce the OTU clustering #' results contained within the list-file created by \emph{mothur} (and specified #' by the \code{mothur_list_file} argument). The default @@ -1350,53 +1341,54 @@ import_mothur_constaxonomy = function(mothur_constaxonomy_file, parseFunction = #' @examples #' # # The following example assumes you have downloaded the esophagus example #' # # dataset from the mothur wiki: -#' # # 'http://www.mothur.org/wiki/Esophageal_community_analysis' -#' # # 'http://www.mothur.org/w/images/5/55/Esophagus.zip' +#' # # "http://www.mothur.org/wiki/Esophageal_community_analysis" +#' # # "http://www.mothur.org/w/images/5/55/Esophagus.zip" #' # # The path on your machine may (probably will) vary -#' # mothur_list_file <- '~/Downloads/mothur/Esophagus/esophagus.an.list' -#' # mothur_group_file <- '~/Downloads/mothur/Esophagus/esophagus.good.groups' -#' # mothur_tree_file <- '~/Downloads/mothur/Esophagus/esophagus.tree' +#' # mothur_list_file <- "~/Downloads/mothur/Esophagus/esophagus.an.list" +#' # mothur_group_file <- "~/Downloads/mothur/Esophagus/esophagus.good.groups" +#' # mothur_tree_file <- "~/Downloads/mothur/Esophagus/esophagus.tree" #' # # # Actual examples follow: #' # show_mothur_cutoffs(mothur_list_file) #' # test1 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file) -#' # test2 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file, cutoff='0.02') +#' # test2 <- import_mothur(mothur_list_file, mothur_group_file, mothur_tree_file, cutoff="0.02") #' # # Returns just a tree #' # import_mothur(mothur_list_file, mothur_tree_file=mothur_tree_file) #' # # Returns just an otu_table #' # import_mothur(mothur_list_file, mothur_group_file=mothur_group_file) #' # # Returns an error #' # import_mothur(mothur_list_file) -#' # # Should return an 'OMG, you must provide the list file' error +#' # # Should return an "OMG, you must provide the list file" error #' # import_mothur() -import_mothur <- function(mothur_list_file = NULL, mothur_group_file = NULL, mothur_tree_file = NULL, - cutoff = NULL, mothur_shared_file = NULL, mothur_constaxonomy_file = NULL, parseFunction = parse_taxonomy_default) { - +import_mothur <- function(mothur_list_file=NULL, mothur_group_file=NULL, + mothur_tree_file=NULL, cutoff=NULL, + mothur_shared_file=NULL, mothur_constaxonomy_file=NULL, parseFunction=parse_taxonomy_default){ + pslist = vector("list") - if (!is.null(mothur_group_file) & !is.null(mothur_list_file)) { - # If list & group files provided, you can make an OTU table. - groupOTU = import_mothur_otu_table(mothur_list_file, mothur_group_file, cutoff) - pslist = c(pslist, list(groupOTU)) - } - - if (!is.null(mothur_tree_file)) { - tree <- read_tree(mothur_tree_file) - pslist = c(pslist, list(tree)) - } + if( !is.null(mothur_group_file) & !is.null(mothur_list_file) ){ + # If list & group files provided, you can make an OTU table. + groupOTU = import_mothur_otu_table(mothur_list_file, mothur_group_file, cutoff) + pslist = c(pslist, list(groupOTU)) + } + + if( !is.null(mothur_tree_file) ){ + tree <- read_tree(mothur_tree_file) + pslist = c(pslist, list(tree)) + } - if (!is.null(mothur_shared_file)) { + if( !is.null(mothur_shared_file) ){ OTUshared <- import_mothur_shared(mothur_shared_file) pslist = c(pslist, list(OTUshared)) } - if (!is.null(mothur_constaxonomy_file)) { + if( !is.null(mothur_constaxonomy_file) ){ tax <- import_mothur_constaxonomy(mothur_constaxonomy_file, parseFunction) pslist = c(pslist, list(tax)) - } + } return(do.call("phyloseq", pslist)) } -################################################################################ +################################################################################ #' Import mothur-formatted distance file #' #' The mothur application will produce a file containing the pairwise distances @@ -1416,43 +1408,37 @@ import_mothur <- function(mothur_list_file = NULL, mothur_group_file = NULL, mot #' #' @examples #' # # Take a look at the dataset shown here as an example: -#' # # 'http://www.mothur.org/wiki/Esophageal_community_analysis' -#' # # find the file ending with extension '.dist', download to your system +#' # # "http://www.mothur.org/wiki/Esophageal_community_analysis" +#' # # find the file ending with extension ".dist", download to your system #' # # The location of your file may vary -#' # mothur_dist_file <- '~/Downloads/mothur/Esophagus/esophagus.dist' +#' # mothur_dist_file <- "~/Downloads/mothur/Esophagus/esophagus.dist" #' # myNewDistObject <- import_mothur_dist(mothur_dist_file) -import_mothur_dist <- function(mothur_dist_file) { - # Read the raw distance matrix file produced by mothur: - raw_dist_lines <- readLines(mothur_dist_file) - - # split each line on white space, and begin modifying into dist-matrix format - dist_char <- strsplit(raw_dist_lines, "[[:space:]]+") - dist_char <- dist_char[-1] - # add name to each list element - names(dist_char) <- sapply(dist_char, function(i) { - i[1] - }) - # pop out the names from each vector - dist_char <- sapply(dist_char, function(i) { - i[-1] - }) - # convert to numeric vectors - dist_char <- sapply(dist_char, function(i) { - as(i, "numeric") - }) - - # Initialize and fill the matrix - distm <- matrix(0, nrow = length(dist_char), ncol = length(dist_char)) - rownames(distm) <- names(dist_char) - colnames(distm) <- names(dist_char) - for (i in names(dist_char)[-1]) { - distm[i, 1:length(dist_char[[i]])] <- dist_char[[i]] - } - diag(distm) <- 1 - distd <- as.dist(distm) - return(distd) +import_mothur_dist <- function(mothur_dist_file){ + # Read the raw distance matrix file produced by mothur: + raw_dist_lines <- readLines(mothur_dist_file) + + # split each line on white space, and begin modifying into dist-matrix format + dist_char <- strsplit(raw_dist_lines, "[[:space:]]+") + dist_char <- dist_char[-1] + # add name to each list element + names(dist_char) <- sapply(dist_char, function(i){i[1]}) + # pop out the names from each vector + dist_char <- sapply(dist_char, function(i){i[-1]}) + # convert to numeric vectors + dist_char <- sapply(dist_char, function(i){as(i, "numeric")}) + + # Initialize and fill the matrix + distm <- matrix(0, nrow=length(dist_char), ncol=length(dist_char)) + rownames(distm) <- names(dist_char); colnames(distm) <- names(dist_char) + for( i in names(dist_char)[-1] ){ + distm[i, 1:length(dist_char[[i]])] <- dist_char[[i]] + } + diag(distm) <- 1 + distd <- as.dist(distm) + return(distd) } -################################################################################ +################################################################################ +################################################################################ #' Export a distance object as \code{.names} and \code{.dist} files for mothur #' #' The purpose of this function is to allow a user to easily export a distance object @@ -1462,7 +1448,7 @@ import_mothur_dist <- function(mothur_dist_file) { #' #' @usage export_mothur_dist(x, out=NULL, makeTrivialNamesFile=NULL) #' -#' @param x (Required). A \code{'dist'} object, or a symmetric matrix. +#' @param x (Required). A \code{"dist"} object, or a symmetric matrix. #' #' @param out (Optional). The desired output filename for the \code{.dist} file, OR #' left \code{NULL}, the default, in which case the mothur-formated distance table @@ -1485,54 +1471,47 @@ import_mothur_dist <- function(mothur_dist_file) { #' data(esophagus) #' myDistObject <- as.dist(ape::cophenetic.phylo(phy_tree(esophagus))) #' export_mothur_dist(myDistObject) -export_mothur_dist <- function(x, out = NULL, makeTrivialNamesFile = NULL) { - if (class(x) == "matrix") { - x <- as.dist(x) - } - if (class(x) != "dist") { - stop("x must be a dist object, or symm matrix") - } - - # While x is a dist-object, get the length of unique pairs to initialize the dist - # table. - distdf <- matrix("", nrow = length(x), ncol = 3) - - # Now convert x to matrix for looping, indexing. - x <- as(x, "matrix") - colnames(distdf) <- c("i", "j", "d") - # distdf row counter - z <- 1 - - # The big loop. i <- 2 - for (i in 2:nrow(x)) { - thisvec <- x[i, 1:(i - 1)] - for (j in 1:length(thisvec)) { - # j <- 1 - distdf[z, "i"] <- rownames(x)[i] - distdf[z, "j"] <- colnames(x)[j] - distdf[z, "d"] <- thisvec[j] - z <- z + 1 - } - } - - # mothur requires a .names file, in case you removed identical sequences from - # within mothur and need to keep track and add them back. - if (!is.null(makeTrivialNamesFile)) { - namestab <- matrix(rownames(x), nrow = length(rownames(x)), ncol = 2) - write.table(namestab, file = makeTrivialNamesFile, quote = FALSE, sep = "\t", - row.names = FALSE, col.names = FALSE) - } - - # If is.null(out)==TRUE, then return two-column table. If it's a character, - # write.table-it - if (is.null(out)) { - return(distdf) - } else { - write.table(distdf, file = out, quote = FALSE, sep = "\t", row.names = FALSE, - col.names = FALSE) - } +export_mothur_dist <- function(x, out=NULL, makeTrivialNamesFile=NULL){ + if( class(x)== "matrix" ){ x <- as.dist(x) } + if( class(x)!= "dist" ){ stop("x must be a dist object, or symm matrix") } + + # While x is a dist-object, get the length of unique pairs + # to initialize the dist table. + distdf <- matrix("", nrow=length(x), ncol=3) + + # Now convert x to matrix for looping, indexing. + x <- as(x, "matrix") + colnames(distdf) <- c("i", "j", "d") + # distdf row counter + z <- 1 + + # The big loop. + for( i in 2:nrow(x) ){ # i <- 2 + thisvec <- x[i, 1:(i-1)] + for( j in 1:length(thisvec) ){ # j <- 1 + distdf[z, "i"] <- rownames(x)[i] + distdf[z, "j"] <- colnames(x)[j] + distdf[z, "d"] <- thisvec[j] + z <- z + 1 + } + } + + # mothur requires a .names file, in case you removed identical sequences + # from within mothur and need to keep track and add them back. + if( !is.null(makeTrivialNamesFile) ){ + namestab <- matrix(rownames(x), nrow=length(rownames(x)), ncol=2) + write.table(namestab, file=makeTrivialNamesFile, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + } + + # If is.null(out)==TRUE, then return two-column table. + # If it's a character, write.table-it + if( is.null(out) ){ + return(distdf) + } else { + write.table(distdf, file=out, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + } } -################################################################################ +################################################################################ #' Export environment (ENV) file for UniFrac Server. #' #' Creates the environment table that is needed for the original UniFrac @@ -1546,7 +1525,7 @@ export_mothur_dist <- function(x, out = NULL, makeTrivialNamesFile = NULL) { #' #' @param file (Optional). The file path for export. If not-provided, the #' expectation is that you will want to set \code{return} to \code{TRUE}, -#' and manipulate the ENV table on your own. Default is \code{''}, skipping +#' and manipulate the ENV table on your own. Default is \code{""}, skipping #' the ENV file from being written to a file. #' #' @param writeTree (Optional). Write the phylogenetic tree as well as the @@ -1561,54 +1540,59 @@ export_mothur_dist <- function(x, out = NULL, makeTrivialNamesFile = NULL) { #' @examples #' # # Load example data #' # data(esophagus) -#' # export_env_file(esophagus, '~/Desktop/esophagus.txt') -export_env_file <- function(physeq, file = "", writeTree = TRUE, return = FALSE) { - # data(esophagus) physeq <- esophagus - - # Create otu_table matrix and force orientation - OTU <- as(otu_table(physeq), "matrix") - if (!taxa_are_rows(physeq)) { - OTU <- t(OTU) - } - - # initialize sequence/sample names - seqs <- taxa_names(physeq) - samples <- sample_names(physeq) - - # initialize output table as matrix - ENV <- matrix("", nrow = sum(OTU >= 1), ncol = 3) - - # i counts the row of the output , ENV - i = 1 - while (i < nrow(ENV)) { - for (j in seqs) { - for (k in which(OTU[j, ] > 0)) { - ENV[i, 1] <- j - ENV[i, 2] <- samples[k] - ENV[i, 3] <- OTU[j, k] - i <- i + 1 - } - } - } - # If a file path is provided, write the table to that file - if (file != "") { - write.table(ENV, file = file, quote = FALSE, sep = "\t", row.names = FALSE, - col.names = FALSE) - } - - # If needed, also write the associated tree-file. - if (writeTree) { - fileTree <- paste(file, ".nex", sep = "") - write.nexus(phy_tree(physeq), file = fileTree, original.data = FALSE) - } - - # If return argument is TRUE, return the environment table - if (return) { - return(ENV) - } +#' # export_env_file(esophagus, "~/Desktop/esophagus.txt") +export_env_file <- function(physeq, file="", writeTree=TRUE, return=FALSE){ + # data(esophagus) + # physeq <- esophagus + + # Create otu_table matrix and force orientation + OTU <- as(otu_table(physeq), "matrix") + if( !taxa_are_rows(physeq) ){ OTU <- t(OTU) } + + # initialize sequence/sample names + seqs <- taxa_names(physeq) + samples <- sample_names(physeq) + + # initialize output table as matrix + ENV <- matrix("", nrow=sum(OTU >= 1), ncol=3) + + # i counts the row of the output , ENV + i=1 + while( i < nrow(ENV) ){ + for( j in seqs){ + for( k in which(OTU[j, ]>0) ){ + ENV[i, 1] <- j + ENV[i, 2] <- samples[k] + ENV[i, 3] <- OTU[j, k] + i <- i + 1 + } + } + } + # If a file path is provided, write the table to that file + if(file != ""){ + write.table(ENV, file=file, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + } + + # If needed, also write the associated tree-file. + if( writeTree ){ + fileTree <- paste(file, ".nex", sep="") + write.nexus(phy_tree(physeq), file=fileTree, original.data=FALSE) + } + + # If return argument is TRUE, return the environment table + if(return){ return(ENV) } } -################################################################################ UniFrac ENV files have the form: SEQ1 ENV1 1 SEQ1 ENV2 2 SEQ2 ENV1 15 SEQ3 ENV1 -################################################################################ 2 SEQ4 ENV2 8 SEQ5 ENV1 4 http://128.138.212.43/unifrac/help.psp#env_file +################################################################################ +# UniFrac ENV files have the form: +# +# SEQ1 ENV1 1 +# SEQ1 ENV2 2 +# SEQ2 ENV1 15 +# SEQ3 ENV1 2 +# SEQ4 ENV2 8 +# SEQ5 ENV1 4 +# http://128.138.212.43/unifrac/help.psp#env_file +################################################################################ #' Import phyloseq data from biom-format file #' #' New versions of QIIME produce a more-comprehensive and formally-defined @@ -1768,95 +1752,104 @@ export_env_file <- function(physeq, file = "", writeTree = TRUE, return = FALSE) #' @export #' @examples #' # An included example of a rich dense biom file -#' rich_dense_biom <- system.file('extdata', 'rich_dense_otu_table.biom', package='phyloseq') +#' rich_dense_biom <- system.file("extdata", "rich_dense_otu_table.biom", package="phyloseq") #' import_biom(rich_dense_biom, parseFunction=parse_taxonomy_greengenes) #' # An included example of a sparse dense biom file -#' rich_sparse_biom <- system.file('extdata', 'rich_sparse_otu_table.biom', package='phyloseq') +#' rich_sparse_biom <- system.file("extdata", "rich_sparse_otu_table.biom", package="phyloseq") #' import_biom(rich_sparse_biom, parseFunction=parse_taxonomy_greengenes) #' # # # Example code for importing large file with parallel backend -#' # library('doParallel') +#' # library("doParallel") #' # registerDoParallel(cores=6) -#' # import_biom('my/file/path/file.biom', parseFunction=parse_taxonomy_greengenes, parallel=TRUE) -import_biom <- function(BIOMfilename, treefilename = NULL, refseqfilename = NULL, - refseqFunction = readDNAStringSet, refseqArgs = NULL, parseFunction = parse_taxonomy_default, - parallel = FALSE, version = 1, ...) { - - # initialize the argument-list for phyloseq. Start empty. - argumentlist <- list() - - # Read the data - x = read_biom(biom_file = BIOMfilename) - - ######################################## OTU table: - otutab = otu_table(as(biom_data(x), "matrix"), taxa_are_rows = TRUE) - argumentlist <- c(argumentlist, list(otutab)) - - ######################################## Taxonomy Table Need to check if taxonomy information is empty (minimal BIOM - ######################################## file) - if (all(sapply(sapply(x$rows, function(i) { - i$metadata - }), is.null))) { - taxtab <- NULL +#' # import_biom("my/file/path/file.biom", parseFunction=parse_taxonomy_greengenes, parallel=TRUE) +import_biom <- function(BIOMfilename, + treefilename=NULL, refseqfilename=NULL, refseqFunction=readDNAStringSet, refseqArgs=NULL, + parseFunction=parse_taxonomy_default, parallel=FALSE, version=1.0, ...){ + + # initialize the argument-list for phyloseq. Start empty. + argumentlist <- list() + + # Read the data + x = read_biom(biom_file=BIOMfilename) + + ######################################## + # OTU table: + ######################################## + otutab = otu_table(as(biom_data(x), "matrix"), taxa_are_rows=TRUE) + argumentlist <- c(argumentlist, list(otutab)) + + ######################################## + # Taxonomy Table + ######################################## + # Need to check if taxonomy information is empty (minimal BIOM file) + if( all( sapply(sapply(x$rows, function(i){i$metadata}), is.null) ) ){ + taxtab <- NULL } else { # parse once each character vector, save as a list - taxlist = lapply(x$rows, function(i) { + taxlist = lapply(x$rows, function(i){ parseFunction(i$metadata$taxonomy) }) - names(taxlist) = sapply(x$rows, function(i) { - i$id - }) + names(taxlist) = sapply(x$rows, function(i){i$id}) taxtab = build_tax_table(taxlist) - } - argumentlist <- c(argumentlist, list(taxtab)) - - ######################################## Sample Data ('columns' in QIIME/BIOM) If there is no metadata (all NULL), then - ######################################## set sam_data <- NULL - if (is.null(sample_metadata(x))) { - samdata <- NULL - } else { - samdata = sample_data(sample_metadata(x)) - } - argumentlist <- c(argumentlist, list(samdata)) - - ######################################## Tree data - if (!is.null(treefilename)) { - if (inherits(treefilename, "phylo")) { - # If argument is already a tree, don't read, just assign. - tree = treefilename - } else { - # NULL is silently returned if tree is not read properly. - tree <- read_tree(treefilename, ...) - } - # Add to argument list or warn - if (is.null(tree)) { - warning("treefilename failed import. It not included.") - } else { - argumentlist <- c(argumentlist, list(tree)) - } - } - - ######################################## Reference Sequence data - if (!is.null(refseqfilename)) { - if (inherits(refseqfilename, "XStringSet")) { - # If argument is already a XStringSet, don't read, just assign. - refseq = refseqfilename - } else { - # call refseqFunction and read refseqfilename, either with or without additional - # args - if (!is.null(refseqArgs)) { - refseq = do.call("refseqFunction", c(list(refseqfilename), refseqArgs)) - } else { - refseq = refseqFunction(refseqfilename) - } - } - argumentlist <- c(argumentlist, list(refseq)) - } - - ######################################## Put together into a phyloseq object - return(do.call("phyloseq", argumentlist)) - + } + argumentlist <- c(argumentlist, list(taxtab)) + + ######################################## + # Sample Data ("columns" in QIIME/BIOM) + ######################################## + # If there is no metadata (all NULL), then set sam_data <- NULL + if( is.null(sample_metadata(x)) ){ + samdata <- NULL + } else { + samdata = sample_data(sample_metadata(x)) + } + argumentlist <- c(argumentlist, list(samdata)) + + ######################################## + # Tree data + ######################################## + if( !is.null(treefilename) ){ + if( inherits(treefilename, "phylo") ){ + # If argument is already a tree, don't read, just assign. + tree = treefilename + } else { + # NULL is silently returned if tree is not read properly. + tree <- read_tree(treefilename, ...) + } + # Add to argument list or warn + if( is.null(tree) ){ + warning("treefilename failed import. It not included.") + } else { + argumentlist <- c(argumentlist, list(tree) ) + } + } + + ######################################## + # Reference Sequence data + ######################################## + if( !is.null(refseqfilename) ){ + if( inherits(refseqfilename, "XStringSet") ){ + # If argument is already a XStringSet, don't read, just assign. + refseq = refseqfilename + } else { + # call refseqFunction and read refseqfilename, either with or without additional args + if( !is.null(refseqArgs) ){ + refseq = do.call("refseqFunction", c(list(refseqfilename), refseqArgs)) + } else { + refseq = refseqFunction(refseqfilename) + } + } + argumentlist <- c(argumentlist, list(refseq) ) + } + + ######################################## + # Put together into a phyloseq object + ######################################## + return( do.call("phyloseq", argumentlist) ) + } -################################################################################ Need to export these parsing functions as examples... +################################################################################ +# Need to export these parsing functions as examples... +################################################################################ #' Parse elements of a taxonomy vector #' #' These are provided as both example and default functions for @@ -1881,7 +1874,7 @@ import_biom <- function(BIOMfilename, treefilename = NULL, refseqfilename = NULL #' \code{parse_taxonomy_greengenes} function clips the first 3 characters that #' identify the rank, and uses these to name the corresponding element according #' to the appropriate taxonomic rank name used by greengenes -#' (e.g. \code{'p__'} at the beginning of an element means that element is +#' (e.g. \code{"p__"} at the beginning of an element means that element is #' the name of the phylum to which this OTU belongs). #' Most importantly, the expectations for these functions described above #' make them compatible to use during data import, @@ -1898,8 +1891,8 @@ import_biom <- function(BIOMfilename, treefilename = NULL, refseqfilename = NULL #' #' @return A character vector in which each element is a different #' taxonomic rank of the same OTU, and each element name is the name of -#' the rank level. For example, an element might be \code{'Firmicutes'} -#' and named \code{'phylum'}. +#' the rank level. For example, an element might be \code{"Firmicutes"} +#' and named \code{"phylum"}. #' These parsed, named versions of the taxonomic vector should #' reflect embedded information, naming conventions, #' desired length limits, etc; or in the case of \code{\link{parse_taxonomy_default}}, @@ -1913,58 +1906,60 @@ import_biom <- function(BIOMfilename, treefilename = NULL, refseqfilename = NULL #' \code{\link{import_qiime}} #' #' @examples -#' taxvec1 = c('Root', 'k__Bacteria', 'p__Firmicutes', 'c__Bacilli', 'o__Bacillales', 'f__Staphylococcaceae') +#' taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae") #' parse_taxonomy_default(taxvec1) #' parse_taxonomy_greengenes(taxvec1) -#' taxvec2 = c('Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae') +#' taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae") #' parse_taxonomy_qiime(taxvec2) -parse_taxonomy_default = function(char.vec) { - # Remove any leading empty space - char.vec = gsub("^[[:space:]]{1,}", "", char.vec) - # Remove any trailing space - char.vec = gsub("[[:space:]]{1,}$", "", char.vec) - if (length(char.vec) > 0) { - # Add dummy element (rank) name - names(char.vec) = paste("Rank", 1:length(char.vec), sep = "") - } else { - warning("Empty taxonomy vector encountered.") - } - return(char.vec) +parse_taxonomy_default = function(char.vec){ + # Remove any leading empty space + char.vec = gsub("^[[:space:]]{1,}", "", char.vec) + # Remove any trailing space + char.vec = gsub("[[:space:]]{1,}$", "", char.vec) + if( length(char.vec) > 0 ){ + # Add dummy element (rank) name + names(char.vec) = paste("Rank", 1:length(char.vec), sep="") + } else { + warning("Empty taxonomy vector encountered.") + } + return(char.vec) } #' @rdname parseTaxonomy-functions #' @aliases parse_taxonomy_default #' @export -parse_taxonomy_greengenes <- function(char.vec) { - # Use default to assign names to elements in case problem with greengenes prefix - char.vec = parse_taxonomy_default(char.vec) - # Define the meaning of each prefix according to GreenGenes taxonomy - Tranks = c(k = "Kingdom", p = "Phylum", c = "Class", o = "Order", f = "Family", - g = "Genus", s = "Species") - # Check for prefix using regexp, warn if there were none. trim indices, ti - ti = grep("[[:alpha:]]{1}\\_\\_", char.vec) - if (length(ti) == 0L) { - warning("No greengenes prefixes were found. \n", "Consider using parse_taxonomy_default() instead if true for all OTUs. \n", - "Dummy ranks may be included among taxonomic ranks now.") - # Will want to return without further modifying char.vec - taxvec = char.vec - # Replace names of taxvec according to prefix, if any present... - } else { - # Remove prefix using sub-'' regexp, call result taxvec - taxvec = gsub("[[:alpha:]]{1}\\_\\_", "", char.vec) - # Define the ranks that will be replaced - repranks = Tranks[substr(char.vec[ti], 1, 1)] - # Replace, being sure to avoid prefixes not present in Tranks - names(taxvec)[ti[!is.na(repranks)]] = repranks[!is.na(repranks)] - } - return(taxvec) +parse_taxonomy_greengenes <- function(char.vec){ + # Use default to assign names to elements in case problem with greengenes prefix + char.vec = parse_taxonomy_default(char.vec) + # Define the meaning of each prefix according to GreenGenes taxonomy + Tranks = c(k="Kingdom", p="Phylum", c="Class", o="Order", f="Family", g="Genus", s="Species") + # Check for prefix using regexp, warn if there were none. trim indices, ti + ti = grep("[[:alpha:]]{1}\\_\\_", char.vec) + if( length(ti) == 0L ){ + warning( + "No greengenes prefixes were found. \n", + "Consider using parse_taxonomy_default() instead if true for all OTUs. \n", + "Dummy ranks may be included among taxonomic ranks now." + ) + # Will want to return without further modifying char.vec + taxvec = char.vec + # Replace names of taxvec according to prefix, if any present... + } else { + # Remove prefix using sub-"" regexp, call result taxvec + taxvec = gsub("[[:alpha:]]{1}\\_\\_", "", char.vec) + # Define the ranks that will be replaced + repranks = Tranks[substr(char.vec[ti], 1, 1)] + # Replace, being sure to avoid prefixes not present in Tranks + names(taxvec)[ti[!is.na(repranks)]] = repranks[!is.na(repranks)] + } + return(taxvec) } #' @rdname parseTaxonomy-functions #' @aliases parse_taxonomy_default #' @export -parse_taxonomy_qiime <- function(char.vec) { - parse_taxonomy_greengenes(strsplit(char.vec, ";", TRUE)[[1]]) +parse_taxonomy_qiime <- function(char.vec){ + parse_taxonomy_greengenes(strsplit(char.vec, ";", TRUE)[[1]]) } -################################################################################ +################################################################################ #' Build a \code{\link{tax_table}} from a named possibly-jagged list #' #' @param taxlist (Required). A list in which each element is a vector of @@ -1988,38 +1983,39 @@ parse_taxonomy_qiime <- function(char.vec) { #' @export #' #' @examples -#' taxvec1 = c('Root', 'k__Bacteria', 'p__Firmicutes', 'c__Bacilli', 'o__Bacillales', 'f__Staphylococcaceae') +#' taxvec1 = c("Root", "k__Bacteria", "p__Firmicutes", "c__Bacilli", "o__Bacillales", "f__Staphylococcaceae") #' parse_taxonomy_default(taxvec1) #' parse_taxonomy_greengenes(taxvec1) -#' taxvec2 = c('Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae') +#' taxvec2 = c("Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Staphylococcaceae") #' parse_taxonomy_qiime(taxvec2) #' taxlist1 = list(OTU1=parse_taxonomy_greengenes(taxvec1), OTU2=parse_taxonomy_qiime(taxvec2)) #' taxlist2 = list(OTU1=parse_taxonomy_default(taxvec1), OTU2=parse_taxonomy_qiime(taxvec2)) #' build_tax_table(taxlist1) #' build_tax_table(taxlist2) -build_tax_table = function(taxlist) { - # Determine column headers (rank names) of taxonomy table - columns = unique(unlist(lapply(taxlist, names))) - # Initialize taxonomic character matrix - taxmat <- matrix(NA_character_, nrow = length(taxlist), ncol = length(columns)) - colnames(taxmat) = columns - # Fill in the matrix by row. - for (i in 1:length(taxlist)) { - # Protect against empty taxonomy - if (length(taxlist[[i]]) > 0) { - # The extra column name check solves issues with raggedness, and disorder. - taxmat[i, names(taxlist[[i]])] <- taxlist[[i]] - } - } - # Convert functionally empty elements, '', to NA - taxmat[taxmat == ""] <- NA_character_ - # Now coerce to matrix, name the rows as 'id' (the taxa name), coerce to - # taxonomyTable - taxmat <- as(taxmat, "matrix") - rownames(taxmat) = names(taxlist) - return(tax_table(taxmat)) +build_tax_table = function(taxlist){ + # Determine column headers (rank names) of taxonomy table + columns = unique(unlist(lapply(taxlist, names))) + # Initialize taxonomic character matrix + taxmat <- matrix(NA_character_, nrow=length(taxlist), ncol=length(columns)) + colnames(taxmat) = columns + # Fill in the matrix by row. + for( i in 1:length(taxlist) ){ + # Protect against empty taxonomy + if( length(taxlist[[i]]) > 0 ){ + # The extra column name check solves issues with raggedness, and disorder. + taxmat[i, names(taxlist[[i]])] <- taxlist[[i]] + } + } + # Convert functionally empty elements, "", to NA + taxmat[taxmat==""] <- NA_character_ + # Now coerce to matrix, name the rows as "id" (the taxa name), coerce to taxonomyTable + taxmat <- as(taxmat, "matrix") + rownames(taxmat) = names(taxlist) + return( tax_table(taxmat) ) } -################################################################################ +################################################################################ +################################################################################ +################################################################################ #' Download and import directly from microbio.me/qiime #' #' This function is for accessing microbiome datasets from the @@ -2052,8 +2048,8 @@ build_tax_table = function(taxlist) { #' this function will complete the remainder of the ftp URL hosted at #' \href{http://www.microbio.me/qiime/index.psp}{microbio.me/qiime}. #' For example, instead of the full URL string, -#' \code{'ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_494_split_library_seqs_and_mapping.zip'}, -#' you could simply provide \code{494} or \code{'494'} +#' \code{"ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_494_split_library_seqs_and_mapping.zip"}, +#' you could simply provide \code{494} or \code{"494"} #' as the first (`zipftp`) argument. #' #' @param ext (Optional). A \code{\link{character}} string of the expected @@ -2102,12 +2098,12 @@ build_tax_table = function(taxlist) { #' # This should return TRUE on your system if you have internet turned on #' # and a standard R installation. Indicates whether this is likely to #' # work on your system for a URL or local file, respectively. -#' capabilities('http/ftp'); capabilities('fifo') +#' capabilities("http/ftp"); capabilities("fifo") #' # A working example with a local example file included in phyloseq -#' zipfile = 'study_816_split_library_seqs_and_mapping.zip' -#' zipfile = system.file('extdata', zipfile, package='phyloseq') -#' tarfile = 'study_816_split_library_seqs_and_mapping.tar.gz' -#' tarfile = system.file('extdata', tarfile, package='phyloseq') +#' zipfile = "study_816_split_library_seqs_and_mapping.zip" +#' zipfile = system.file("extdata", zipfile, package="phyloseq") +#' tarfile = "study_816_split_library_seqs_and_mapping.tar.gz" +#' tarfile = system.file("extdata", tarfile, package="phyloseq") #' tarps = microbio_me_qiime(tarfile) #' zipps = microbio_me_qiime(zipfile) #' identical(tarps, zipps) @@ -2115,108 +2111,109 @@ build_tax_table = function(taxlist) { #' plot_heatmap(tarps) #' # A real example #' # # Smokers dataset -#' # smokezip = 'ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_524_split_library_seqs_and_mapping.zip' +#' # smokezip = "ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_524_split_library_seqs_and_mapping.zip" #' # smokers1 = microbio_me_qiime(smokezip) #' # # Alternatively, just use the study number #' # smokers2 = microbio_me_qiime(524) #' # identical(smokers1, smokers2) -microbio_me_qiime = function(zipftp, ext = ".zip", parsef = parse_taxonomy_greengenes, - ...) { - # Define naming convention - front = "ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_" - if (!is.na(as.integer(zipftp))) { - # If study number instead of string, create the ftp URL using ext and convention - back = paste0("_split_library_seqs_and_mapping", ext) - zipftp = paste0(front, zipftp, back) - } else { - # Determine file extension from the file path itself - ext = substring(zipftp, regexpr("\\.([[:alnum:]]+)$", zipftp)[1]) - back = paste0("_split_library_seqs_and_mapping", ext) - } - # Check if zipftp is clearly an externally located file, ftp, http, etc. - externprefixes = c("http://", "https://", "ftp://") - prefix = regexpr("^([[:alnum:]]+)\\://", zipftp) - if (substr(zipftp, 1, attr(prefix, "match.length")[1]) %in% externprefixes) { - # If external, then create temporary file and download - zipfile = tempfile() - download.file(zipftp, zipfile, ...) - } else { - # Else it is a local zipfile - zipfile = zipftp - } - # Use the apparent file naming convention for microbio.me/qiime as the de facto - # guide for this API. In particular, the expectation o fthe study name (already - # used above) - studyname = gsub("\\_split\\_.+$", "", basename(zipftp)) - # The output of tempdir() is always the same in the same R session To avoid - # conflict with multiple microbio.me/qiime unpacks in the same session, pre-pend - # the study name and datestamp +microbio_me_qiime = function(zipftp, ext=".zip", parsef=parse_taxonomy_greengenes, ...){ + # Define naming convention + front = "ftp://thebeast.colorado.edu/pub/QIIME_DB_Public_Studies/study_" + if( !is.na(as.integer(zipftp)) ){ + # If study number instead of string, + # create the ftp URL using ext and convention + back = paste0("_split_library_seqs_and_mapping", ext) + zipftp = paste0(front, zipftp, back) + } else { + # Determine file extension from the file path itself + ext = substring(zipftp, regexpr("\\.([[:alnum:]]+)$", zipftp)[1]) + back = paste0("_split_library_seqs_and_mapping", ext) + } + # Check if zipftp is clearly an externally located file, ftp, http, etc. + externprefixes = c("http://", "https://", "ftp://") + prefix = regexpr("^([[:alnum:]]+)\\://", zipftp) + if( substr(zipftp, 1, attr(prefix, "match.length")[1]) %in% externprefixes ){ + # If external, then create temporary file and download + zipfile = tempfile() + download.file(zipftp, zipfile, ...) + } else { + # Else it is a local zipfile + zipfile = zipftp + } + # Use the apparent file naming convention for microbio.me/qiime + # as the de facto guide for this API. In particular, + # the expectation o fthe study name (already used above) + studyname = gsub("\\_split\\_.+$", "", basename(zipftp)) + # The output of tempdir() is always the same in the same R session + # To avoid conflict with multiple microbio.me/qiime unpacks + # in the same session, pre-pend the study name and datestamp unpackdir = paste0(studyname, "_", gsub("[[:blank:][:punct:]]", "", date())) # Add the temp path - unpackdir = file.path(tempdir(), unpackdir) + unpackdir = file.path(tempdir(), unpackdir) # Create the unpack directory if needed (most likely). - if (!file.exists(unpackdir)) { - dir.create(unpackdir) - } - # Unpack to the temporary directory using unzip or untar - if (ext == ".zip") { - unzip(zipfile, exdir = unpackdir, overwrite = TRUE) - } else if (ext %in% c("tar.gz", ".tgz", ".gz", ".gzip", ".bzip2", ".xz")) { - # untar the tarfile to the new temp dir - untar(zipfile, exdir = unpackdir) - } else { - # The compression format was not recognized. Provide informative error msg. - msg = paste("Could not determine the compression type.", "Expected extensions are (mostly):", - ".zip, .tgz, .tar.gz", sep = "\n") - stop(msg) - } - # Define a list of imported objects that might grow if the right file types are - # present and imported correctly. - imported_objects = vector("list") - # Search recursively in the unpacked directory for the .biom file and parse if it - # is. There should be only one. Throw warning if more than one, take the first. - biomfile = list.files(unpackdir, "\\.biom", full.names = TRUE, recursive = TRUE) - if (length(biomfile) > 1) { + if( !file.exists(unpackdir) ){dir.create(unpackdir)} + # Unpack to the temporary directory using unzip or untar + if( ext == ".zip" ){ + unzip(zipfile, exdir=unpackdir, overwrite=TRUE) + } else if( ext %in% c("tar.gz", ".tgz", ".gz", ".gzip", ".bzip2", ".xz") ){ + # untar the tarfile to the new temp dir + untar(zipfile, exdir=unpackdir) + } else { + # The compression format was not recognized. Provide informative error msg. + msg = paste("Could not determine the compression type.", + "Expected extensions are (mostly):", + ".zip, .tgz, .tar.gz", sep="\n") + stop(msg) + } + # Define a list of imported objects that might grow + # if the right file types are present and imported correctly. + imported_objects = vector("list") + # Search recursively in the unpacked directory for the .biom file + # and parse if it is. + # There should be only one. Throw warning if more than one, take the first. + biomfile = list.files(unpackdir, "\\.biom", full.names=TRUE, recursive=TRUE) + if( length(biomfile) > 1 ){ warning("more than one .biom file found in compressed archive. Importing first only.") biomfile = biomfile[1] - } else if (length(biomfile) == 1) { - cat("Found biom-format file, now parsing it... \n") - biom = import_biom(biomfile, parseFunction = parsef) - cat("Done parsing biom... \n") - imported_objects = c(imported_objects, list(biom)) - } - # Check if sample_data (qiime mapping) file present, and parse if it is. - sdfile = list.files(unpackdir, "\\_mapping\\_file\\.txt", full.names = TRUE, - recursive = TRUE) - if (length(sdfile) > 1) { - warning("more than one mapping file found in compressed archive. Importing first only.") - sdfile = sdfile[1] - } else if (length(sdfile) == 1) { - cat("Importing Sample Metdadata from mapping file...", fill = TRUE) - sample_metadata = import_qiime_sample_data(sdfile) - imported_objects = c(imported_objects, list(sample_metadata)) - } - # Check success, notify user, and return. - if (length(imported_objects) > 1) { - # If there are more than one imported objects, merge them and return - cat("Merging the imported objects... \n") - physeq = do.call("merge_phyloseq", imported_objects) - if (inherits(physeq, "phyloseq")) { - cat("Successfully merged, phyloseq-class created. \n Returning... \n") - } - return(physeq) - } else if (length(imported_objects) == 1) { - cat("Note: only on object in the zip file was imported. \n") - cat("It was ", class(imported_objects[[1]]), " class. \n") - return(imported_objects[[1]]) - } else { - cat("PLEASE NOTE: No objects were imported. \n", "You chould check the zip file, \n", - "as well as the naming conventions in the zipfile \n", "to make sure that they match microbio.me/qiime. \n", - "Instead returning NULL... \n") - return(NULL) - } + } else if( length(biomfile) == 1 ){ + cat("Found biom-format file, now parsing it... \n") + biom = import_biom(biomfile, parseFunction=parsef) + cat("Done parsing biom... \n") + imported_objects = c(imported_objects, list(biom)) + } + # Check if sample_data (qiime mapping) file present, and parse if it is. + sdfile = list.files(unpackdir, "\\_mapping\\_file\\.txt", full.names=TRUE, recursive=TRUE) + if( length(sdfile) > 1 ){ + warning("more than one mapping file found in compressed archive. Importing first only.") + sdfile = sdfile[1] + } else if( length(sdfile)==1 ){ + cat("Importing Sample Metdadata from mapping file...", fill=TRUE) + sample_metadata = import_qiime_sample_data(sdfile) + imported_objects = c(imported_objects, list(sample_metadata)) + } + # Check success, notify user, and return. + if( length(imported_objects) > 1 ){ + # If there are more than one imported objects, merge them and return + cat("Merging the imported objects... \n") + physeq = do.call("merge_phyloseq", imported_objects) + if( inherits(physeq, "phyloseq") ){ + cat("Successfully merged, phyloseq-class created. \n Returning... \n") + } + return(physeq) + } else if( length(imported_objects) == 1 ){ + cat("Note: only on object in the zip file was imported. \n") + cat("It was ", class(imported_objects[[1]]), " class. \n") + return(imported_objects[[1]]) + } else { + cat("PLEASE NOTE: No objects were imported. \n", + "You chould check the zip file, \n", + "as well as the naming conventions in the zipfile \n", + "to make sure that they match microbio.me/qiime. \n", + "Instead returning NULL... \n") + return(NULL) + } } -################################################################################ +################################################################################ #' Import usearch table format (\code{.uc}) to OTU table #' #' UPARSE is an algorithm for OTU-clustering implemented within usearch. @@ -2242,7 +2239,7 @@ microbio_me_qiime = function(zipftp, ext = ".zip", parsef = parse_taxonomy_green #' assumed that the 9th and 10th columns of the \code{.uc} table #' hold the read-label and OTU ID, respectively; #' and it is also assumed that the delimiter between sample-name and read -#' in the read-name entries is a single \code{'_'}. +#' in the read-name entries is a single \code{"_"}. #' #' @param ucfile (Required). A file location character string #' or \code{\link{connection}} @@ -2263,7 +2260,7 @@ microbio_me_qiime = function(zipftp, ext = ".zip", parsef = parse_taxonomy_green #' This should be the delimiter that separates the sample ID #' from the original ID in the demultiplexed read ID of your sequence file. #' The default is plain underscore, which in this \code{\link{regex}} context -#' is \code{'_'}. +#' is \code{"_"}. #' #' @param verbose (Optional). A \code{\link{logical}}. #' Default is \code{TRUE}. @@ -2280,34 +2277,36 @@ microbio_me_qiime = function(zipftp, ext = ".zip", parsef = parse_taxonomy_green #' \code{\link{import_qiime}} #' #' @examples -#' usearchfile <- system.file('extdata', 'usearch.uc', package='phyloseq') +#' usearchfile <- system.file("extdata", "usearch.uc", package="phyloseq") #' import_usearch_uc(usearchfile) -import_usearch_uc <- function(ucfile, colRead = 9, colOTU = 10, readDelimiter = "_", - verbose = TRUE) { - if (verbose) { - cat("Reading `ucfile` into memory and parsing into table \n") - } - # fread is one of the fastest and most-efficient importers for R. It creates a - # data.table object, suitable for large size objects - x = fread(ucfile, sep = "\t", header = FALSE, na.strings = c("*", "*", "NA", - "N/A", ""), select = c(colRead, colOTU), colClasses = "character", showProgress = TRUE) +import_usearch_uc <- function(ucfile, colRead=9, colOTU=10, + readDelimiter="_", verbose=TRUE){ + if(verbose){cat("Reading `ucfile` into memory and parsing into table \n")} + # fread is one of the fastest and most-efficient importers for R. + # It creates a data.table object, suitable for large size objects + x = fread(ucfile, sep="\t", header=FALSE, na.strings=c("*", '*', "NA","N/A",""), + select=c(colRead, colOTU), colClasses="character", showProgress=TRUE) setnames(x, c("read", "OTU")) NrawEntries = nrow(x) - if (verbose) { + if(verbose){ cat("Initially read", NrawEntries, "entries. \n") cat("... Now removing unassigned OTUs (* or NA)... \n") } x = x[!is.na(OTU), ] - if (verbose) { + if(verbose){ cat("Removed", NrawEntries - nrow(x), "entries that had no OTU assignment. \n") cat("A total of", nrow(x), "will be assigned to the OTU table.\n") } # Process sequence label to be sample label only - x[, `:=`(sample, gsub(paste0(readDelimiter, ".+$"), "", read))] + x[, sample:=gsub(paste0(readDelimiter, ".+$"), "", read)] # Convert long (melted) table into a sample-by-OTU OTU table, and return OTU <- as(table(x$sample, x$OTU), "matrix") - # system.time({setkey(x, OTU, sample) OTU2 <- dcast.data.table(x, sample ~ OTU, - # fun.aggregate=length, fill=0L) }) - return(otu_table(OTU, taxa_are_rows = FALSE)) + # system.time({setkey(x, OTU, sample) + # OTU2 <- dcast.data.table(x, sample ~ OTU, fun.aggregate=length, fill=0L) + # }) + return(otu_table(OTU, taxa_are_rows=FALSE)) } -################################################################################ +################################################################################ +################################################################################ +################################################################################ +################################################################################ \ No newline at end of file diff --git a/R/allClasses.R b/R/allClasses.R index c4d3f82d..5eae1974 100644 --- a/R/allClasses.R +++ b/R/allClasses.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' The S4 class for storing taxa-abundance information. #' #' Because orientation of these tables can vary by method, the orientation is @@ -10,7 +10,7 @@ #' #' \describe{ #' \item{taxa_are_rows}{ -#'\t\tA single logical specifying the orientation of the abundance table. +#' A single logical specifying the orientation of the abundance table. #' } #' #' \item{.Data}{This slot is inherited from the \code{\link{matrix}} class.} @@ -18,8 +18,8 @@ #' @name otu_table-class #' @rdname otu_table-class #' @exportClass otu_table -setClass("otu_table", representation(taxa_are_rows = "logical"), contains = "matrix") -################################################################################ +setClass("otu_table", representation(taxa_are_rows="logical"), contains = "matrix") +################################################################################ #' The S4 for storing sample variables. #' #' Row indices represent samples, while column indices represent experimental @@ -30,7 +30,7 @@ setClass("otu_table", representation(taxa_are_rows = "logical"), contains = "mat #' \item{.Data}{data-frame data, inherited from the data.frame class.} #' #' \item{row.names}{ -#'\t Also inherited from the data.frame class; +#' Also inherited from the data.frame class; #' it should contain the sample names. #' } #' @@ -41,8 +41,8 @@ setClass("otu_table", representation(taxa_are_rows = "logical"), contains = "mat #' @name sample_data-class #' @rdname sample_data-class #' @exportClass sample_data -setClass("sample_data", contains = "data.frame") -################################################################################ +setClass("sample_data", contains="data.frame") +################################################################################ #' An S4 class that holds taxonomic classification data as a character #' matrix. #' @@ -56,7 +56,8 @@ setClass("sample_data", contains = "data.frame") #' @rdname taxonomyTable-class #' @exportClass taxonomyTable setClass("taxonomyTable", contains = "matrix") -# metaMDS +#metaMDS +################################################################################ #' S3 class placeholder definition (list) for metaMDS #' #' The ape package does export a version of its \code{\link[vegan]{metaMDS}}-class, @@ -75,7 +76,10 @@ setClass("taxonomyTable", contains = "matrix") #' #' @keywords internal metaMDS <- structure(list(), class = "metaMDS") -### Remove if this ever works @importClassesFrom vegan metaMDS +### +# Remove if this ever works +# @importClassesFrom vegan metaMDS +################################################################################ #' S3 class placeholder definition (list) for decorana #' #' The ape package does export a version of its \code{\link[vegan]{decorana}}-class, @@ -94,7 +98,10 @@ metaMDS <- structure(list(), class = "metaMDS") #' #' @keywords internal decorana <- structure(list(), class = "decorana") -### Remove if this ever works @importClassesFrom vegan decorana +### +# Remove if this ever works +# @importClassesFrom vegan decorana +################################################################################ #' S3 class placeholder definition (list) for dpcoa #' #' The ade4 package does not export a version of its \code{\link[ade4]{dpcoa}}-class, @@ -121,8 +128,13 @@ decorana <- structure(list(), class = "decorana") #' #' @keywords internal dpcoa <- structure(list(), class = "dpcoa") -################################################################################ # @keywords internal print.dpcoa <- ade4:::print.dpcoa If this ever works -################################################################################ @importClassesFrom ade4 dpcoa +################################################################################ +## # @keywords internal +## print.dpcoa <- ade4:::print.dpcoa +################################################################################ +# If this ever works +# @importClassesFrom ade4 dpcoa +################################################################################ #' S3 class for ape-calculated MDS results #' #' Nothing to import, because ape doesn't (yet) export this S3 class. @@ -134,7 +146,9 @@ dpcoa <- structure(list(), class = "dpcoa") #' #' @keywords internal pcoa <- structure(list(), class = "pcoa") -# @importMethodsFrom ape print phyloseq-specific definition of 'phylo' class, +# @importMethodsFrom ape print +# phyloseq-specific definition of "phylo" class, +################################################################################ #' S3 class placeholder definition (list) for phylogenetic trees. #' #' The ape package does not export a version of its \code{\link[ape]{phylo}}-class, @@ -162,7 +176,10 @@ pcoa <- structure(list(), class = "pcoa") #' #' @keywords internal phylo <- structure(list(), class = "phylo") -################################################################################ If this ever works @importClassesFrom ape phylo +################################################################################ +# If this ever works +# @importClassesFrom ape phylo +################################################################################ #' An S4 placeholder of the main phylogenetic tree class from the ape package. #' #' See the \code{\link[ape]{ape}} package for details about this type of @@ -175,7 +192,7 @@ phylo <- structure(list(), class = "phylo") #' @rdname phylo-class #' @exportClass phylo setOldClass("phylo") -################################################################################ +################################################################################ #' An S4 placeholder for the \code{\link[stats]{dist}} class. #' #' See \code{\link[stats]{dist}} for details @@ -187,10 +204,12 @@ setOldClass("phylo") #' @rdname dist-class #' @exportClass dist setOldClass("dist") -################################################################################ Use setClassUnion to define the unholy NULL-data union as a virtual class. -################################################################################ This is a way of dealing with the expected scenarios in which one or more of -################################################################################ the component data classes is not available, in which case NULL will be used -################################################################################ instead. +################################################################################ +# Use setClassUnion to define the unholy NULL-data union as a virtual class. +# This is a way of dealing with the expected scenarios in which one or more of +# the component data classes is not available, in which case NULL will be used +# instead. +################################################################################ #' @keywords internal setClassUnion("otu_tableOrNULL", c("otu_table", "NULL")) #' @keywords internal @@ -215,15 +234,15 @@ setClassUnion("phyloOrNULL", c("phylo", "NULL")) #' @importClassesFrom Biostrings XStringSet #' @keywords internal setClassUnion("XStringSetOrNULL", c("XStringSet", "NULL")) -################################################################################ +################################################################################ #' The main experiment-level class for phyloseq data #' #' Contains all currently-supported component data classes: #' \code{\link{otu_table-class}}, #' \code{\link{sample_data-class}}, -#' \code{\link{taxonomyTable-class}} (\code{'tax_table'} slot), -#' \code{\link[ape]{phylo}}-class (\code{'phy_tree'} slot), -#' and the \code{\link[Biostrings]{XStringSet-class}} (\code{'refseq'} slot). +#' \code{\link{taxonomyTable-class}} (\code{"tax_table"} slot), +#' \code{\link[ape]{phylo}}-class (\code{"phy_tree"} slot), +#' and the \code{\link[Biostrings]{XStringSet-class}} (\code{"refseq"} slot). #' There are several advantages #' to storing your phylogenetic sequencing experiment as an instance of the #' phyloseq class, not the least of which is that it is easy to return to the @@ -264,8 +283,13 @@ setClassUnion("XStringSetOrNULL", c("XStringSet", "NULL")) #' @name phyloseq-class #' @rdname phyloseq-class #' @exportClass phyloseq -setClass(Class = "phyloseq", representation = representation(otu_table = "otu_tableOrNULL", - tax_table = "taxonomyTableOrNULL", sam_data = "sample_dataOrNULL", phy_tree = "phyloOrNULL", - refseq = "XStringSetOrNULL"), prototype = prototype(otu_table = NULL, tax_table = NULL, - sam_data = NULL, phy_tree = NULL, refseq = NULL)) -################################################################################ +setClass(Class="phyloseq", + representation=representation( + otu_table="otu_tableOrNULL", + tax_table="taxonomyTableOrNULL", + sam_data="sample_dataOrNULL", + phy_tree="phyloOrNULL", + refseq = "XStringSetOrNULL"), + prototype=prototype(otu_table=NULL, tax_table=NULL, sam_data=NULL, phy_tree=NULL, refseq=NULL) +) +################################################################################ diff --git a/R/allData.R b/R/allData.R index 317583ad..d8bd2d35 100644 --- a/R/allData.R +++ b/R/allData.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' (Data) Small example dataset from a human esophageal community (2004) #' #' Includes just 3 samples, 1 each from 3 subjects. Although the research article mentions 4 subjects, @@ -34,15 +34,15 @@ #' data(esophagus) #' UniFrac(esophagus, weighted=TRUE) #' # How to re-create the esophagus dataset using import_mothur function -#' mothlist <- system.file('extdata', 'esophagus.fn.list.gz', package='phyloseq') -#' mothgroup <- system.file('extdata', 'esophagus.good.groups.gz', package='phyloseq') -#' mothtree <- system.file('extdata', 'esophagus.tree.gz', package='phyloseq') +#' mothlist <- system.file("extdata", "esophagus.fn.list.gz", package="phyloseq") +#' mothgroup <- system.file("extdata", "esophagus.good.groups.gz", package="phyloseq") +#' mothtree <- system.file("extdata", "esophagus.tree.gz", package="phyloseq") #' show_mothur_cutoffs(mothlist) -#' cutoff <- '0.10' -#' esophman <- import_mothur(mothlist, mothgroup, mothtree, cutoff)\t -################################################################################ +#' cutoff <- "0.10" +#' esophman <- import_mothur(mothlist, mothgroup, mothtree, cutoff) +################################################################################ NA -################################################################################ +################################################################################ #' (Data) Enterotypes of the human gut microbiome (2011) #' #' Published in Nature in early 2011, this work compared (among other things), @@ -81,11 +81,11 @@ NA #' @keywords data #' @examples #' data(enterotype) -#' ig <- make_network(enterotype, 'samples', max.dist=0.3) -#' plot_network(ig, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) -################################################################################ +#' ig <- make_network(enterotype, "samples", max.dist=0.3) +#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) +################################################################################ NA -################################################################################ +################################################################################ #' (Data) Reproducibility of soil microbiome data (2011) #' #' Published in early 2011, @@ -159,11 +159,12 @@ NA #' # No convincing difference in species richness between warmed/unwarmed soils. #' ################################################################################ #' # Graphically compare richness between the different treatments. -#' man.col <- c(WC='red', WU='brown', UC='blue', UU='darkgreen') -#' plot_richness(soilrep, x='Treatment', color='Treatment', measures=c('Observed', 'Chao1', 'Shannon')) -################################################################################ +#' man.col <- c(WC="red", WU="brown", UC="blue", UU="darkgreen") +#' plot_richness(soilrep, x="Treatment", color="Treatment", measures=c("Observed", "Chao1", "Shannon")) +################################################################################ NA -################################################################################ +################################################################################ +################################################################################ #' (Data) Global patterns of 16S rRNA diversity at a depth of millions of sequences per sample (2011) #' #' Published in PNAS in early 2011. This work compared the microbial @@ -206,7 +207,7 @@ NA #' #' @examples #' data(GlobalPatterns) -#' plot_richness(GlobalPatterns, x='SampleType', measures=c('Observed', 'Chao1', 'Shannon')) -################################################################################ +#' plot_richness(GlobalPatterns, x="SampleType", measures=c("Observed", "Chao1", "Shannon")) +################################################################################ NA -################################################################################ +################################################################################ diff --git a/R/allPackage.R b/R/allPackage.R index 9de2e027..9945db72 100644 --- a/R/allPackage.R +++ b/R/allPackage.R @@ -1,4 +1,4 @@ -############################################### +############################################### #' Handling and analysis of high-throughput phylogenetic sequence data. #' #' There are already several ecology and phylogenetic packages available in R, @@ -21,4 +21,4 @@ #' @docType package #' @keywords package NA -############################################### +############################################### diff --git a/R/almostAllAccessors.R b/R/almostAllAccessors.R index 456adbdd..76c97090 100644 --- a/R/almostAllAccessors.R +++ b/R/almostAllAccessors.R @@ -1,4 +1,7 @@ -################################################################################ Accessor / subset methods. +################################################################################ +### Accessor / subset methods. +################################################################################ +################################################################################ #' Retrieve reference sequences (\code{\link[Biostrings]{XStringSet}}-class) from object. #' #' This is the suggested method @@ -35,20 +38,18 @@ #' @examples #' data(GlobalPatterns) #' refseq(GlobalPatterns, FALSE) -setGeneric("refseq", function(physeq, errorIfNULL = TRUE) standardGeneric("refseq")) +setGeneric("refseq", function(physeq, errorIfNULL=TRUE) standardGeneric("refseq")) #' @rdname refseq-methods #' @aliases refseq,ANY-method -setMethod("refseq", "ANY", function(physeq, errorIfNULL = TRUE) { - access(physeq, "refseq", errorIfNULL) +setMethod("refseq", "ANY", function(physeq, errorIfNULL=TRUE){ + access(physeq, "refseq", errorIfNULL) }) -# Return as-is if already a 'XStringSet' object +# Return as-is if already a "XStringSet" object #' @importClassesFrom Biostrings XStringSet #' @rdname refseq-methods #' @aliases refseq,XStringSet-method -setMethod("refseq", "XStringSet", function(physeq) { - return(physeq) -}) -################################################################################ +setMethod("refseq", "XStringSet", function(physeq){ return(physeq) }) +################################################################################ #' Retrieve phylogenetic tree (\code{\link[ape]{phylo}}-class) from object. #' #' This is the suggested method @@ -93,19 +94,17 @@ setMethod("refseq", "XStringSet", function(physeq) { #' @examples #' data(GlobalPatterns) #' phy_tree(GlobalPatterns) -setGeneric("phy_tree", function(physeq, errorIfNULL = TRUE) standardGeneric("phy_tree")) +setGeneric("phy_tree", function(physeq, errorIfNULL=TRUE) standardGeneric("phy_tree")) #' @rdname phy_tree-methods #' @aliases phy_tree,ANY-method -setMethod("phy_tree", "ANY", function(physeq, errorIfNULL = TRUE) { - access(physeq, "phy_tree", errorIfNULL) +setMethod("phy_tree", "ANY", function(physeq, errorIfNULL=TRUE){ + access(physeq, "phy_tree", errorIfNULL) }) -# Return as-is if already a 'phylo' object +# Return as-is if already a "phylo" object #' @rdname phy_tree-methods #' @aliases phy_tree,phylo-method -setMethod("phy_tree", "phylo", function(physeq) { - return(physeq) -}) -################################################################################ +setMethod("phy_tree", "phylo", function(physeq){ return(physeq) }) +################################################################################ #' Access taxa_are_rows slot from otu_table objects. #' #' @usage taxa_are_rows(physeq) @@ -122,20 +121,16 @@ setMethod("phy_tree", "phylo", function(physeq) { setGeneric("taxa_are_rows", function(physeq) standardGeneric("taxa_are_rows")) #' @rdname taxa_are_rows-methods #' @aliases taxa_are_rows,ANY-method -setMethod("taxa_are_rows", "ANY", function(physeq) { - NULL -}) +setMethod("taxa_are_rows", "ANY", function(physeq){NULL}) #' @rdname taxa_are_rows-methods #' @aliases taxa_are_rows,otu_table-method -setMethod("taxa_are_rows", "otu_table", function(physeq) { - physeq@taxa_are_rows -}) +setMethod("taxa_are_rows", "otu_table", function(physeq){physeq@taxa_are_rows}) #' @rdname taxa_are_rows-methods #' @aliases taxa_are_rows,phyloseq-method -setMethod("taxa_are_rows", "phyloseq", function(physeq) { - taxa_are_rows(otu_table(physeq)) +setMethod("taxa_are_rows", "phyloseq", function(physeq){ + taxa_are_rows(otu_table(physeq)) }) -################################################################################ +################################################################################ #' Get the number of taxa/species. #' #' @usage ntaxa(physeq) @@ -153,46 +148,44 @@ setMethod("taxa_are_rows", "phyloseq", function(physeq) { #' @export #' #' @examples -#' data('esophagus') +#' data("esophagus") #' ntaxa(esophagus) #' phy_tree(esophagus) #' ntaxa(phy_tree(esophagus)) setGeneric("ntaxa", function(physeq) standardGeneric("ntaxa")) #' @rdname ntaxa-methods #' @aliases ntaxa,ANY-method -setMethod("ntaxa", "ANY", function(physeq) { - return(NULL) -}) +setMethod("ntaxa", "ANY", function(physeq){ return(NULL) }) #' @rdname ntaxa-methods #' @aliases ntaxa,phyloseq-method -setMethod("ntaxa", "phyloseq", function(physeq) { - ntaxa(otu_table(physeq)) +setMethod("ntaxa", "phyloseq", function(physeq){ + ntaxa(otu_table(physeq)) }) #' @rdname ntaxa-methods #' @aliases ntaxa,otu_table-method -setMethod("ntaxa", "otu_table", function(physeq) { - if (taxa_are_rows(physeq)) { - return(nrow(physeq)) - } else { - return(ncol(physeq)) - } +setMethod("ntaxa", "otu_table", function(physeq){ + if( taxa_are_rows(physeq) ){ + return( nrow(physeq) ) + } else { + return( ncol(physeq) ) + } }) #' @rdname ntaxa-methods #' @aliases ntaxa,taxonomyTable-method -setMethod("ntaxa", "taxonomyTable", function(physeq) { - nrow(physeq) +setMethod("ntaxa", "taxonomyTable", function(physeq){ + nrow(physeq) }) #' @rdname ntaxa-methods #' @aliases ntaxa,phylo-method -setMethod("ntaxa", "phylo", function(physeq) { - length(physeq$tip.label) +setMethod("ntaxa", "phylo", function(physeq){ + length(physeq$tip.label) }) #' @rdname ntaxa-methods #' @aliases ntaxa,XStringSet-method -setMethod("ntaxa", "XStringSet", function(physeq) { - length(physeq) +setMethod("ntaxa", "XStringSet", function(physeq){ + length(physeq) }) -################################################################################ +################################################################################ #' Get species / taxa names. #' #' @usage taxa_names(physeq) @@ -210,46 +203,44 @@ setMethod("ntaxa", "XStringSet", function(physeq) { #' @export #' #' @examples # -#' data('esophagus') +#' data("esophagus") #' tree <- phy_tree(esophagus) #' OTU1 <- otu_table(esophagus) #' taxa_names(tree) #' taxa_names(OTU1) #' physeq1 <- phyloseq(OTU1, tree) #' taxa_names(physeq1) -setGeneric("taxa_names", function(physeq) standardGeneric("taxa_names")) +setGeneric("taxa_names", function(physeq) standardGeneric("taxa_names")) #' @rdname taxa_names-methods #' @aliases taxa_names,ANY-method -setMethod("taxa_names", "ANY", function(physeq) { - return(NULL) -}) +setMethod("taxa_names", "ANY", function(physeq){ return(NULL) }) #' @rdname taxa_names-methods #' @aliases taxa_names,phyloseq-method -setMethod("taxa_names", "phyloseq", function(physeq) { - taxa_names(otu_table(physeq)) +setMethod("taxa_names", "phyloseq", function(physeq){ + taxa_names(otu_table(physeq)) }) #' @rdname taxa_names-methods #' @aliases taxa_names,otu_table-method -setMethod("taxa_names", "otu_table", function(physeq) { - if (taxa_are_rows(physeq)) { - return(rownames(physeq)) - } else { - return(colnames(physeq)) - } +setMethod("taxa_names", "otu_table", function(physeq){ + if( taxa_are_rows(physeq) ){ + return( rownames(physeq) ) + } else { + return( colnames(physeq) ) + } }) #' @rdname taxa_names-methods #' @aliases taxa_names,taxonomyTable-method -setMethod("taxa_names", "taxonomyTable", function(physeq) rownames(physeq)) +setMethod("taxa_names", "taxonomyTable", function(physeq) rownames(physeq) ) #' @rdname taxa_names-methods #' @aliases taxa_names,sample_data-method -setMethod("taxa_names", "sample_data", function(physeq) NULL) +setMethod("taxa_names", "sample_data", function(physeq) NULL ) #' @rdname taxa_names-methods #' @aliases taxa_names,phylo-method -setMethod("taxa_names", "phylo", function(physeq) physeq$tip.label) +setMethod("taxa_names", "phylo", function(physeq) physeq$tip.label ) #' @rdname taxa_names-methods #' @aliases taxa_names,XStringSet-method -setMethod("taxa_names", "XStringSet", function(physeq) names(physeq)) -################################################################################ +setMethod("taxa_names", "XStringSet", function(physeq) names(physeq) ) +################################################################################ #' Get the number of samples. #' #' @usage nsamples(physeq) @@ -267,7 +258,7 @@ setMethod("taxa_names", "XStringSet", function(physeq) names(physeq)) #' @export #' #' @examples # -#' data('esophagus') +#' data("esophagus") #' tree <- phy_tree(esophagus) #' OTU1 <- otu_table(esophagus) #' nsamples(OTU1) @@ -276,28 +267,26 @@ setMethod("taxa_names", "XStringSet", function(physeq) names(physeq)) setGeneric("nsamples", function(physeq) standardGeneric("nsamples")) #' @rdname nsamples-methods #' @aliases nsamples,ANY-method -setMethod("nsamples", "ANY", function(physeq) { - return(NULL) -}) +setMethod("nsamples", "ANY", function(physeq){ return(NULL) }) #' @rdname nsamples-methods #' @aliases nsamples,phyloseq-method -setMethod("nsamples", "phyloseq", function(physeq) { - # dispatch to core, required component, otu_table - nsamples(otu_table(physeq)) +setMethod("nsamples", "phyloseq", function(physeq){ + # dispatch to core, required component, otu_table + nsamples(otu_table(physeq)) }) #' @rdname nsamples-methods #' @aliases nsamples,otu_table-method -setMethod("nsamples", "otu_table", function(physeq) { - if (taxa_are_rows(physeq)) { - return(ncol(physeq)) - } else { - return(nrow(physeq)) - } +setMethod("nsamples", "otu_table", function(physeq){ + if( taxa_are_rows(physeq) ){ + return( ncol(physeq) ) + } else { + return( nrow(physeq) ) + } }) #' @rdname nsamples-methods #' @aliases nsamples,sample_data-method -setMethod("nsamples", "sample_data", function(physeq) nrow(physeq)) -################################################################################ +setMethod("nsamples", "sample_data", function(physeq) nrow(physeq) ) +################################################################################ #' Get sample names. #' #' @usage sample_names(physeq) @@ -319,33 +308,31 @@ setMethod("nsamples", "sample_data", function(physeq) nrow(physeq)) #' data(esophagus) #' sample_names(esophagus) setGeneric("sample_names", function(physeq) standardGeneric("sample_names")) -# Unless otherwise specified, this should return a value of NULL That way, -# objects that do not explicitly describe samples all behave in the same -# (returning NULL) way. +# Unless otherwise specified, this should return a value of NULL +# That way, objects that do not explicitly describe samples all +# behave in the same (returning NULL) way. #' @rdname sample_names-methods #' @aliases sample_names,ANY-method -setMethod("sample_names", "ANY", function(physeq) { - return(NULL) -}) +setMethod("sample_names", "ANY", function(physeq){ return(NULL) }) #' @rdname sample_names-methods #' @aliases sample_names,phyloseq-method -setMethod("sample_names", "phyloseq", function(physeq) { - # dispatch to core, required component, otu_table - sample_names(otu_table(physeq)) +setMethod("sample_names", "phyloseq", function(physeq){ + # dispatch to core, required component, otu_table + sample_names(otu_table(physeq)) }) #' @rdname sample_names-methods #' @aliases sample_names,sample_data-method -setMethod("sample_names", "sample_data", function(physeq) rownames(physeq)) +setMethod("sample_names", "sample_data", function(physeq) rownames(physeq) ) #' @rdname sample_names-methods #' @aliases sample_names,otu_table-method -setMethod("sample_names", "otu_table", function(physeq) { - if (taxa_are_rows(physeq)) { - return(colnames(physeq)) - } else { - return(rownames(physeq)) - } +setMethod("sample_names", "otu_table", function(physeq){ + if( taxa_are_rows(physeq) ){ + return( colnames(physeq) ) + } else { + return( rownames(physeq) ) + } }) -################################################################################ +################################################################################ #' Returns all abundance values for species \code{i}. #' #' This is a simple accessor function for investigating @@ -371,25 +358,25 @@ setMethod("sample_names", "otu_table", function(physeq) { #' @examples #' data(esophagus) #' taxa_names(esophagus) -#' get_sample(esophagus, '59_5_19') +#' get_sample(esophagus, "59_5_19") setGeneric("get_sample", function(physeq, i) standardGeneric("get_sample")) -################################################################################ +################################################################################ #' @aliases get_sample,otu_table-method #' @rdname get_sample-methods -setMethod("get_sample", "otu_table", function(physeq, i) { - if (taxa_are_rows(physeq)) { - as(physeq, "matrix")[i, ] - } else { - as(physeq, "matrix")[, i] - } +setMethod("get_sample", "otu_table", function(physeq, i){ + if( taxa_are_rows(physeq) ){ + as(physeq, "matrix")[i, ] + } else { + as(physeq, "matrix")[, i] + } }) -################################################################################ +################################################################################ #' @aliases get_sample,phyloseq-method #' @rdname get_sample-methods -setMethod("get_sample", "phyloseq", function(physeq, i) { - get_sample(otu_table(physeq), i) +setMethod("get_sample", "phyloseq", function(physeq, i){ + get_sample(otu_table(physeq), i) }) -################################################################################ +################################################################################ #' Returns all abundance values of sample \code{i}. #' #' This is a simple accessor function for investigating @@ -416,23 +403,23 @@ setMethod("get_sample", "phyloseq", function(physeq, i) { #' @examples #' data(esophagus) #' sample_names(esophagus) -#' get_taxa(esophagus, 'B') +#' get_taxa(esophagus, "B") setGeneric("get_taxa", function(physeq, i) standardGeneric("get_taxa")) #' @aliases get_taxa,otu_table-method #' @rdname get_taxa-methods -setMethod("get_taxa", "otu_table", function(physeq, i) { - if (taxa_are_rows(physeq)) { - as(physeq, "matrix")[, i] - } else { - as(physeq, "matrix")[i, ] - } +setMethod("get_taxa", "otu_table", function(physeq, i){ + if( taxa_are_rows(physeq) ){ + as(physeq, "matrix")[, i] + } else { + as(physeq, "matrix")[i, ] + } }) #' @aliases get_taxa,phyloseq-method #' @rdname get_taxa-methods -setMethod("get_taxa", "phyloseq", function(physeq, i) { - get_taxa(otu_table(physeq), i) +setMethod("get_taxa", "phyloseq", function(physeq, i){ + get_taxa(otu_table(physeq), i) }) -################################################################################ +################################################################################ #' Retrieve the names of the taxonomic ranks #' #' This is a simple accessor function to make it more convenient to determine @@ -459,10 +446,10 @@ setMethod("get_taxa", "phyloseq", function(physeq, i) { #' @examples #' data(enterotype) #' rank_names(enterotype) -rank_names <- function(physeq, errorIfNULL = TRUE) { - colnames(tax_table(physeq, errorIfNULL)) +rank_names <- function(physeq, errorIfNULL=TRUE){ + colnames(tax_table(physeq, errorIfNULL)) } -################################################################################ +################################################################################ #' Get a unique vector of the observed taxa at a particular taxonomic rank #' #' This is a simple accessor function to make it more convenient to determine @@ -494,11 +481,11 @@ rank_names <- function(physeq, errorIfNULL = TRUE) { #' data(enterotype) #' get_taxa_unique(enterotype) #' data(GlobalPatterns) -#' get_taxa_unique(GlobalPatterns, 'Family') -get_taxa_unique <- function(physeq, taxonomic.rank = rank_names(physeq)[1], errorIfNULL = TRUE) { - unique(as(tax_table(physeq, errorIfNULL)[, taxonomic.rank], "character")) +#' get_taxa_unique(GlobalPatterns, "Family") +get_taxa_unique <- function(physeq, taxonomic.rank=rank_names(physeq)[1], errorIfNULL=TRUE){ + unique(as(tax_table(physeq, errorIfNULL)[, taxonomic.rank], "character")) } -################################################################################ +################################################################################ #' Get the sample variables present in sample_data #' #' This is a simple accessor function to make it more convenient to determine @@ -525,10 +512,10 @@ get_taxa_unique <- function(physeq, taxonomic.rank = rank_names(physeq)[1], erro #' @examples #' data(enterotype) #' sample_variables(enterotype) -sample_variables <- function(physeq, errorIfNULL = TRUE) { - colnames(sample_data(physeq, errorIfNULL)) +sample_variables <- function(physeq, errorIfNULL=TRUE){ + colnames(sample_data(physeq, errorIfNULL)) } -################################################################################ +################################################################################ #' Get the values for a particular variable in sample_data #' #' This is a simple accessor function for streamlining access @@ -556,12 +543,12 @@ sample_variables <- function(physeq, errorIfNULL = TRUE) { #' # Load the GlobalPatterns dataset into the workspace environment #' data(GlobalPatterns) #' # Look at the different values for SampleType -#' get_variable(GlobalPatterns, 'SampleType') -get_variable <- function(physeq, varName) { - if (is.null(sample_data(physeq, FALSE))) { - stop("Your phyloseq data object does not have a sample-data component\n", - "Try ?sample_data for more details.") - } - return(as(sample_data(physeq), "data.frame")[, varName]) +#' get_variable(GlobalPatterns, "SampleType") +get_variable <- function(physeq, varName){ + if( is.null(sample_data(physeq, FALSE)) ){ + stop("Your phyloseq data object does not have a sample-data component\n", + "Try ?sample_data for more details.") + } + return( as(sample_data(physeq), "data.frame")[, varName] ) } -################################################################################ +################################################################################ diff --git a/R/as-methods.R b/R/as-methods.R index 9d490553..cb89511a 100644 --- a/R/as-methods.R +++ b/R/as-methods.R @@ -1,32 +1,34 @@ -################################################################################ coercion methods -setAs("phyloseq", "matrix", function(from) { - from@.Data +################################################################################ +# coercion methods +################################################################################ +setAs("phyloseq", "matrix", function(from){ + from@.Data }) -setAs("phyloseq", "otu_table", function(from) { - otu_table(from) +setAs("phyloseq", "otu_table", function(from){ + otu_table(from) }) -setAs("phyloseq", "otu_table", function(from) { - otu_table(from) +setAs("phyloseq", "otu_table", function(from){ + otu_table(from) }) -################################################################################ -setAs("data.frame", "sample_data", function(from) { - new("sample_data", from) +################################################################################ +setAs("data.frame", "sample_data", function(from){ + new("sample_data", from) }) -setAs("sample_data", "data.frame", function(from) { - data.frame(from) +setAs("sample_data", "data.frame", function(from){ + data.frame(from) }) -setAs("phyloseq", "sample_data", function(from) { - sample_data(from) +setAs("phyloseq", "sample_data", function(from){ + sample_data(from) }) -################################################################################ -setAs("taxonomyTable", "matrix", function(from) { - from@.Data +################################################################################ +setAs("taxonomyTable", "matrix", function(from){ + from@.Data }) -setAs("phyloseq", "taxonomyTable", function(from) { - tax_table(from) +setAs("phyloseq", "taxonomyTable", function(from){ + tax_table(from) }) -################################################################################ -setAs("phyloseq", "phylo", function(from) { - phy_tree(from) +################################################################################ +setAs("phyloseq", "phylo", function(from){ + phy_tree(from) }) -################################################################################ +################################################################################ diff --git a/R/assignment-methods.R b/R/assignment-methods.R index d435935c..aa82c99e 100644 --- a/R/assignment-methods.R +++ b/R/assignment-methods.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Assign a new OTU Table to \code{x} #' #' @usage otu_table(x) <- value @@ -31,20 +31,18 @@ setGeneric("otu_table<-", function(x, value) standardGeneric("otu_table<-")) #' @rdname assign-otu_table #' @aliases otu_table<-,phyloseq,otu_table-method -setMethod("otu_table<-", c("phyloseq", "otu_table"), function(x, value) { - phyloseq(value, x@sam_data, x@tax_table, x@phy_tree, x@refseq) +setMethod("otu_table<-", c("phyloseq", "otu_table"), function(x, value){ + phyloseq(value, x@sam_data, x@tax_table, x@phy_tree, x@refseq) }) #' @rdname assign-otu_table #' @aliases otu_table<-,otu_table,otu_table-method -setMethod("otu_table<-", c("otu_table", "otu_table"), function(x, value) { - value -}) +setMethod("otu_table<-", c("otu_table", "otu_table"), function(x, value){ value }) #' @rdname assign-otu_table #' @aliases otu_table<-,phyloseq,phyloseq-method -setMethod("otu_table<-", c("phyloseq", "phyloseq"), function(x, value) { - phyloseq(otu_table(value), x@sam_data, x@tax_table, x@phy_tree, x@refseq) +setMethod("otu_table<-", c("phyloseq", "phyloseq"), function(x, value){ + phyloseq(otu_table(value), x@sam_data, x@tax_table, x@phy_tree, x@refseq) }) -################################################################################ +################################################################################ #' Manually change taxa_are_rows through assignment. #' #' The taxa_are_rows slot is a logical indicating the orientation of the @@ -67,22 +65,22 @@ setMethod("otu_table<-", c("phyloseq", "phyloseq"), function(x, value) { #' data(esophagus) #' taxa_are_rows(esophagus) #' taxa_are_rows(otu_table(esophagus)) -setGeneric("taxa_are_rows<-", function(x, value) { - standardGeneric("taxa_are_rows<-") +setGeneric("taxa_are_rows<-", function(x, value){ + standardGeneric("taxa_are_rows<-") }) #' @rdname assign-taxa_are_rows #' @aliases taxa_are_rows<-,otu_table,logical-method -setMethod("taxa_are_rows<-", c("otu_table", "logical"), function(x, value) { - x@taxa_are_rows <- value[1] - return(x) +setMethod("taxa_are_rows<-", c("otu_table", "logical"), function(x, value){ + x@taxa_are_rows <- value[1] + return(x) }) #' @rdname assign-taxa_are_rows #' @aliases taxa_are_rows<-,phyloseq,logical-method -setMethod("taxa_are_rows<-", c("phyloseq", "logical"), function(x, value) { - taxa_are_rows(otu_table(x)) <- value - return(x) +setMethod("taxa_are_rows<-", c("phyloseq", "logical"), function(x, value){ + taxa_are_rows(otu_table(x)) <- value + return(x) }) -################################################################################ +################################################################################ #' Assign (new) sample_data to \code{x} #' #' This replaces the current \code{sample_data} component of \code{x} with @@ -123,13 +121,13 @@ setMethod("taxa_are_rows<-", c("phyloseq", "logical"), function(x, value) { #' head(sample_data(soilrep)) #' sample_data(soilrep)$Time <- as.integer(substr(sample_data(soilrep)$Sample, 1, 1)) #' head(sample_data(soilrep)) -"sample_data<-" <- function(x, value) { - if (!inherits(value, "sample_data")) { - value <- sample_data(value) - } - phyloseq(x@otu_table, value, x@tax_table, x@phy_tree, x@refseq) +"sample_data<-" <- function(x, value){ + if( !inherits(value, "sample_data") ){ + value <- sample_data(value) + } + phyloseq(x@otu_table, value, x@tax_table, x@phy_tree, x@refseq) } -################################################################################ +################################################################################ #' Assign a (new) Taxonomy Table to \code{x} #' #' @usage tax_table(x) <- value @@ -159,31 +157,31 @@ setMethod("taxa_are_rows<-", c("phyloseq", "logical"), function(x, value) { #' # tax_table(ex2c) <- ex2b #' # identical(ex2a, ex2c) #' # ex2c <- phyloseq(otu_table(ex2b), sample_data(ex2b), phy_tree(ex2b)) -#' # tax_table(ex2c) <- as(tax_table(ex2b), 'matrix') +#' # tax_table(ex2c) <- as(tax_table(ex2b), "matrix") #' # identical(ex2a, ex2c) setGeneric("tax_table<-", function(x, value) standardGeneric("tax_table<-")) #' @rdname assign-tax_table #' @aliases tax_table<-,phyloseq,taxonomyTable-method -setMethod("tax_table<-", c("phyloseq", "taxonomyTable"), function(x, value) { - phyloseq(x@otu_table, x@sam_data, value, x@phy_tree, x@refseq) +setMethod("tax_table<-", c("phyloseq", "taxonomyTable"), function(x, value){ + phyloseq(x@otu_table, x@sam_data, value, x@phy_tree, x@refseq) }) #' @rdname assign-tax_table #' @aliases tax_table<-,phyloseq,ANY-method -setMethod("tax_table<-", c("phyloseq", "ANY"), function(x, value) { - phyloseq(x@otu_table, x@sam_data, tax_table(value, FALSE), x@phy_tree, x@refseq) +setMethod("tax_table<-", c("phyloseq", "ANY"), function(x, value){ + phyloseq(x@otu_table, x@sam_data, tax_table(value, FALSE), x@phy_tree, x@refseq) }) #' @rdname assign-tax_table #' @aliases tax_table<-,taxonomyTable,taxonomyTable-method -setMethod("tax_table<-", c("taxonomyTable", "taxonomyTable"), function(x, value) { - # Asign as-is. - value +setMethod("tax_table<-", c("taxonomyTable", "taxonomyTable"), function(x, value){ + # Asign as-is. + value }) #' @rdname assign-tax_table #' @aliases tax_table<-,taxonomyTable,ANY-method -setMethod("tax_table<-", c("taxonomyTable", "ANY"), function(x, value) { - tax_table(value, FALSE) +setMethod("tax_table<-", c("taxonomyTable", "ANY"), function(x, value){ + tax_table(value, FALSE) }) -################################################################################ +################################################################################ #' Assign a (new) phylogenetic tree to \code{x} #' #' @usage phy_tree(x) <- value @@ -195,7 +193,7 @@ setMethod("tax_table<-", c("taxonomyTable", "ANY"), function(x, value) { #' @rdname assign-phy_tree #' @aliases assign-phy_tree phy_tree<- #' @examples # -#' data('esophagus') +#' data("esophagus") #' # An example of pruning to just the first 20 taxa in esophagus #' ex2a <- prune_taxa(taxa_names(esophagus)[1:20], esophagus) #' # The following 3 lines produces an ex2b that is equal to ex2a @@ -205,21 +203,21 @@ setMethod("tax_table<-", c("taxonomyTable", "ANY"), function(x, value) { setGeneric("phy_tree<-", function(x, value) standardGeneric("phy_tree<-")) #' @rdname assign-phy_tree #' @aliases phy_tree<-,phyloseq,phylo-method -setMethod("phy_tree<-", c("phyloseq", "phylo"), function(x, value) { - phyloseq(x@otu_table, x@sam_data, x@tax_table, value, x@refseq) +setMethod("phy_tree<-", c("phyloseq", "phylo"), function(x, value){ + phyloseq(x@otu_table, x@sam_data, x@tax_table, value, x@refseq) }) #' @rdname assign-phy_tree #' @aliases phy_tree<-,phyloseq,phyloseq-method -setMethod("phy_tree<-", c("phyloseq", "phyloseq"), function(x, value) { - phyloseq(x@otu_table, x@sam_data, x@tax_table, phy_tree(value), x@refseq) +setMethod("phy_tree<-", c("phyloseq", "phyloseq"), function(x, value){ + phyloseq(x@otu_table, x@sam_data, x@tax_table, phy_tree(value), x@refseq) }) -################################################################################ +################################################################################ #' Replace OTU identifier names #' #' @usage taxa_names(x) <- value #' #' @param x (Required). An object defined by the \code{\link{phyloseq-package}} -#' \tthat describes OTUs in some way. +#' that describes OTUs in some way. #' @param value (Required). A character vector #' to replace the current \code{\link{taxa_names}}. #' @@ -229,44 +227,43 @@ setMethod("phy_tree<-", c("phyloseq", "phyloseq"), function(x, value) { #' @aliases assign-taxa_names taxa_names<- #' #' @examples -#' data('esophagus') +#' data("esophagus") #' taxa_names(esophagus) -#' # plot_tree(esophagus, label.tips='taxa_names', ladderize='left') -#' taxa_names(esophagus) <- paste('OTU-', taxa_names(esophagus), sep='') +#' # plot_tree(esophagus, label.tips="taxa_names", ladderize="left") +#' taxa_names(esophagus) <- paste("OTU-", taxa_names(esophagus), sep="") #' taxa_names(esophagus) -#' # plot_tree(esophagus, label.tips='taxa_names', ladderize='left') +#' # plot_tree(esophagus, label.tips="taxa_names", ladderize="left") #' ## non-characters are first coerced to characters. #' taxa_names(esophagus) <- 1:ntaxa(esophagus) #' taxa_names(esophagus) -#' # plot_tree(esophagus, label.tips='taxa_names', ladderize='left') +#' # plot_tree(esophagus, label.tips="taxa_names", ladderize="left") #' ## Cannot assign non-unique or differently-lengthed name vectors. Error. #' # taxa_names(esophagus) <- sample(c(TRUE, FALSE), ntaxa(esophagus), TRUE) #' # taxa_names(esophagus) <- sample(taxa_names(esophagus), ntaxa(esophagus)-5, FALSE) -setGeneric("taxa_names<-", function(x, value) { - if (anyDuplicated(value)) { - stop("taxa_names<-: You are attempting to assign duplicated taxa_names") - } - standardGeneric("taxa_names<-") +setGeneric("taxa_names<-", function(x, value){ + if( anyDuplicated(value) ){ + stop("taxa_names<-: You are attempting to assign duplicated taxa_names") + } + standardGeneric("taxa_names<-") }) -# Attempt to coerce value to a character vector. Remaining methods will require -# it. +# Attempt to coerce value to a character vector. Remaining methods will require it. #' @rdname assign-taxa_names #' @aliases taxa_names<-,ANY,ANY-method -setMethod("taxa_names<-", c("ANY", "ANY"), function(x, value) { +setMethod("taxa_names<-", c("ANY", "ANY"), function(x, value){ taxa_names(x) <- as(value, "character") return(x) }) -# value is now character, but no specific method for first argumet return x -# unchanged. +# value is now character, but no specific method for first argumet +# return x unchanged. #' @rdname assign-taxa_names #' @aliases taxa_names<-,ANY,character-method -setMethod("taxa_names<-", c("ANY", "character"), function(x, value) { +setMethod("taxa_names<-", c("ANY", "character"), function(x, value){ return(x) }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,otu_table,character-method -setMethod("taxa_names<-", c("otu_table", "character"), function(x, value) { - if (taxa_are_rows(x)) { +setMethod("taxa_names<-", c("otu_table", "character"), function(x, value){ + if( taxa_are_rows(x) ){ rownames(x) <- value } else { colnames(x) <- value @@ -275,39 +272,40 @@ setMethod("taxa_names<-", c("otu_table", "character"), function(x, value) { }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,taxonomyTable,character-method -setMethod("taxa_names<-", c("taxonomyTable", "character"), function(x, value) { +setMethod("taxa_names<-", c("taxonomyTable", "character"), function(x, value){ rownames(x) <- value return(x) }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,phylo,character-method -setMethod("taxa_names<-", c("phylo", "character"), function(x, value) { +setMethod("taxa_names<-", c("phylo", "character"), function(x, value){ x$tip.label <- value return(x) }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,XStringSet,character-method -setMethod("taxa_names<-", c("XStringSet", "character"), function(x, value) { +setMethod("taxa_names<-", c("XStringSet", "character"), function(x, value){ names(x) <- value return(x) }) #' @rdname assign-taxa_names #' @aliases taxa_names<-,phyloseq,character-method -setMethod("taxa_names<-", c("phyloseq", "character"), function(x, value) { +setMethod("taxa_names<-", c("phyloseq", "character"), function(x, value){ # dispatch on components taxa_names(x@otu_table) <- value - taxa_names(x@phy_tree) <- value + taxa_names(x@phy_tree) <- value taxa_names(x@tax_table) <- value - taxa_names(x@refseq) <- value + taxa_names(x@refseq) <- value return(x) }) -################################################################################ +################################################################################ +################################################################################ #' Replace OTU identifier names #' #' @usage sample_names(x) <- value #' #' @param x (Required). An object defined by the \code{\link{phyloseq-package}} -#' \tthat describes OTUs in some way. +#' that describes OTUs in some way. #' @param value (Required). A character vector #' to replace the current \code{\link{sample_names}}. #' @@ -317,62 +315,61 @@ setMethod("taxa_names<-", c("phyloseq", "character"), function(x, value) { #' @aliases assign-sample_names sample_names<- #' #' @examples -#' data('esophagus') +#' data("esophagus") #' sample_names(esophagus) -#' # plot_tree(esophagus, color='sample_names', ladderize='left') -#' sample_names(esophagus) <- paste('Sa-', sample_names(esophagus), sep='') +#' # plot_tree(esophagus, color="sample_names", ladderize="left") +#' sample_names(esophagus) <- paste("Sa-", sample_names(esophagus), sep="") #' sample_names(esophagus) -#' # plot_tree(esophagus, color='sample_names', ladderize='left') +#' # plot_tree(esophagus, color="sample_names", ladderize="left") #' ## non-characters are first coerced to characters. #' sample_names(esophagus) <- 1:nsamples(esophagus) #' sample_names(esophagus) -#' # plot_tree(esophagus, color='sample_names', ladderize='left') +#' # plot_tree(esophagus, color="sample_names", ladderize="left") #' ## Cannot assign non-unique or differently-lengthed name vectors. Error. #' # sample_names(esophagus) <- sample(c(TRUE, FALSE), nsamples(esophagus), TRUE) #' # sample_names(esophagus) <- sample(sample_names(esophagus), nsamples(esophagus)-1, FALSE) -setGeneric("sample_names<-", function(x, value) { - if (anyDuplicated(value)) { - stop("sample_names<-: You are attempting to assign duplicated sample_names") - } - standardGeneric("sample_names<-") +setGeneric("sample_names<-", function(x, value){ + if( anyDuplicated(value) ){ + stop("sample_names<-: You are attempting to assign duplicated sample_names") + } + standardGeneric("sample_names<-") }) -# Attempt to coerce value to a character vector. Remaining methods will require -# it. +# Attempt to coerce value to a character vector. Remaining methods will require it. #' @rdname assign-sample_names #' @aliases sample_names<-,ANY,ANY-method -setMethod("sample_names<-", c("ANY", "ANY"), function(x, value) { - sample_names(x) <- as(value, "character") - return(x) +setMethod("sample_names<-", c("ANY", "ANY"), function(x, value){ + sample_names(x) <- as(value, "character") + return(x) }) -# value is now character, but no specific method for first argumet return x -# unchanged. +# value is now character, but no specific method for first argumet +# return x unchanged. #' @rdname assign-sample_names #' @aliases sample_names<-,ANY,character-method -setMethod("sample_names<-", c("ANY", "character"), function(x, value) { - return(x) +setMethod("sample_names<-", c("ANY", "character"), function(x, value){ + return(x) }) #' @rdname assign-sample_names #' @aliases sample_names<-,otu_table,character-method -setMethod("sample_names<-", c("otu_table", "character"), function(x, value) { - if (taxa_are_rows(x)) { - colnames(x) <- value - } else { - rownames(x) <- value - } - return(x) +setMethod("sample_names<-", c("otu_table", "character"), function(x, value){ + if( taxa_are_rows(x) ){ + colnames(x) <- value + } else { + rownames(x) <- value + } + return(x) }) #' @rdname assign-sample_names #' @aliases sample_names<-,sample_data,character-method -setMethod("sample_names<-", c("sample_data", "character"), function(x, value) { - rownames(x) <- value - return(x) +setMethod("sample_names<-", c("sample_data", "character"), function(x, value){ + rownames(x) <- value + return(x) }) #' @rdname assign-sample_names #' @aliases sample_names<-,phyloseq,character-method -setMethod("sample_names<-", c("phyloseq", "character"), function(x, value) { - # dispatch on components - sample_names(x@otu_table) <- value - sample_names(x@sam_data) <- value - return(x) +setMethod("sample_names<-", c("phyloseq", "character"), function(x, value){ + # dispatch on components + sample_names(x@otu_table) <- value + sample_names(x@sam_data) <- value + return(x) }) -################################################################################ +################################################################################ \ No newline at end of file diff --git a/R/deprecated_functions.R b/R/deprecated_functions.R index 94f81132..e4c62eb7 100644 --- a/R/deprecated_functions.R +++ b/R/deprecated_functions.R @@ -1,7 +1,7 @@ -################################################################################ +################################################################################ #' Depcrecated functions in the phyloseq package. #' -#' These will be migrated to \code{'defunct'} status in the next release, +#' These will be migrated to \code{"defunct"} status in the next release, #' and removed completely in the release after that. #' These functions are provided for compatibility with older version of #' the phyloseq package. They may eventually be completely @@ -62,173 +62,69 @@ #' \code{taxTab<-} \tab now a synonym for \code{\link{tax_table<-}}\cr #' } #' -deprecated_phyloseq_function <- function(x, value, ...) { - return(NULL) -} -plot_taxa_bar <- function(...) { - .Deprecated("plot_bar", package = "phyloseq") - return(plot_bar(...)) -} -taxaplot <- function(...) { - .Deprecated("plot_bar", package = "phyloseq") - return(plot_bar(...)) -} -taxtab <- function(...) { - .Deprecated("tax_table", package = "phyloseq") - return(tax_table(...)) -} -taxTab <- function(...) { - .Deprecated("tax_table", package = "phyloseq") - return(tax_table(...)) -} -sampleData <- function(...) { - .Deprecated("sample_data", package = "phyloseq") - return(sample_data(...)) -} -samData <- function(...) { - .Deprecated("sample_data", package = "phyloseq") - return(sample_data(...)) -} -sam_data <- function(...) { - .Deprecated("sample_data", package = "phyloseq") - return(sample_data(...)) -} -speciesSums <- function(...) { - .Deprecated("taxa_sums", package = "phyloseq") - return(taxa_sums(...)) -} -sampleSums <- function(...) { - .Deprecated("sample_sums", package = "phyloseq") - return(sample_sums(...)) -} -nspecies <- function(...) { - .Deprecated("ntaxa", package = "phyloseq") - return(ntaxa(...)) -} -species.names <- function(...) { - .Deprecated("taxa_names", package = "phyloseq") - return(taxa_names(...)) -} -sampleNames <- function(...) { - .Deprecated("sample_names", package = "phyloseq") - return(sample_names(...)) -} -sample.names <- function(...) { - .Deprecated("sample_names", package = "phyloseq") - return(sample_names(...)) -} -getSamples <- function(...) { - .Deprecated("get_sample", package = "phyloseq") - return(get_sample(...)) -} -getSpecies <- function(...) { - .Deprecated("get_taxa", package = "phyloseq") - return(get_taxa(...)) -} -rank.names <- function(...) { - .Deprecated("rank_names", package = "phyloseq") - return(rank_names(...)) -} -getTaxa <- function(...) { - .Deprecated("get_taxa_unique", package = "phyloseq") - return(get_taxa_unique(...)) -} -sample.variables <- function(...) { - .Deprecated("sample_variables", package = "phyloseq") - return(sample_variables(...)) -} -getVariable <- function(...) { - .Deprecated("get_variable", package = "phyloseq") - return(get_variable(...)) -} -merge_species <- function(...) { - .Deprecated("merge_taxa", package = "phyloseq") - return(merge_taxa(...)) -} -otuTable <- function(...) { - .Deprecated("otu_table", package = "phyloseq") - return(otu_table(...)) -} -speciesarerows <- function(...) { - .Deprecated("taxa_are_rows", package = "phyloseq") - return(taxa_are_rows(...)) -} -speciesAreRows <- function(...) { - .Deprecated("taxa_are_rows", package = "phyloseq") - return(taxa_are_rows(...)) -} -plot_richness_estimates <- function(...) { - .Deprecated("plot_richness", package = "phyloseq") - return(plot_richness(...)) -} -import_qiime_sampleData <- function(...) { - .Deprecated("import_qiime_sample_data", package = "phyloseq") - return(import_qiime_sample_data(...)) -} -filterfunSample <- function(...) { - .Deprecated("filterfun_sample", package = "phyloseq") - return(filterfun_sample(...)) -} -genefilterSample <- function(...) { - .Deprecated("genefilter_sample", package = "phyloseq") - return(genefilter_sample(...)) -} -prune_species <- function(...) { - .Deprecated("prune_taxa", package = "phyloseq") - return(prune_taxa(...)) -} -subset_species <- function(...) { - .Deprecated("subset_taxa", package = "phyloseq") - return(subset_taxa(...)) -} -tipglom <- function(...) { - .Deprecated("tip_glom", package = "phyloseq") - return(tip_glom(...)) -} -taxglom <- function(...) { - .Deprecated("tax_glom", package = "phyloseq") - return(tax_glom(...)) -} -tre <- function(...) { - .Deprecated("phy_tree", package = "phyloseq") - return(phy_tree(...)) -} -show_mothur_list_cutoffs <- function(...) { - .Deprecated("show_mothur_cutoffs", package = "phyloseq") - return(show_mothur_cutoffs(...)) -} -originalUniFrac <- function(...) { - .Deprecated("fastUniFrac", package = "phyloseq") - return(fastUniFrac(...)) -} -"sam_data<-" <- function(x, value) { - .Deprecated("sample_data<-", package = "phyloseq") +deprecated_phyloseq_function <- function(x, value, ...){return(NULL)} +plot_taxa_bar <- function(...){.Deprecated("plot_bar", package="phyloseq");return(plot_bar(...))} +taxaplot <- function(...){.Deprecated("plot_bar", package="phyloseq");return(plot_bar(...))} +taxtab <- function(...){.Deprecated("tax_table", package="phyloseq");return(tax_table(...))} +taxTab <- function(...){.Deprecated("tax_table", package="phyloseq");return(tax_table(...))} +sampleData <- function(...){.Deprecated("sample_data", package="phyloseq");return(sample_data(...))} +samData <- function(...){.Deprecated("sample_data", package="phyloseq");return(sample_data(...))} +sam_data <- function(...){.Deprecated("sample_data", package="phyloseq");return(sample_data(...))} +speciesSums <- function(...){.Deprecated("taxa_sums", package="phyloseq");return(taxa_sums(...))} +sampleSums <- function(...){.Deprecated("sample_sums", package="phyloseq");return(sample_sums(...))} +nspecies <- function(...){.Deprecated("ntaxa", package="phyloseq");return(ntaxa(...))} +species.names <- function(...){.Deprecated("taxa_names", package="phyloseq");return(taxa_names(...))} +sampleNames <- function(...){.Deprecated("sample_names", package="phyloseq");return(sample_names(...))} +sample.names <- function(...){.Deprecated("sample_names", package="phyloseq");return(sample_names(...))} +getSamples <- function(...){.Deprecated("get_sample", package="phyloseq");return(get_sample(...))} +getSpecies <- function(...){.Deprecated("get_taxa", package="phyloseq");return(get_taxa(...))} +rank.names <- function(...){.Deprecated("rank_names", package="phyloseq");return(rank_names(...))} +getTaxa <- function(...){.Deprecated("get_taxa_unique", package="phyloseq");return(get_taxa_unique(...))} +sample.variables <- function(...){.Deprecated("sample_variables", package="phyloseq");return(sample_variables(...))} +getVariable <- function(...){.Deprecated("get_variable", package="phyloseq");return(get_variable(...))} +merge_species <- function(...){.Deprecated("merge_taxa", package="phyloseq");return(merge_taxa(...))} +otuTable <- function(...){.Deprecated("otu_table", package="phyloseq");return(otu_table(...))} +speciesarerows <- function(...){.Deprecated("taxa_are_rows", package="phyloseq");return(taxa_are_rows(...))} +speciesAreRows <- function(...){.Deprecated("taxa_are_rows", package="phyloseq");return(taxa_are_rows(...))} +plot_richness_estimates <- function(...){.Deprecated("plot_richness", package="phyloseq");return(plot_richness(...))} +import_qiime_sampleData <- function(...){.Deprecated("import_qiime_sample_data", package="phyloseq");return(import_qiime_sample_data(...))} +filterfunSample <- function(...){.Deprecated("filterfun_sample", package="phyloseq");return(filterfun_sample(...))} +genefilterSample <- function(...){.Deprecated("genefilter_sample", package="phyloseq");return(genefilter_sample(...))} +prune_species <- function(...){.Deprecated("prune_taxa", package="phyloseq");return(prune_taxa(...))} +subset_species <- function(...){.Deprecated("subset_taxa", package="phyloseq");return(subset_taxa(...))} +tipglom <- function(...){.Deprecated("tip_glom", package="phyloseq");return(tip_glom(...))} +taxglom <- function(...){.Deprecated("tax_glom", package="phyloseq");return(tax_glom(...))} +tre <- function(...){.Deprecated("phy_tree", package="phyloseq");return(phy_tree(...))} +show_mothur_list_cutoffs <- function(...){.Deprecated("show_mothur_cutoffs", package="phyloseq");return(show_mothur_cutoffs(...))} +originalUniFrac <- function(...){.Deprecated("fastUniFrac", package="phyloseq");return(fastUniFrac(...))} +"sam_data<-" <- function(x, value){ + .Deprecated("sample_data<-", package="phyloseq") sample_data(x) <- value return(x) } -"sampleData<-" <- function(x, value) { - .Deprecated("sample_data<-", package = "phyloseq") +"sampleData<-" <- function(x, value){ + .Deprecated("sample_data<-", package="phyloseq") sample_data(x) <- value return(x) } -"tre<-" <- function(x, value) { - .Deprecated("phy_tree<-", package = "phyloseq") +"tre<-" <- function(x, value){ + .Deprecated("phy_tree<-", package="phyloseq") phy_tree(x) <- value return(x) } -"speciesAreRows<-" <- function(x, value) { - .Deprecated("taxa_are_rows<-", package = "phyloseq") +"speciesAreRows<-" <- function(x, value){ + .Deprecated("taxa_are_rows<-", package="phyloseq") taxa_are_rows(x) <- value return(x) } -"otuTable<-" <- function(x, value) { - .Deprecated("otu_table<-", package = "phyloseq") +"otuTable<-" <- function(x, value){ + .Deprecated("otu_table<-", package="phyloseq") otu_table(x) <- value return(x) } -"taxTab<-" <- function(x, value) { - .Deprecated("tax_table<-", package = "phyloseq") +"taxTab<-" <- function(x, value){ + .Deprecated("tax_table<-", package="phyloseq") tax_table(x) <- value return(x) } -################################################################################ +################################################################################ diff --git a/R/extend_DESeq2.R b/R/extend_DESeq2.R index 532dad84..2132fa7b 100644 --- a/R/extend_DESeq2.R +++ b/R/extend_DESeq2.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Convert phyloseq data to DESeq2 dds object #' #' No testing is performed by this function. The phyloseq data is converted @@ -23,7 +23,7 @@ #' reference sample class in tests by setting it to the first of the factor levels #' using the \code{\link{relevel}} function: #' -#' \code{sample_data(entill)$Enterotype <- relevel(sample_data(entill)$Enterotype, '1')} +#' \code{sample_data(entill)$Enterotype <- relevel(sample_data(entill)$Enterotype, "1")} #' #' @param ... (Optional). Additional named arguments passed to \code{\link[DESeq2]{DESeqDataSetFromMatrix}}. #' Most users will not need to pass any additional arguments here. @@ -34,7 +34,7 @@ #' #' @seealso #' -#' \code{vignette('phyloseq-mixture-models')} +#' \code{vignette("phyloseq-mixture-models")} #' #' The #' \href{http://joey711.github.io/phyloseq-extensions}{phyloseq-extensions} @@ -50,23 +50,21 @@ #' #' @examples #' # Check out the vignette phyloseq-mixture-models for more details. -#' # vignette('phyloseq-mixture-models') +#' # vignette("phyloseq-mixture-models") #' data(soilrep) #' phyloseq_to_deseq2(soilrep, ~warmed) -phyloseq_to_deseq2 = function(physeq, design, ...) { +phyloseq_to_deseq2 = function(physeq, design, ...){ # Need to add check here for missing sample_data - if (is.null(sample_data(physeq, FALSE))) { + if( is.null(sample_data(physeq, FALSE)) ){ stop("There must be sample_data present, for specifying experimental design. See ?phyloseq_to_deseq2") } # Enforce orientation. Samples are columns - if (!taxa_are_rows(physeq)) { - physeq <- t(physeq) - } + if( !taxa_are_rows(physeq) ){ physeq <- t(physeq)} # Coerce count data to vanilla matrix of integers - countData = round(as(otu_table(physeq), "matrix"), digits = 0) + countData = round(as(otu_table(physeq), "matrix"), digits=0) colData = data.frame(sample_data(physeq)) # Create the DESeq data set, dds. dds <- DESeqDataSetFromMatrix(countData, colData, design, ...) return(dds) } -################################################################################ +################################################################################ diff --git a/R/extend_vegan.R b/R/extend_vegan.R index 01f43444..2da390ba 100644 --- a/R/extend_vegan.R +++ b/R/extend_vegan.R @@ -1,44 +1,52 @@ -################################################################################ Define S3 methods for scores (originally defined by vegan-package) to work for -################################################################################ other ordination results vegan:::scores.default pcoa-class, from pcoa{ape} +################################################################################ +# Define S3 methods for scores (originally defined by vegan-package) +# to work for other ordination results +# vegan:::scores.default +################################################################################ +# pcoa-class, from pcoa{ape} #' @importFrom vegan wascores #' @importFrom vegan scores #' @keywords internal -scores.pcoa <- function(x, choices = NULL, display = "sites", physeq = NULL, ...) { - if (is.null(choices)) { - choices <- colnames(x$vectors) - } +scores.pcoa <- function(x, choices=NULL, display="sites", physeq=NULL, ...){ + if(is.null(choices)){ + choices <- colnames(x$vectors) + } co = list(sites = x$vectors[, choices]) - if ("species" %in% display) { - if (is.null(otu_table(physeq, errorIfNULL = FALSE))) { - warning("scores.pcoa: Failed to access OTU table from `physeq` argument, \n\n needed for weighted average of OTU/taxa/species points in MDS/PCoA.") + if( "species" %in% display ){ + if(is.null(otu_table(physeq, errorIfNULL = FALSE))){ + warning("scores.pcoa: Failed to access OTU table from `physeq` argument, \n + needed for weighted average of OTU/taxa/species points in MDS/PCoA.") } else { - # MDS/PCoA only provides coordinates of the elements in the distance matrix, - # usually sites/samples, so species (etc.) This means we need to use the - # weighted-average as there is no corresponding axes from the ordination - # directly. - co$species <- wascores(x$vectors[, choices], w = veganifyOTU(physeq)) + # MDS/PCoA only provides coordinates of the elements in the + # distance matrix, usually sites/samples, so species (etc.) + # This means we need to use the weighted-average as there is + # no corresponding axes from the ordination directly. + co$species <- wascores(x$vectors[, choices], w = veganifyOTU(physeq)) } - } + } co <- co[display] - if (length(co) < 2L) { + if(length(co) < 2L){ # Unlist co <- co[[display]] } return(co) } -################################################################################ DPCoA management +################################################################################ +# DPCoA management +################################################################################ #' @importFrom vegan scores #' @keywords internal -get_dpcoa_species_coords = function(x, physeq = NULL) { +get_dpcoa_species_coords = function(x, physeq=NULL){ # Grab coordinates from the dpcoa object coords = x$dls - # ade4 mangles the element names using `make.names` conventions in base R Replace - # them in `coords` - if (is.null(taxa_names(physeq))) { - warning("scores.dpcoa: Failed to access `taxa_names` from `physeq` argument, \n\n needed to ensure correct mapping of OTU/taxa/species points in DPCoA.") + # ade4 mangles the element names using `make.names` conventions in base R + # Replace them in `coords` + if(is.null(taxa_names(physeq))){ + warning("scores.dpcoa: Failed to access `taxa_names` from `physeq` argument, \n + needed to ensure correct mapping of OTU/taxa/species points in DPCoA.") } else { - # if the names are available, use them by mapping the same variable-name - # conversion that ade4 would have used. + # if the names are available, use them + # by mapping the same variable-name conversion that ade4 would have used. taxnames = taxa_names(physeq) names(taxnames) <- make.names(taxnames) rownames(coords) <- taxnames[rownames(coords)] @@ -47,16 +55,17 @@ get_dpcoa_species_coords = function(x, physeq = NULL) { } #' @importFrom vegan scores #' @keywords internal -get_dpcoa_sites_coords = function(x, physeq = NULL) { +get_dpcoa_sites_coords = function(x, physeq=NULL){ # Grab coordinates from the dpcoa object coords = x$li - # ade4 mangles the element names using `make.names` conventions in base R Replace - # them in `coords` - if (is.null(sample_names(physeq))) { - warning("scores.dpcoa: Failed to access `sample_names` from `physeq` argument, \n\n needed to ensure correct mapping of site/sample/library points in DPCoA.") + # ade4 mangles the element names using `make.names` conventions in base R + # Replace them in `coords` + if(is.null(sample_names(physeq))){ + warning("scores.dpcoa: Failed to access `sample_names` from `physeq` argument, \n + needed to ensure correct mapping of site/sample/library points in DPCoA.") } else { - # if the names are available, use them by mapping the same variable-name - # conversion that ade4 would have used. + # if the names are available, use them + # by mapping the same variable-name conversion that ade4 would have used. samplenames = sample_names(physeq) names(samplenames) <- make.names(samplenames) rownames(coords) <- samplenames[rownames(coords)] @@ -66,49 +75,64 @@ get_dpcoa_sites_coords = function(x, physeq = NULL) { # dpcoa-class, from ade4 #' @importFrom vegan scores #' @keywords internal -scores.dpcoa <- function(x, choices = NULL, display = "sites", physeq = NULL, ...) { - # x = ordination display = 'species' +scores.dpcoa <- function(x, choices=NULL, display="sites", physeq=NULL, ...){ + # x = ordination + # display = "species" coords = NULL - # `display` must be either 'sites' or 'species', per vegan-package convention. - coords <- switch(EXPR = display, species = get_dpcoa_species_coords(x, physeq), - sites = get_dpcoa_sites_coords(x, physeq)) + # `display` must be either "sites" or "species", per vegan-package convention. + coords <- switch(EXPR = display, + species = get_dpcoa_species_coords(x, physeq), + sites = get_dpcoa_sites_coords(x, physeq)) # If no choices selection, take all dimensions/columns - if (is.null(choices)) { - choices <- 1:ncol(coords) + if(is.null(choices)){ + choices <- 1:ncol(coords) } - return(coords[, choices, drop = FALSE]) + return( coords[, choices, drop=FALSE] ) } -################################################################################ Extend vegdist for phyloseq classes \code{\link[vegan]{vegdist}} wrapper for -################################################################################ phyloseq classes Trivially-extended S4 method from the -################################################################################ \code{\link[vegan]{vegdist}} function, such that S4 classes from the -################################################################################ \code{\link{phyloseq-package}} are properly handled / accessed. All -################################################################################ parameters passed on to \code{\link[vegan]{vegdist}} verbatim. @seealso -################################################################################ \code{\link[vegan]{vegdist}} @rdname vegdist-methods @docType methods -################################################################################ @aliases vegdist @examples data(esophagus) vegdist(esophagus, 'jaccard') +################################################################################ +# Extend vegdist for phyloseq classes +################################################################################ +# \code{\link[vegan]{vegdist}} wrapper for phyloseq classes +# +# Trivially-extended S4 method from the \code{\link[vegan]{vegdist}} function, +# such that S4 classes from the \code{\link{phyloseq-package}} are properly +# handled / accessed. All parameters passed on to \code{\link[vegan]{vegdist}} +# verbatim. +# +# @seealso \code{\link[vegan]{vegdist}} +# @rdname vegdist-methods +# @docType methods +# @aliases vegdist +# +# @examples +# data(esophagus) +# vegdist(esophagus, "jaccard") #' @importFrom vegan vegdist #' @keywords internal setGeneric("vegdist") -################################################################################ @aliases vegdist,otu_table-method @rdname vegdist-methods +################################################################################ +# @aliases vegdist,otu_table-method +# @rdname vegdist-methods #' @importFrom vegan vegdist -setMethod("vegdist", "otu_table", function(x, method = "bray", binary = FALSE, diag = FALSE, - upper = FALSE, na.rm = FALSE, ...) { - # Make sure in sample-by-species orientation - if (taxa_are_rows(x)) { - x <- t(x) - } - # Convert to simple matrix - x <- as(x, "matrix") - # pass to standard method (compiled C) - vegdist(x, method, binary, diag, upper, na.rm, ...) +setMethod("vegdist", "otu_table", function(x, method = "bray", binary = FALSE, + diag = FALSE, upper = FALSE, na.rm = FALSE, ...){ + # Make sure in sample-by-species orientation + if( taxa_are_rows(x) ){x <- t(x)} + # Convert to simple matrix + x <- as(x, "matrix") + # pass to standard method (compiled C) + vegdist(x, method, binary, diag, upper, na.rm, ...) }) -################################################################################ @aliases vegdist,phyloseq-method @rdname vegdist-methods -setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE, diag = FALSE, - upper = FALSE, na.rm = FALSE, ...) { - # Simply access the otu_table - x <- otu_table(x) - vegdist(x, method, binary, diag, upper, na.rm, ...) +################################################################################ +# @aliases vegdist,phyloseq-method +# @rdname vegdist-methods +setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE, + diag = FALSE, upper = FALSE, na.rm = FALSE, ...){ + # Simply access the otu_table + x <- otu_table(x) + vegdist(x, method, binary, diag, upper, na.rm, ...) }) -################################################################################ +################################################################################ #' Summarize alpha diversity #' #' Performs a number of standard alpha diversity estimates, @@ -136,7 +160,7 @@ setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE, di #' Alternatively, you can specify one or more measures #' as a character vector of measure names. #' Values must be among those supported: -#' \code{c('Observed', 'Chao1', 'ACE', 'Shannon', 'Simpson', 'InvSimpson', 'Fisher')}. +#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}. #' #' @return A \code{data.frame} of the richness estimates, and their standard error. #' @@ -159,91 +183,92 @@ setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE, di #' @examples #' ## There are many more interesting examples at the phyloseq online tutorials. #' ## http://joey711.github.com/phyloseq/plot_richness-examples -#' data('esophagus') +#' data("esophagus") #' # Default is all available measures #' estimate_richness(esophagus) #' # Specify just one: -#' estimate_richness(esophagus, measures='Observed') +#' estimate_richness(esophagus, measures="Observed") #' # Specify a few: -#' estimate_richness(esophagus, measures=c('Observed', 'InvSimpson', 'Shannon', 'Chao1')) -estimate_richness <- function(physeq, split = TRUE, measures = NULL) { - - if (!any(otu_table(physeq) == 1)) { - # Check for singletons, and then warning if they are missing. These metrics only - # really meaningful if singletons are included. - warning("The data you have provided does not have\n", "any singletons. This is highly suspicious. Results of richness\n", - "estimates (for example) are probably unreliable, or wrong, if you have already\n", - "trimmed low-abundance taxa from the data.\n", "\n", "We recommended that you find the un-trimmed data and retry.") - } - - # If we are not splitting sample-wise, sum the species. Else, enforce - # orientation. - if (!split) { - OTU <- taxa_sums(physeq) - } else if (split) { - OTU <- as(otu_table(physeq), "matrix") - if (taxa_are_rows(physeq)) { - OTU <- t(OTU) - } - } - - # Define renaming vector: - renamevec = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", - "Fisher") - names(renamevec) <- c("S.obs", "S.chao1", "S.ACE", "shannon", "simpson", "invsimpson", - "fisher") - # If measures was not explicitly provided (is NULL), set to all supported methods - if (is.null(measures)) { - measures = as.character(renamevec) - } +#' estimate_richness(esophagus, measures=c("Observed", "InvSimpson", "Shannon", "Chao1")) +estimate_richness <- function(physeq, split=TRUE, measures=NULL){ + + if( !any(otu_table(physeq)==1) ){ + # Check for singletons, and then warning if they are missing. + # These metrics only really meaningful if singletons are included. + warning( + "The data you have provided does not have\n", + "any singletons. This is highly suspicious. Results of richness\n", + "estimates (for example) are probably unreliable, or wrong, if you have already\n", + "trimmed low-abundance taxa from the data.\n", + "\n", + "We recommended that you find the un-trimmed data and retry." + ) + } + + # If we are not splitting sample-wise, sum the species. Else, enforce orientation. + if( !split ){ + OTU <- taxa_sums(physeq) + } else if( split ){ + OTU <- as(otu_table(physeq), "matrix") + if( taxa_are_rows(physeq) ){ OTU <- t(OTU) } + } + + # Define renaming vector: + renamevec = c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher") + names(renamevec) <- c("S.obs", "S.chao1", "S.ACE", "shannon", "simpson", "invsimpson", "fisher") + # If measures was not explicitly provided (is NULL), set to all supported methods + if( is.null(measures) ){ + measures = as.character(renamevec) + } # Rename measures if they are in the old-style - if (any(measures %in% names(renamevec))) { - measures[measures %in% names(renamevec)] <- renamevec[names(renamevec) %in% - measures] + if( any(measures %in% names(renamevec)) ){ + measures[measures %in% names(renamevec)] <- renamevec[names(renamevec) %in% measures] } # Stop with error if no measures are supported - if (!any(measures %in% renamevec)) { + if( !any(measures %in% renamevec) ){ stop("None of the `measures` you provided are supported. Try default `NULL` instead.") } # Initialize to NULL outlist = vector("list") - # Some standard diversity indices + # Some standard diversity indices estimRmeas = c("Chao1", "Observed", "ACE") - if (any(estimRmeas %in% measures)) { + if( any(estimRmeas %in% measures) ){ outlist <- c(outlist, list(t(data.frame(estimateR(OTU))))) - } - if ("Shannon" %in% measures) { - outlist <- c(outlist, list(shannon = diversity(OTU, index = "shannon"))) - } - if ("Simpson" %in% measures) { - outlist <- c(outlist, list(simpson = diversity(OTU, index = "simpson"))) - } - if ("InvSimpson" %in% measures) { - outlist <- c(outlist, list(invsimpson = diversity(OTU, index = "invsimpson"))) - } - if ("Fisher" %in% measures) { - fisher = tryCatch(fisher.alpha(OTU, se = TRUE), warning = function(w) { - warning("phyloseq::estimate_richness: Warning in fisher.alpha(). See `?fisher.fit` or ?`fisher.alpha`. Treat fisher results with caution") - suppressWarnings(fisher.alpha(OTU, se = TRUE)[, c("alpha", "se")]) - }) - if (!is.null(dim(fisher))) { + } + if( "Shannon" %in% measures ){ + outlist <- c(outlist, list(shannon = diversity(OTU, index="shannon"))) + } + if( "Simpson" %in% measures ){ + outlist <- c(outlist, list(simpson = diversity(OTU, index="simpson"))) + } + if( "InvSimpson" %in% measures ){ + outlist <- c(outlist, list(invsimpson = diversity(OTU, index="invsimpson"))) + } + if( "Fisher" %in% measures ){ + fisher = tryCatch(fisher.alpha(OTU, se=TRUE), + warning=function(w){ + warning("phyloseq::estimate_richness: Warning in fisher.alpha(). See `?fisher.fit` or ?`fisher.alpha`. Treat fisher results with caution") + suppressWarnings(fisher.alpha(OTU, se=TRUE)[, c("alpha", "se")]) + } + ) + if(!is.null(dim(fisher))){ colnames(fisher)[1:2] <- c("Fisher", "se.fisher") outlist <- c(outlist, list(fisher)) } else { - outlist <- c(outlist, Fisher = list(fisher)) + outlist <- c(outlist, Fisher=list(fisher)) } - } + } out = do.call("cbind", outlist) # Rename columns per renamevec namechange = intersect(colnames(out), names(renamevec)) colnames(out)[colnames(out) %in% namechange] <- renamevec[namechange] - # Final prune to just those columns related to 'measures'. Use grep. - colkeep = sapply(paste0("(se\\.){0,}", measures), grep, colnames(out), ignore.case = TRUE) - out = out[, sort(unique(unlist(colkeep))), drop = FALSE] + # Final prune to just those columns related to "measures". Use grep. + colkeep = sapply(paste0("(se\\.){0,}", measures), grep, colnames(out), ignore.case=TRUE) + out = out[, sort(unique(unlist(colkeep))), drop=FALSE] # Make sure that you return a data.frame for reliable performance. out <- as.data.frame(out) - return(out) + return(out) } -################################################################################ +################################################################################ diff --git a/R/extract-methods.R b/R/extract-methods.R index b46c294a..e334d752 100644 --- a/R/extract-methods.R +++ b/R/extract-methods.R @@ -1,5 +1,7 @@ -################################################################################ subsetting functions Without these, the default coerces to the base object -################################################################################ (e.g. matrix or data.frame) +################################################################################ +# subsetting functions +# Without these, the default coerces to the base object (e.g. matrix or data.frame) +################################################################################ #' Method extensions to extraction operator for phyloseq objects. #' #' See the documentation for the \code{\link[base]{Extract}} generic, @@ -29,40 +31,44 @@ #' data(esophagus) #' nrow(otu_table(esophagus)) #' nrow(otu_table(esophagus)[1:5, ]) -setMethod("[", "otu_table", function(x, i, j, ...) { - newx <- as(x, "matrix")[i, j, drop = FALSE] - otu_table(newx, taxa_are_rows(x)) +setMethod("[", "otu_table", function(x, i, j, ...){ + newx <- as(x, "matrix")[i, j, drop=FALSE] + otu_table(newx, taxa_are_rows(x) ) }) # extract parts of sample_data +# #' @export #' @rdname extract-methods -setMethod("[", "sample_data", function(x, i, j, ...) { - sample_data(data.frame(x)[i, j, drop = FALSE]) +setMethod("[", "sample_data", function(x, i, j, ...){ + sample_data( data.frame(x)[i, j, drop=FALSE] ) }) # extract parts of taxonomyTable +# #' @export #' @rdname extract-methods -setMethod("[", "taxonomyTable", function(x, i, j, ...) { +setMethod("[", "taxonomyTable", function(x, i, j, ...){ # Coerce to matrix, apply std extraction, reconstruct. - return(tax_table(as(x, "matrix")[i, j, drop = FALSE])) + return( tax_table(as(x, "matrix")[i, j, drop=FALSE]) ) }) -# A numeric extraction method is already defined in Biostrings for XStringSet Add -# name-character-based extraction method for XStringSet +# A numeric extraction method is already defined in Biostrings for XStringSet +# Add name-character-based extraction method for XStringSet +# #' @importClassesFrom Biostrings XStringSet #' @export #' @rdname extract-methods -setMethod("[", c("XStringSet", "character"), function(x, i) { - index_vector = match(i, names(x), nomatch = NA_integer_) - index_vector = index_vector[!is.na(index_vector)] - if (length(index_vector) <= 0) { - warning("[,XStringSet: no valid seq-indices provided, NULL returned") - return(NULL) - } - if (length(index_vector) < length(i)) { - warning("[,XStringSet: some seq-name indices invalid, omitted.") - } - # index_vector is an integer, subsetting now dispatches to standard - x = x[index_vector] - return(x) +setMethod("[", c("XStringSet", "character"), function(x, i){ + index_vector = match(i, names(x), nomatch=NA_integer_) + index_vector = index_vector[!is.na(index_vector)] + if( length(index_vector) <= 0 ){ + warning("[,XStringSet: no valid seq-indices provided, NULL returned") + return(NULL) + } + if( length(index_vector) < length(i) ){ + warning("[,XStringSet: some seq-name indices invalid, omitted.") + } + # index_vector is an integer, subsetting now dispatches to standard + x = x[index_vector] + return(x) }) -################################################################################ +################################################################################ +################################################################################ diff --git a/R/merge-methods.R b/R/merge-methods.R index 2841a63c..4f4ce200 100644 --- a/R/merge-methods.R +++ b/R/merge-methods.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Merge arguments into one phyloseq object. #' #' Takes a comma-separated list of phyloseq objects as arguments, @@ -47,9 +47,9 @@ #' @examples # #' ## # Make a random complex object #' ## OTU1 <- otu_table(matrix(sample(0:5,250,TRUE),25,10), taxa_are_rows=TRUE) -#' ## tax1 <- tax_table(matrix('abc', 30, 8)) +#' ## tax1 <- tax_table(matrix("abc", 30, 8)) #' ## map1 <- data.frame( matrix(sample(0:3,250,TRUE),25,10), -#' ## matrix(sample(c('a','b','c'),150,TRUE), 25, 6) ) +#' ## matrix(sample(c("a","b","c"),150,TRUE), 25, 6) ) #' ## map1 <- sample_data(map1) #' ## exam1 <- phyloseq(OTU1, map1, tax1) #' ## x <- exam1 @@ -57,39 +57,38 @@ #' ## y <- tax_table(exam1) #' ## merge_phyloseq(x, y) #' ## merge_phyloseq(y, y, y, y) -merge_phyloseq <- function(...) { - arguments <- list(...) - # create list of all components of all objects - comp.list <- list() - for (i in 1:length(arguments)) { - comp.list <- c(comp.list, splat.phyloseq.objects(arguments[[i]])) - } - # loop through each component type. Note, list names redundant. will use this - merged.list <- list() - for (i in unique(names(comp.list))) { - # i='tax_table' check if length 1, if so, cat to merged.list. - i.list <- comp.list[names(comp.list) == i] - if (length(i.list) == 1) { - merged.list <- c(merged.list, i.list) - } else { - # else, loop through each identically-named objects. - x1 <- i.list[[1]] - for (j in 2:length(i.list)) { - x1 <- merge_phyloseq_pair(x1, i.list[[j]]) - } - x1 <- list(x1) - names(x1) <- i - merged.list <- c(merged.list, x1) - } - } - # Remove names to avoid any conflicts with phyloseq(), which does not need - # named-arguments - names(merged.list) <- NULL - - # Use do.call for calling this variable-length, variable-content argument list. - return(do.call(phyloseq, merged.list)) +merge_phyloseq <- function(...){ + arguments <- list(...) + # create list of all components of all objects + comp.list <- list() + for( i in 1:length(arguments) ){ + comp.list <- c(comp.list, splat.phyloseq.objects(arguments[[i]])) + } + # loop through each component type. Note, list names redundant. will use this + merged.list <- list() + for( i in unique(names(comp.list)) ){ #i="tax_table" + # check if length 1, if so, cat to merged.list. + i.list <- comp.list[names(comp.list)==i] + if( length(i.list) == 1 ){ + merged.list <- c(merged.list, i.list) + } else { + # else, loop through each identically-named objects. + x1 <- i.list[[1]] + for( j in 2:length(i.list)){ + x1 <- merge_phyloseq_pair(x1, i.list[[j]]) + } + x1 <- list(x1) + names(x1) <- i + merged.list <- c(merged.list, x1) + } + } + # Remove names to avoid any conflicts with phyloseq(), which does not need named-arguments + names(merged.list) <- NULL + + # Use do.call for calling this variable-length, variable-content argument list. + return( do.call(phyloseq, merged.list) ) } -################################################################################ +################################################################################ #' Merge pair of phyloseq component data objects of the same class. #' #' Internal S4 methods to combine pairs of objects of classes specified in the @@ -132,138 +131,129 @@ merge_phyloseq <- function(...) { #' ## xy <- merge_phyloseq_pair(x, y) #' ## yx <- merge_phyloseq_pair(y, x) #' ## # merge two simulated tax_table objects -#' ## x <- tax_table(matrix('abc', 20, 6)) -#' ## y <- tax_table(matrix('def', 30, 8)) +#' ## x <- tax_table(matrix("abc", 20, 6)) +#' ## y <- tax_table(matrix("def", 30, 8)) #' ## xy <- merge_phyloseq_pair(x, y) #' ## # merge two simulated sample_data objects #' ## x <- data.frame( matrix(sample(0:3,250,TRUE),25,10), -#' ## matrix(sample(c('a','b','c'),150,TRUE),25,6) ) +#' ## matrix(sample(c("a","b","c"),150,TRUE),25,6) ) #' ## x <- sample_data(x) #' ## y <- data.frame( matrix(sample(4:6,200,TRUE),20,10), -#' ## matrix(sample(c('d','e','f'),120,TRUE),20,8) ) +#' ## matrix(sample(c("d","e","f"),120,TRUE),20,8) ) #' ## y <- sample_data(y) #' ## merge_phyloseq_pair(x, y) #' ## data.frame(merge_phyloseq_pair(x, y)) #' ## data.frame(merge_phyloseq_pair(y, x)) setGeneric("merge_phyloseq_pair", function(x, y) standardGeneric("merge_phyloseq_pair")) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,otu_table,otu_table-method #' @rdname merge_phyloseq_pair-methods -setMethod("merge_phyloseq_pair", signature("otu_table", "otu_table"), function(x, - y) { - specRrowsx <- taxa_are_rows(x) - new.sp.names <- union(taxa_names(x), taxa_names(y)) - new.sa.names <- union(sample_names(x), sample_names(y)) - - # Create the empty new matrix structure - newx <- matrix(0, nrow = length(new.sp.names), ncol = length(new.sa.names), dimnames = list(new.sp.names, - new.sa.names)) - - # assign a standard taxa_are_rows orientation to TRUE for x and y - if (!taxa_are_rows(x)) { - x <- t(x) - } - if (!taxa_are_rows(y)) { - y <- t(y) - } - - # 'merge' by addition. - newx[rownames(x), colnames(x)] <- x - newx[rownames(y), colnames(y)] <- newx[rownames(y), colnames(y)] + y - - # Create the new otu_table object - newx <- otu_table(newx, taxa_are_rows = TRUE) - - # Return the orientation that was in x - if (!specRrowsx) { - newx <- t(newx) - } - return(newx) +setMethod("merge_phyloseq_pair", signature("otu_table", "otu_table"), function(x, y){ + specRrowsx <- taxa_are_rows(x) + new.sp.names <- union(taxa_names(x), taxa_names(y)) + new.sa.names <- union(sample_names(x), sample_names(y)) + + # Create the empty new matrix structure + newx <- matrix(0, nrow=length(new.sp.names), ncol=length(new.sa.names), + dimnames=list(new.sp.names, new.sa.names)) + + # assign a standard taxa_are_rows orientation to TRUE for x and y + if( !taxa_are_rows(x) ){ x <- t(x) } + if( !taxa_are_rows(y) ){ y <- t(y) } + + # "merge" by addition. + newx[rownames(x), colnames(x)] <- x + newx[rownames(y), colnames(y)] <- newx[rownames(y), colnames(y)] + y + + # Create the new otu_table object + newx <- otu_table(newx, taxa_are_rows=TRUE) + + # Return the orientation that was in x + if( !specRrowsx ){ newx <- t(newx) } + return(newx) }) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,taxonomyTable,taxonomyTable-method #' @rdname merge_phyloseq_pair-methods -setMethod("merge_phyloseq_pair", signature("taxonomyTable", "taxonomyTable"), function(x, - y) { - new.sp.names <- union(rownames(x), rownames(y)) - new.ta.names <- union(colnames(x), colnames(y)) - - # Create the empty new matrix structure - newx <- matrix(NA, nrow = length(new.sp.names), ncol = length(new.ta.names), - dimnames = list(new.sp.names, new.ta.names)) - - # 'merge'. Overwrite with x information. - newx[rownames(y), colnames(y)] <- y - newx[rownames(x), colnames(x)] <- x - - # Create the new otu_table object - newx <- tax_table(newx) - - return(newx) +setMethod("merge_phyloseq_pair", signature("taxonomyTable", "taxonomyTable"), function(x, y){ + new.sp.names <- union(rownames(x), rownames(y)) + new.ta.names <- union(colnames(x), colnames(y)) + + # Create the empty new matrix structure + newx <- matrix(NA, nrow=length(new.sp.names), ncol=length(new.ta.names), + dimnames=list(new.sp.names, new.ta.names)) + + # "merge". Overwrite with x information. + newx[rownames(y), colnames(y)] <- y + newx[rownames(x), colnames(x)] <- x + + # Create the new otu_table object + newx <- tax_table(newx) + + return(newx) }) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,sample_data,sample_data-method #' @rdname merge_phyloseq_pair-methods -setMethod("merge_phyloseq_pair", signature("sample_data", "sample_data"), function(x, - y) { - new.sa.names <- union(rownames(x), rownames(y)) - new.va.names <- union(colnames(x), colnames(y)) - - partx <- data.frame(X0 = rownames(x), x) - party <- data.frame(X0 = rownames(y), y) - newx <- merge(partx, party, all = TRUE) - # now we have the correct template, lets remove redundant rows. - keep.samp.rows <- sapply(unique(as.character(newx[, 1])), function(i, nx) { - rownames(subset(nx, X0 == i))[1] - }, newx) - newx <- newx[keep.samp.rows, ] - rownames(newx) <- as.character(newx$X0) - - # 'merge'. Overwrite with x information. - newx[rownames(y), colnames(y)] <- data.frame(y) - newx[rownames(x), colnames(x)] <- data.frame(x) - - # trim the sample name column - newx <- newx[, names(newx) != "X0"] - - # Create the new otu_table object - newx <- sample_data(newx) - return(newx) +setMethod("merge_phyloseq_pair", signature("sample_data", "sample_data"), function(x, y){ + new.sa.names <- union(rownames(x), rownames(y)) + new.va.names <- union(colnames(x), colnames(y)) + + partx <- data.frame("X0"=rownames(x), x) + party <- data.frame("X0"=rownames(y), y) + newx <- merge(partx, party, all=TRUE) + # now we have the correct template, lets remove redundant rows. + keep.samp.rows <- sapply(unique(as.character(newx[,1])), function(i,nx){ + rownames(subset(nx, X0==i))[1] + },newx) + newx <- newx[keep.samp.rows,] + rownames(newx) <- as.character(newx$"X0") + + # "merge". Overwrite with x information. + newx[rownames(y), colnames(y)] <- data.frame(y) + newx[rownames(x), colnames(x)] <- data.frame(x) + + # trim the sample name column + newx <- newx[,names(newx)!="X0"] + + # Create the new otu_table object + newx <- sample_data(newx) + return(newx) }) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,phylo,phylo-method #' @rdname merge_phyloseq_pair-methods #' @importFrom ape consensus -setMethod("merge_phyloseq_pair", signature("phylo", "phylo"), function(x, y) { - if (identical(x, y)) { - return(x) - } else { - return(consensus(x, y)) - } +setMethod("merge_phyloseq_pair", signature("phylo", "phylo"), function(x, y){ + if(identical(x, y)){ + return(x) + } else { + return( consensus(x, y) ) + } }) -################################################################################ +################################################################################ #' @aliases merge_phyloseq_pair,XStringSet,XStringSet-method #' @rdname merge_phyloseq_pair-methods -setMethod("merge_phyloseq_pair", signature("XStringSet", "XStringSet"), function(x, - y) { - if (class(x) != class(y)) { - # if class of x and y don't match, throw warning, try anyway (just in case) - warning("For merging reference sequence objects, x and y should be same type.\n", - "That is, the same subclass of XStringSet. e.g. both DNAStringSet.\n", - "Try coercing each to the same compatible class prior to merge.") - } - # Add to x the stuff that is in y, but not in x - add_y_taxa = setdiff(taxa_names(y), taxa_names(x)) - if (length(add_y_taxa) < 1L) { - # If there is nothing from y to add, just return x as-is - return(x) - } else { - # Else, add unique stuff from y only to x (they are both lists!) - x = c(x, y[add_y_taxa]) - return(x) - } +setMethod("merge_phyloseq_pair", signature("XStringSet", "XStringSet"), function(x, y){ + if( class(x) != class(y) ){ + # if class of x and y don't match, throw warning, try anyway (just in case) + warning("For merging reference sequence objects, x and y should be same type.\n", + "That is, the same subclass of XStringSet. e.g. both DNAStringSet.\n", + "Try coercing each to the same compatible class prior to merge.") + } + # Add to x the stuff that is in y, but not in x + add_y_taxa = setdiff(taxa_names(y), taxa_names(x)) + if( length(add_y_taxa) < 1L ){ + # If there is nothing from y to add, just return x as-is + return(x) + } else { + # Else, add unique stuff from y only to x (they are both lists!) + x = c(x, y[add_y_taxa]) + return(x) + } }) -################################################################################ +################################################################################ +################################################################################ #' Merge a subset of the species in \code{x} into one species/taxa/OTU. #' #' Takes as input an object that describes species/taxa @@ -312,169 +302,163 @@ setMethod("merge_phyloseq_pair", signature("XStringSet", "XStringSet"), function #' otutree0 <- phyloseq(otu, tree) #' # plot_tree(otutree0) #' otutree1 <- merge_taxa(otutree0, 1:8, 2) -#' # plot_tree(esophagus, ladderize='left') -setGeneric("merge_taxa", function(x, eqtaxa, archetype = 1L) standardGeneric("merge_taxa")) -################################################################################ +#' # plot_tree(esophagus, ladderize="left") +setGeneric("merge_taxa", function(x, eqtaxa, archetype=1L) standardGeneric("merge_taxa")) +################################################################################ #' @keywords internal -merge_taxa.indices.internal = function(x, eqtaxa, archetype) { - ## If eqtaxa or archetype are character, interpret them to be OTUs and coerce them - ## to integer indices - if (is.character(archetype)) { - # If archetype is already an OTU, just assign it to keepIndex - keepIndex = which(taxa_names(x) %in% archetype[1L]) - } else { - # Else archetype is the numeric index of the eqtaxa that should be kept. Need to - # grab from unmodifed eqtaxa, and then decide - archetype = eqtaxa[as.integer(archetype[1L])] - if (is.character(archetype)) { - # If archetype is now an OTU name, find the index and assign to keepIndex - keepIndex = which(taxa_names(x) == archetype[1L]) - } else { - # Otherwise, assume it is a taxa index, and assign to keepIndex - keepIndex = as.integer(archetype) - } - } - # Ensure eqtaxa is the integer indices of the taxa that are being merged together - if (is.character(eqtaxa)) { - # assume OTU name, index it against the OTU names in x - eqtaxa = which(taxa_names(x) %in% eqtaxa) - } else { - # Else assume numeric index of the OTU that are being merged - eqtaxa = as.integer(eqtaxa) - } - # keepIndex is index of the OTU that is kept / everything merged into. It must - # be among the set of indices in eqtaxa or there is a logic error. Stop. - if (length(keepIndex) <= 0L) { - stop("invalid archetype provided.") - } - if (!keepIndex %in% eqtaxa) { - stop("invalid archetype provided. It is not part of eqtaxa.") - } - # removeIndex is the index of each OTU that will be removed - removeIndex = setdiff(eqtaxa, keepIndex) - # Check that indices are valid - allIndices = unlist(list(keepIndex, removeIndex)) - if (any(allIndices > ntaxa(x) | allIndices < 0L)) { - stop("invalid OTU indices provided as eqtaxa or archetype.") - } - return(list(removeIndex = removeIndex, keepIndex = keepIndex)) +merge_taxa.indices.internal = function(x, eqtaxa, archetype){ + ## If eqtaxa or archetype are character, interpret them to be OTUs and coerce them to integer indices + if( is.character(archetype) ){ + # If archetype is already an OTU, just assign it to keepIndex + keepIndex = which(taxa_names(x) %in% archetype[1L]) + } else { + # Else archetype is the numeric index of the eqtaxa that should be kept. + # Need to grab from unmodifed eqtaxa, and then decide + archetype = eqtaxa[as.integer(archetype[1L])] + if( is.character(archetype) ){ + # If archetype is now an OTU name, find the index and assign to keepIndex + keepIndex = which(taxa_names(x) == archetype[1L]) + } else { + # Otherwise, assume it is a taxa index, and assign to keepIndex + keepIndex = as.integer(archetype) + } + } + # Ensure eqtaxa is the integer indices of the taxa that are being merged together + if( is.character(eqtaxa) ){ + # assume OTU name, index it against the OTU names in x + eqtaxa = which(taxa_names(x) %in% eqtaxa) + } else { + # Else assume numeric index of the OTU that are being merged + eqtaxa = as.integer(eqtaxa) + } + # keepIndex is index of the OTU that is kept / everything merged into. + # It must be among the set of indices in eqtaxa or there is a logic error. Stop. + if( length(keepIndex) <= 0L ){ stop("invalid archetype provided.") } + if( !keepIndex %in% eqtaxa ){ stop("invalid archetype provided. It is not part of eqtaxa.") } + # removeIndex is the index of each OTU that will be removed + removeIndex = setdiff(eqtaxa, keepIndex) + # Check that indices are valid + allIndices = unlist(list(keepIndex, removeIndex)) + if( any(allIndices > ntaxa(x) | allIndices < 0L) ){ + stop("invalid OTU indices provided as eqtaxa or archetype.") + } + return(list(removeIndex=removeIndex, keepIndex=keepIndex)) } -################################################################################ +################################################################################ #' @aliases merge_taxa,phyloseq-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "phyloseq", function(x, eqtaxa, archetype = eqtaxa[which.max(taxa_sums(x)[eqtaxa])]) { +setMethod("merge_taxa", "phyloseq", function(x, eqtaxa, + archetype=eqtaxa[which.max(taxa_sums(x)[eqtaxa])]){ - comp_list <- splat.phyloseq.objects(x) - merged_list <- lapply(comp_list, merge_taxa, eqtaxa, archetype) - # the element names can wreak havoc on do.call - names(merged_list) <- NULL - # Re-instantiate the combined object using the species-merged object. - do.call("phyloseq", merged_list) + comp_list <- splat.phyloseq.objects(x) + merged_list <- lapply(comp_list, merge_taxa, eqtaxa, archetype) + # the element names can wreak havoc on do.call + names(merged_list) <- NULL + # Re-instantiate the combined object using the species-merged object. + do.call("phyloseq", merged_list) }) -############################################################################### Don't need to merge anything for sample_data. Return As-is. +############################################################################### +# Don't need to merge anything for sample_data. Return As-is. #' @aliases merge_taxa,sample_data-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "sample_data", function(x, eqtaxa, archetype = 1L) { - return(x) +setMethod("merge_taxa", "sample_data", function(x, eqtaxa, archetype=1L){ + return(x) }) -############################################################################### +############################################################################### #' @aliases merge_taxa,otu_table-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "otu_table", function(x, eqtaxa, archetype = eqtaxa[which.max(taxa_sums(x)[eqtaxa])]) { +setMethod("merge_taxa", "otu_table", function(x, eqtaxa, + archetype=eqtaxa[which.max(taxa_sums(x)[eqtaxa])]){ - if (length(eqtaxa) < 2) { - return(x) - } - indList = merge_taxa.indices.internal(x, eqtaxa, archetype) - removeIndex = indList$removeIndex - keepIndex = indList$keepIndex - # Merge taxa by summing all the equivalent taxa and assigning to the one in - # keepIndex - if (taxa_are_rows(x)) { - x[keepIndex, ] = colSums(x[eqtaxa, ]) - } else { - x[, keepIndex] = rowSums(x[, eqtaxa]) - } - # For speed, use matrix subsetting instead of prune_taxa() - if (taxa_are_rows(x)) { - x = x[-removeIndex, , drop = FALSE] - } else { - x = x[, -removeIndex, drop = FALSE] - } - return(x) + if( length(eqtaxa) < 2 ){ + return(x) + } + indList = merge_taxa.indices.internal(x, eqtaxa, archetype) + removeIndex = indList$removeIndex + keepIndex = indList$keepIndex + # Merge taxa by summing all the equivalent taxa and assigning to the one in keepIndex + if( taxa_are_rows(x) ){ + x[keepIndex, ] = colSums(x[eqtaxa, ]) + } else { + x[, keepIndex] = rowSums(x[, eqtaxa]) + } + # For speed, use matrix subsetting instead of prune_taxa() + if (taxa_are_rows(x)) { + x = x[-removeIndex, , drop = FALSE] + } else { + x = x[, -removeIndex, drop = FALSE] + } + return(x) }) -############################################################################### +############################################################################### #' @importFrom ape drop.tip #' @aliases merge_taxa,phylo-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "phylo", function(x, eqtaxa, archetype = 1L) { - # If there is nothing to merge, return x as-is - if (length(eqtaxa) < 2) { - return(x) - } - indList = merge_taxa.indices.internal(x, eqtaxa, archetype) - removeIndex = indList$removeIndex - # If there is too much to merge (tree would have 1 or 0 branches), return - # NULL/warning - if (length(removeIndex) >= (ntaxa(x) - 1)) { - # Can't have a tree with 1 or fewer tips - warning("merge_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.") - return(NULL) - # Else, drop the removeIndex tips and returns the pruned tree. - } else { - return(drop.tip(x, removeIndex)) - } +setMethod("merge_taxa", "phylo", function(x, eqtaxa, archetype=1L){ + # If there is nothing to merge, return x as-is + if( length(eqtaxa) < 2 ){ + return(x) + } + indList = merge_taxa.indices.internal(x, eqtaxa, archetype) + removeIndex = indList$removeIndex + # If there is too much to merge (tree would have 1 or 0 branches), return NULL/warning + if( length(removeIndex) >= (ntaxa(x)-1) ){ + # Can't have a tree with 1 or fewer tips + warning("merge_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.") + return(NULL) + # Else, drop the removeIndex tips and returns the pruned tree. + } else { + return( drop.tip(x, removeIndex) ) + } }) -############################################################################### +############################################################################### #' @importClassesFrom Biostrings XStringSet #' @aliases merge_taxa,XStringSet-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "XStringSet", function(x, eqtaxa, archetype = 1L) { - # If there is nothing to merge, return x as-is - if (length(eqtaxa) < 2) { - return(x) - } - indList = merge_taxa.indices.internal(x, eqtaxa, archetype) - removeIndex = indList$removeIndex - # If there is too much to merge (refseq would have 0 sequences), return - # NULL/warning - if (length(removeIndex) >= ntaxa(x)) { - # Can't have a refseq list with less - warning("merge_taxa attempted to reduce reference sequence list to 0 sequences.\n refseq replaced with NULL.") - return(NULL) - } else { - # Else, drop the removeIndex sequences and returns the pruned XStringSet object - x <- x[-removeIndex] - return(x) - } +setMethod("merge_taxa", "XStringSet", function(x, eqtaxa, archetype=1L){ + # If there is nothing to merge, return x as-is + if( length(eqtaxa) < 2 ){ + return(x) + } + indList = merge_taxa.indices.internal(x, eqtaxa, archetype) + removeIndex = indList$removeIndex + # If there is too much to merge (refseq would have 0 sequences), return NULL/warning + if( length(removeIndex) >= ntaxa(x) ){ + # Can't have a refseq list with less + warning("merge_taxa attempted to reduce reference sequence list to 0 sequences.\n refseq replaced with NULL.") + return(NULL) + } else { + # Else, drop the removeIndex sequences and returns the pruned XStringSet object + x <- x[-removeIndex] + return(x) + } }) -################################################################################ +################################################################################ #' @aliases merge_taxa,taxonomyTable-method #' @rdname merge_taxa-methods -setMethod("merge_taxa", "taxonomyTable", function(x, eqtaxa, archetype = 1L) { - if (length(eqtaxa) < 2) { - return(x) - } - indList = merge_taxa.indices.internal(x, eqtaxa, archetype) - removeIndex = indList$removeIndex - keepIndex = indList$keepIndex - # # # Taxonomy is trivial in ranks after disagreement among merged taxa # # Make - # those values NA_character_ - taxmerge <- as(x, "matrix")[eqtaxa, ] - bad_ranks <- apply(taxmerge, 2, function(i) { - length(unique(i)) != 1 - }) - # Test if all taxonomies agree. If so, do nothing. Just continue to pruning. - if (any(bad_ranks)) { - # The col indices of the bad ranks - bad_ranks <- min(which(bad_ranks)):length(bad_ranks) - # Replace bad taxonomy elements in the archetype only (others are pruned) - x[keepIndex, bad_ranks] <- NA_character_ - } - # Finally, remove the OTUs that have been merged into keepIndex - return(x[-removeIndex, , drop = FALSE]) +setMethod("merge_taxa", "taxonomyTable", function(x, eqtaxa, archetype=1L){ + if( length(eqtaxa) < 2 ){ + return(x) + } + indList = merge_taxa.indices.internal(x, eqtaxa, archetype) + removeIndex = indList$removeIndex + keepIndex = indList$keepIndex + # # # Taxonomy is trivial in ranks after disagreement among merged taxa + # # # Make those values NA_character_ + taxmerge <- as(x, "matrix")[eqtaxa, ] + bad_ranks <- apply(taxmerge, 2, function(i){ length(unique(i)) != 1 }) + # Test if all taxonomies agree. If so, do nothing. Just continue to pruning. + if( any(bad_ranks) ){ + # The col indices of the bad ranks + bad_ranks <- min(which(bad_ranks)):length(bad_ranks) + # Replace bad taxonomy elements in the archetype only (others are pruned) + x[keepIndex, bad_ranks] <- NA_character_ + } + # Finally, remove the OTUs that have been merged into keepIndex + return( x[-removeIndex, , drop = FALSE] ) }) -################################################################################ +################################################################################ +################################################################################ #' Merge samples based on a sample variable or factor. #' #' The purpose of this method is to merge/agglomerate the sample indices of a @@ -513,8 +497,8 @@ setMethod("merge_taxa", "taxonomyTable", function(x, eqtaxa, archetype = 1L) { #' @examples # #' data(GlobalPatterns) #' GP = GlobalPatterns -#' mergedGP = merge_samples(GlobalPatterns, 'SampleType') -#' SD = merge_samples(sample_data(GlobalPatterns), 'SampleType') +#' mergedGP = merge_samples(GlobalPatterns, "SampleType") +#' SD = merge_samples(sample_data(GlobalPatterns), "SampleType") #' print(SD) #' print(mergedGP) #' sample_names(GlobalPatterns) @@ -525,104 +509,92 @@ setMethod("merge_taxa", "taxonomyTable", function(x, eqtaxa, archetype = 1L) { #' OTUnames10 = names(sort(taxa_sums(GP), TRUE)[1:10]) #' GP10 = prune_taxa(OTUnames10, GP) #' mGP10 = prune_taxa(OTUnames10, mergedGP) -#' ocean_samples = sample_names(subset(sample_data(GP), SampleType=='Ocean')) +#' ocean_samples = sample_names(subset(sample_data(GP), SampleType=="Ocean")) #' print(ocean_samples) #' otu_table(GP10)[, ocean_samples] #' rowSums(otu_table(GP10)[, ocean_samples]) -#' otu_table(mGP10)['Ocean', ] -setGeneric("merge_samples", function(x, group, fun = mean) standardGeneric("merge_samples")) -################################################################################ +#' otu_table(mGP10)["Ocean", ] +setGeneric("merge_samples", function(x, group, fun=mean) standardGeneric("merge_samples")) +################################################################################ #' @aliases merge_samples,sample_data-method #' @rdname merge_samples-methods -setMethod("merge_samples", signature("sample_data"), function(x, group, fun = mean) { - x1 <- data.frame(x) - - # Check class of group and modify if 'character' - if (class(group) == "character" & length(group) == 1) { - if (!group %in% colnames(x)) { - stop("group not found among sample variable names.") - } - group <- x1[, group] - } - if (class(group) != "factor") { - # attempt to coerce to factor - group <- factor(group) - } - - # Remove any non-coercable columns. Philosophy is to keep as much as possible. - # If it is coercable at all, keep. Coerce all columns to numeric matrix - coercable <- sapply(x1, canCoerce, "numeric") - x2 <- sapply(x1[, coercable], as, "numeric") - rownames(x2) <- rownames(x1) - - # Perform the aggregation. - outdf <- aggregate(x2, list(group), fun) - # get rownames from the 'group' column (always first) rownames(outdf) <- - # as.character(outdf[, 1]) - rownames(outdf) <- levels(group) - # 'pop' the first column - outdf <- outdf[, -1, drop = FALSE] - - return(sample_data(outdf)) +setMethod("merge_samples", signature("sample_data"), function(x, group, fun=mean){ + x1 <- data.frame(x) + + # Check class of group and modify if "character" + if( class(group)=="character" & length(group)==1 ){ + if( !group %in% colnames(x) ){stop("group not found among sample variable names.")} + group <- x1[, group] + } + if( class(group)!="factor" ){ + # attempt to coerce to factor + group <- factor(group) + } + + # Remove any non-coercable columns. + # Philosophy is to keep as much as possible. If it is coercable at all, keep. + # Coerce all columns to numeric matrix + coercable <- sapply(x1, canCoerce, "numeric") + x2 <- sapply(x1[, coercable], as, "numeric") + rownames(x2) <- rownames(x1) + + # Perform the aggregation. + outdf <- aggregate(x2, list(group), fun) + # get rownames from the "group" column (always first) + # rownames(outdf) <- as.character(outdf[, 1]) + rownames(outdf) <- levels(group) + # "pop" the first column + outdf <- outdf[, -1, drop=FALSE] + + return( sample_data(outdf) ) }) -################################################################################ +################################################################################ #' @aliases merge_samples,otu_table-method #' @rdname merge_samples-methods -setMethod("merge_samples", signature("otu_table"), function(x, group) { - # needs to be in sample-by-species orientation - if (taxa_are_rows(x)) { - x <- t(x) - } - # coerce to matrix, x2 - x2 <- as(x, "matrix") - - # # # #aggregate(x2, list(group), fun) - out <- rowsum(x2, group) - - # convert back to otu_table, and return - return(otu_table(out, taxa_are_rows = FALSE)) +setMethod("merge_samples", signature("otu_table"), function(x, group){ + # needs to be in sample-by-species orientation + if( taxa_are_rows(x) ){ x <- t(x) } + # coerce to matrix, x2 + x2 <- as(x, "matrix") + + # # # #aggregate(x2, list(group), fun) + out <- rowsum(x2, group) + + # convert back to otu_table, and return + return( otu_table(out, taxa_are_rows=FALSE) ) }) -################################################################################ +################################################################################ #' @aliases merge_samples,phyloseq-method #' @rdname merge_samples-methods -setMethod("merge_samples", signature("phyloseq"), function(x, group, fun = mean) { - - # Check if phyloseq object has a sample_data - if (!is.null(sample_data(x, FALSE))) { - # Check class of group and modify if single 'character' (column name) - if (class(group) == "character" & length(group) == 1) { - x1 <- data.frame(sample_data(x)) - if (!group %in% colnames(x1)) { - stop("group not found among sample variable names.") - } - group <- x1[, group] - } - # coerce to factor - if (class(group) != "factor") { - group <- factor(group) - } - # Perform merges. - newSM <- merge_samples(sample_data(x), group, fun) - newOT <- merge_samples(otu_table(x), group) - phyloseqList <- list(newOT, newSM) - # Else, the only relevant object to 'merge_samples' is the otu_table - } else { - if (class(group) != "factor") { - group <- factor(group) - } - phyloseqList <- list(newOT = merge_samples(otu_table(x), group)) - } - - ### Add to build-call-list the remaining components, if present in x. NULL is - ### returned by accessor if object lacks requested component/slot. Order of - ### objects in list doesn't matter for phyloseq. The list should not be named. - if (!is.null(access(x, "tax_table"))) { - phyloseqList <- c(phyloseqList, list(tax_table(x))) - } - if (!is.null(access(x, "phy_tree"))) { - phyloseqList <- c(phyloseqList, list(phy_tree(x))) - } - - return(do.call("phyloseq", phyloseqList)) +setMethod("merge_samples", signature("phyloseq"), function(x, group, fun=mean){ + + # Check if phyloseq object has a sample_data + if( !is.null(sample_data(x, FALSE)) ){ + # Check class of group and modify if single "character" (column name) + if( class(group)=="character" & length(group)==1 ){ + x1 <- data.frame(sample_data(x)) + if( !group %in% colnames(x1) ){stop("group not found among sample variable names.")} + group <- x1[, group] + } + # coerce to factor + if( class(group)!="factor" ){ group <- factor(group) } + # Perform merges. + newSM <- merge_samples(sample_data(x), group, fun) + newOT <- merge_samples(otu_table(x), group) + phyloseqList <- list(newOT, newSM) + # Else, the only relevant object to "merge_samples" is the otu_table + } else { + if( class(group)!="factor" ){ group <- factor(group) } + phyloseqList <- list( newOT=merge_samples(otu_table(x), group) ) + } + + ### Add to build-call-list the remaining components, if present in x. + ### NULL is returned by accessor if object lacks requested component/slot. + ### Order of objects in list doesn't matter for phyloseq. + ### The list should not be named. + if( !is.null(access(x, "tax_table")) ){ phyloseqList <- c(phyloseqList, list(tax_table(x))) } + if( !is.null(access(x, "phy_tree")) ){ phyloseqList <- c(phyloseqList, list(phy_tree(x))) } + + return( do.call("phyloseq", phyloseqList) ) }) -################################################################################ +################################################################################ diff --git a/R/multtest-wrapper.R b/R/multtest-wrapper.R index 2589b296..1c6af8f0 100644 --- a/R/multtest-wrapper.R +++ b/R/multtest-wrapper.R @@ -1,4 +1,6 @@ -#################################################################################### # # # Avoiding full import of multtest to mitigate potential conflicts +#################################################################################### +# # # # Avoiding full import of multtest to mitigate potential conflicts +#################################################################################### #' Multiple testing of taxa abundance according to sample categories/classes #' #' Please note that it is up to you to perform any necessary @@ -18,17 +20,17 @@ #' NOTE: the default test applied to each taxa is a two-sample two-sided #' \code{\link{t.test}}, WHICH WILL FAIL with an error if you provide a data variable #' (or custom vector) that contains MORE THAN TWO classes. One alternative to consider -#' is an F-test, by specifying \code{test='f'} as an additional argument. See +#' is an F-test, by specifying \code{test="f"} as an additional argument. See #' the first example below, and/or further documentation of #' \code{\link[multtest]{mt.maxT}} or \code{\link[multtest]{mt.minP}} #' for other options and formal details. #' -#' @param minPmaxT (Optional). Character string. \code{'mt.minP'} or \code{'mt.maxT'}. -#' Default is to use \code{'\link[multtest]{mt.minP}'}. +#' @param minPmaxT (Optional). Character string. \code{"mt.minP"} or \code{"mt.maxT"}. +#' Default is to use \code{"\link[multtest]{mt.minP}"}. #' #' @param method (Optional). Additional multiple-hypthesis correction methods. #' A character vector from the set \code{\link[stats]{p.adjust.methods}}. -#' Default is \code{'fdr'}, for the Benjamini and Hochberg (1995) method +#' Default is \code{"fdr"}, for the Benjamini and Hochberg (1995) method #' to control False Discovery Rate (FDR). This argument is passed on to #' \code{\link[stats]{p.adjust}}, please see that documentation for more details. #' @@ -61,109 +63,111 @@ #' # Filter samples that don't have Enterotype #' x <- subset_samples(enterotype, !is.na(Enterotype)) #' # (the taxa are at the genera level in this dataset) -#' res = mt(x, 'Enterotype', method=c('fdr', 'bonferroni'), test='f', B=300) +#' res = mt(x, "Enterotype", method=c("fdr", "bonferroni"), test="f", B=300) #' head(res, 10) #' ## # Not surprisingly, Prevotella and Bacteroides top the list. #' ## # Different test, multiple-adjusted t-test, whether samples are ent-2 or not. -#' ## mt(x, get_variable(x, 'Enterotype')==2) -setGeneric("mt", function(physeq, classlabel, minPmaxT = "minP", method = "fdr", - ...) standardGeneric("mt")) -################################################################################ First, access the otu_table, and if appropriate, define classlabel from the -################################################################################ sample_data. +#' ## mt(x, get_variable(x, "Enterotype")==2) +setGeneric("mt", function(physeq, classlabel, minPmaxT="minP", method="fdr", ...) standardGeneric("mt") ) +################################################################################ +# First, access the otu_table, and if appropriate, define classlabel from +# the sample_data. #' @aliases mt,phyloseq,ANY-method #' @rdname mt-methods -setMethod("mt", c("phyloseq", "ANY"), function(physeq, classlabel, minPmaxT = "minP", - method = "fdr", ...) { - # Extract the class information from the sample_data if sample_data slot is - # non-empty, and the classlabel is a character-class and its length is 1. - if (!is.null(sample_data(physeq, FALSE)) & inherits(classlabel, "character") & - identical(length(classlabel), 1L)) { - # Define a raw factor based on the data available in a sample variable - rawFactor = get_variable(physeq, classlabel[1]) - if (!inherits(rawFactor, "factor")) { - # coerce to a factor if it is not already one. - rawFactor = factor(rawFactor) - } - # Either way, replace `classlabel` with `rawFactor` - classlabel = rawFactor - } - # Either way, dispatch `mt` on otu_table(physeq) - MT = mt(otu_table(physeq), classlabel, minPmaxT, ...) - if (!is.null(tax_table(physeq, FALSE))) { - # If there is tax_table data present, add/cbind it to the results. - MT = cbind(MT, as(tax_table(physeq), "matrix")[rownames(MT), , drop = FALSE]) - } - if (length(method) > 0 & method %in% p.adjust.methods) { +setMethod("mt", c("phyloseq", "ANY"), function(physeq, classlabel, minPmaxT="minP", method="fdr", ...){ + # Extract the class information from the sample_data + # if sample_data slot is non-empty, + # and the classlabel is a character-class + # and its length is 1. + if( !is.null(sample_data(physeq, FALSE)) & + inherits(classlabel, "character") & + identical(length(classlabel), 1L) ){ + # Define a raw factor based on the data available in a sample variable + rawFactor = get_variable(physeq, classlabel[1]) + if( !inherits(rawFactor, "factor") ){ + # coerce to a factor if it is not already one. + rawFactor = factor(rawFactor) + } + # Either way, replace `classlabel` with `rawFactor` + classlabel = rawFactor + } + # Either way, dispatch `mt` on otu_table(physeq) + MT = mt(otu_table(physeq), classlabel, minPmaxT, ...) + if( !is.null(tax_table(physeq, FALSE)) ){ + # If there is tax_table data present, + # add/cbind it to the results. + MT = cbind(MT, as(tax_table(physeq), "matrix")[rownames(MT), , drop=FALSE]) + } + if(length(method)>0 & method %in% p.adjust.methods){ # Use only the supported methods method <- method[which(method %in% p.adjust.methods)] # Add adjust-p columns. sapply should retain the names. - adjp = sapply(method, function(meth, p) { - p.adjust(p, meth) - }, p = MT$rawp, USE.NAMES = TRUE) + adjp = sapply(method, function(meth, p){p.adjust(p, meth)}, p = MT$rawp, USE.NAMES = TRUE) MT <- cbind(MT, adjp) } - return(MT) + return(MT) }) -################################################################################ All valid mt() calls eventually funnel dispatch to this method. The otu_table -################################################################################ orientation is checked/handled here (and only here). +################################################################################ +# All valid mt() calls eventually funnel dispatch to this method. +# The otu_table orientation is checked/handled here (and only here). #' @aliases mt,otu_table,integer-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "integer"), function(physeq, classlabel, minPmaxT = "minP", - ...) { - # Guarantee proper orientation of abundance table, and coerce to matrix. - if (!taxa_are_rows(physeq)) { - physeq <- t(physeq) - } - mt.phyloseq.internal(as(physeq, "matrix"), classlabel, minPmaxT, ...) +setMethod("mt", c("otu_table", "integer"), function(physeq, classlabel, minPmaxT="minP", ...){ + # Guarantee proper orientation of abundance table, and coerce to matrix. + if( !taxa_are_rows(physeq) ){ physeq <- t(physeq) } + mt.phyloseq.internal(as(physeq, "matrix"), classlabel, minPmaxT, ...) }) -################################################################################ Coerce numeric classlabel to be integer, pass-on +################################################################################ +# Coerce numeric classlabel to be integer, pass-on #' @aliases mt,otu_table,numeric-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "numeric"), function(physeq, classlabel, minPmaxT = "minP", - ...) { - mt(physeq, as(classlabel, "integer"), minPmaxT = "minP", ...) +setMethod("mt", c("otu_table", "numeric"), function(physeq, classlabel, minPmaxT="minP", ...){ + mt(physeq, as(classlabel, "integer"), minPmaxT="minP", ...) }) -################################################################################ Coerce logical to integer, pass-on +################################################################################ +# Coerce logical to integer, pass-on #' @aliases mt,otu_table,logical-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "logical"), function(physeq, classlabel, minPmaxT = "minP", - ...) { - mt(physeq, as(classlabel, "integer"), minPmaxT = "minP", ...) +setMethod("mt", c("otu_table", "logical"), function(physeq, classlabel, minPmaxT="minP", ...){ + mt(physeq, as(classlabel, "integer"), minPmaxT="minP", ...) }) -################################################################################ Test for length, then dispatch... +################################################################################ +# Test for length, then dispatch... #' @aliases mt,otu_table,character-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "character"), function(physeq, classlabel, minPmaxT = "minP", - ...) { - if (length(classlabel) != nsamples(physeq)) { - stop("classlabel not the same length as nsamples(physeq)") - } else { - classlabel <- factor(classlabel) - } - # Use mt dispatch with classlabel now a suitable classlabel - mt(physeq, classlabel, minPmaxT, ...) +setMethod("mt", c("otu_table", "character"), function(physeq, classlabel, minPmaxT="minP", ...){ + if( length(classlabel) != nsamples(physeq) ){ + stop("classlabel not the same length as nsamples(physeq)") + } else { + classlabel <- factor(classlabel) + } + # Use mt dispatch with classlabel now a suitable classlabel + mt(physeq, classlabel, minPmaxT, ...) }) -################################################################################ Coerce factor to an integer vector of group labels, starting at 0 for the first -################################################################################ group +################################################################################ +# Coerce factor to an integer vector of group labels, +# starting at 0 for the first group #' @aliases mt,otu_table,factor-method #' @rdname mt-methods -setMethod("mt", c("otu_table", "factor"), function(physeq, classlabel, minPmaxT = "minP", - ...) { - # integerize classlabel, starting at 0 - classlabel <- (0:(length(classlabel) - 1))[classlabel] - # Use mt dispatch with classlabel now a suitable classlabel - mt(physeq, classlabel, minPmaxT, ...) +setMethod("mt", c("otu_table", "factor"), function(physeq, classlabel, minPmaxT="minP", ...){ + # integerize classlabel, starting at 0 + classlabel <- (0:(length(classlabel)-1))[classlabel] + # Use mt dispatch with classlabel now a suitable classlabel + mt(physeq, classlabel, minPmaxT, ...) }) -#################################################################################### Internal function @aliases mt,matrix,integer-method not exported +#################################################################################### +# Internal function +# @aliases mt,matrix,integer-method +# not exported #' @keywords internal -mt.phyloseq.internal <- function(physeq, classlabel, minPmaxT = "minP", ...) { - # require(multtest) - if (minPmaxT == "minP") { - return(mt.minP(physeq, classlabel, ...)) - } else if (minPmaxT == "maxT") { - return(mt.maxT(physeq, classlabel, ...)) - } else { - print("Nothing calculated. minPmaxT argument must be either minP or maxT.") - } +mt.phyloseq.internal <- function(physeq, classlabel, minPmaxT="minP", ...){ + # require(multtest) + if( minPmaxT == "minP" ){ + return( mt.minP(physeq, classlabel, ...) ) + } else if( minPmaxT == "maxT" ){ + return( mt.maxT(physeq, classlabel, ...) ) + } else { + print("Nothing calculated. minPmaxT argument must be either minP or maxT.") + } } -#################################################################################### +#################################################################################### diff --git a/R/network-methods.R b/R/network-methods.R index 0ce22df6..34f129b4 100644 --- a/R/network-methods.R +++ b/R/network-methods.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Make microbiome network (igraph) #' #' A specialized function for creating a network representation of microbiomes, @@ -7,7 +7,7 @@ #' The graph is ultimately represented using the #' \code{igraph}-package. #' -#' @usage make_network(physeq, type='samples', distance='jaccard', max.dist = 0.4, +#' @usage make_network(physeq, type="samples", distance="jaccard", max.dist = 0.4, #' keep.isolates=FALSE, ...) #' #' @param physeq (Required). Default \code{NULL}. @@ -15,22 +15,22 @@ #' or \code{\link{otu_table-class}} object, #' on which \code{g} is based. \code{phyloseq-class} recommended. #' -#' @param type (Optional). Default \code{'samples'}. +#' @param type (Optional). Default \code{"samples"}. #' Whether the network should be samples or taxa/OTUs. -#' Supported arguments are \code{'samples'}, \code{'taxa'}, -#' where \code{'taxa'} indicates using the OTUs/taxaindices, +#' Supported arguments are \code{"samples"}, \code{"taxa"}, +#' where \code{"taxa"} indicates using the OTUs/taxaindices, #' whether they actually represent species or some other taxonomic rank. #' -#' NOTE: not all distance methods are supported if \code{'taxa'} +#' NOTE: not all distance methods are supported if \code{"taxa"} #' selected for type. For example, the UniFrac distance and DPCoA #' cannot be calculated for taxa-wise distances, because they use #' a taxa-wise tree as part of their calculation between samples, and #' there is no transpose-equivalent for this tree. #' -#' @param distance (Optional). Default \code{'jaccard'}. +#' @param distance (Optional). Default \code{"jaccard"}. #' Any supported argument to the \code{method} parameter of the #' \code{\link{distance}} function is supported here. -#' Some distance methods, like \code{'unifrac'}, may take +#' Some distance methods, like \code{"unifrac"}, may take #' a non-trivial amount of time to calculate, in which case #' you probably want to calculate the distance matrix separately, #' save, and then provide it as the argument to \code{distance} instead. @@ -74,103 +74,98 @@ #' # # Example plots with Enterotype Dataset #' data(enterotype) #' ig <- make_network(enterotype, max.dist=0.3) -#' plot_network(ig, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) +#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) #' # #' ig1 <- make_network(enterotype, max.dist=0.2) -#' plot_network(ig1, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) +#' plot_network(ig1, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) #' # #' # # Three methods of choosing/providing distance/distance-method #' # Provide method name available to distance() function -#' ig <- make_network(enterotype, max.dist=0.3, distance='jaccard') +#' ig <- make_network(enterotype, max.dist=0.3, distance="jaccard") #' # Provide distance object, already computed -#' jaccdist <- distance(enterotype, 'jaccard') +#' jaccdist <- distance(enterotype, "jaccard") #' ih <- make_network(enterotype, max.dist=0.3, distance=jaccdist) -#' # Provide 'custom' function. -#' ii <- make_network(enterotype, max.dist=0.3, distance=function(x){vegan::vegdist(x, 'jaccard')}) -#' # The have equal results:\t\t +#' # Provide "custom" function. +#' ii <- make_network(enterotype, max.dist=0.3, distance=function(x){vegan::vegdist(x, "jaccard")}) +#' # The have equal results: #' all.equal(ig, ih) #' all.equal(ig, ii) #' # -#' # Try out making a trivial 'network' of the 3-sample esophagus data, +#' # Try out making a trivial "network" of the 3-sample esophagus data, #' # with weighted-UniFrac as distance #' data(esophagus) -#' ij <- make_network(esophagus, 'samples', 'unifrac', weighted=TRUE) -make_network <- function(physeq, type = "samples", distance = "jaccard", max.dist = 0.4, - keep.isolates = FALSE, ...) { - - if (type %in% c("taxa", "species", "OTUs", "otus", "otu")) { +#' ij <- make_network(esophagus, "samples", "unifrac", weighted=TRUE) +make_network <- function(physeq, type="samples", distance="jaccard", max.dist = 0.4, + keep.isolates=FALSE, ...){ + + if( type %in% c("taxa", "species", "OTUs", "otus", "otu")){ # Calculate or asign taxa-wise distance matrix - if (class(distance) == "dist") { - # If distance a distance object, use it rather than re-calculate - obj.dist <- distance - if (attributes(obj.dist)$Size != ntaxa(physeq)) { - stop("ntaxa(physeq) does not match size of dist object in distance") - } - if (!setequal(attributes(obj.dist)$Labels, taxa_names(physeq))) { - stop("taxa_names does not exactly match dist-indices") - } - } else if (class(distance) == "character") { - # If character string, pass on to distance(), assume supported - obj.dist <- distance(physeq, method = distance, type = type, ...) - # Else, assume a custom function and attempt to calculate. - } else { - # Enforce orientation for taxa-wise distances - if (!taxa_are_rows(physeq)) { - physeq <- t(physeq) - } - # Calculate distances - obj.dist <- distance(as(otu_table(physeq), "matrix")) - } - # coerce distance-matrix back into vanilla matrix, Taxa Distance Matrix, TaDiMa - TaDiMa <- as.matrix(obj.dist) - # Add Inf to the diagonal to avoid self-connecting edges (inefficient) - TaDiMa <- TaDiMa + diag(Inf, ntaxa(physeq), ntaxa(physeq)) - # Convert distance matrix to coincidence matrix, CoMa, using max.dist - CoMa <- TaDiMa < max.dist - } else if (type == "samples") { - # Calculate or asign sample-wise distance matrix If argument is already a - # distance matrix. If distance a distance object, use it rather than - # re-calculate - if (class(distance) == "dist") { - obj.dist <- distance - if (attributes(obj.dist)$Size != nsamples(physeq)) { - stop("nsamples(physeq) does not match size of dist object in distance") - } - if (!setequal(attributes(obj.dist)$Labels, sample_names(physeq))) { - stop("sample_names does not exactly match dist-indices") - } - # If character string, pass on to distance(), assume supported - } else if (class(distance) == "character") { - # Else, assume a custom function and attempt to calculate. - obj.dist <- distance(physeq, method = distance, type = type, ...) - } else { - # Enforce orientation for sample-wise distances - if (taxa_are_rows(physeq)) { - physeq <- t(physeq) - } - # Calculate distances - obj.dist <- distance(as(otu_table(physeq), "matrix")) - } - # coerce distance-matrix back into vanilla matrix, Sample Distance Matrix, SaDiMa - SaDiMa <- as.matrix(obj.dist) - # Add Inf to the diagonal to avoid self-connecting edges (inefficient) - SaDiMa <- SaDiMa + diag(Inf, nsamples(physeq), nsamples(physeq)) - # Convert distance matrix to coincidence matrix, CoMa, using max.dist - CoMa <- SaDiMa < max.dist - } else { - stop("type argument must be one of \n (1) samples \n or \n (2) taxa") - } - # Calculate the igraph-formatted network - ig <- graph.adjacency(CoMa, mode = "lower") - if (!keep.isolates) { - # If not-keeping isolates, remove them - isolates <- V(ig)[degree(ig) == 0] - ig = delete.vertices(ig, V(ig)[degree(ig) == 0]) - } - if (vcount(ig) < 2) { - # Report a warning if the graph is empty - warning("The graph you created has too few vertices. Consider changing `max.dist` argument, and check your data.") - } - return(ig) + if( class(distance) == "dist" ){ + # If distance a distance object, use it rather than re-calculate + obj.dist <- distance + if( attributes(obj.dist)$Size != ntaxa(physeq) ){ + stop("ntaxa(physeq) does not match size of dist object in distance") + } + if( !setequal(attributes(obj.dist)$Labels, taxa_names(physeq)) ){ + stop("taxa_names does not exactly match dist-indices") + } + } else if( class(distance) == "character" ){ + # If character string, pass on to distance(), assume supported + obj.dist <- distance(physeq, method=distance, type=type, ...) + # Else, assume a custom function and attempt to calculate. + } else { + # Enforce orientation for taxa-wise distances + if( !taxa_are_rows(physeq) ){ physeq <- t(physeq) } + # Calculate distances + obj.dist <- distance(as(otu_table(physeq), "matrix")) + } + # coerce distance-matrix back into vanilla matrix, Taxa Distance Matrix, TaDiMa + TaDiMa <- as.matrix(obj.dist) + # Add Inf to the diagonal to avoid self-connecting edges (inefficient) + TaDiMa <- TaDiMa + diag(Inf, ntaxa(physeq), ntaxa(physeq)) + # Convert distance matrix to coincidence matrix, CoMa, using max.dist + CoMa <- TaDiMa < max.dist + } else if( type == "samples" ){ + # Calculate or asign sample-wise distance matrix + if( class(distance) == "dist" ){ # If argument is already a distance matrix. + # If distance a distance object, use it rather than re-calculate + obj.dist <- distance + if( attributes(obj.dist)$Size != nsamples(physeq) ){ + stop("nsamples(physeq) does not match size of dist object in distance") + } + if( !setequal(attributes(obj.dist)$Labels, sample_names(physeq)) ){ + stop("sample_names does not exactly match dist-indices") + } + # If character string, pass on to distance(), assume supported + } else if( class(distance) == "character" ){ + # Else, assume a custom function and attempt to calculate. + obj.dist <- distance(physeq, method=distance, type=type, ...) + } else { + # Enforce orientation for sample-wise distances + if(taxa_are_rows(physeq)){ physeq <- t(physeq) } + # Calculate distances + obj.dist <- distance(as(otu_table(physeq), "matrix")) + } + # coerce distance-matrix back into vanilla matrix, Sample Distance Matrix, SaDiMa + SaDiMa <- as.matrix(obj.dist) + # Add Inf to the diagonal to avoid self-connecting edges (inefficient) + SaDiMa <- SaDiMa + diag(Inf, nsamples(physeq), nsamples(physeq)) + # Convert distance matrix to coincidence matrix, CoMa, using max.dist + CoMa <- SaDiMa < max.dist + } else { + stop("type argument must be one of \n (1) samples \n or \n (2) taxa") + } + # Calculate the igraph-formatted network + ig <- graph.adjacency(CoMa, mode="lower") + if( !keep.isolates ){ + # If not-keeping isolates, remove them + isolates <- V(ig)[degree(ig) == 0] + ig = delete.vertices(ig, V(ig)[degree(ig) == 0]) + } + if( vcount(ig) < 2 ){ + # Report a warning if the graph is empty + warning("The graph you created has too few vertices. Consider changing `max.dist` argument, and check your data.") + } + return(ig) } -################################################################################ +################################################################################ diff --git a/R/ordination-methods.R b/R/ordination-methods.R index 92dcb31c..17dd03dd 100644 --- a/R/ordination-methods.R +++ b/R/ordination-methods.R @@ -1,9 +1,9 @@ -################################################################################ +################################################################################ #' Perform an ordination on phyloseq data #' #' This function wraps several commonly-used ordination methods. The type of #' ordination depends upon the argument to \code{method}. Try -#' \code{ordinate('help')} or \code{ordinate('list')} for the currently +#' \code{ordinate("help")} or \code{ordinate("list")} for the currently #' supported method options. #' #' @param physeq (Required). Phylogenetic sequencing data @@ -19,10 +19,10 @@ #' also results in these different data components being checked for validity #' and completeness by the method. #' -#' @param method (Optional). A character string. Default is \code{'DCA'}. +#' @param method (Optional). A character string. Default is \code{"DCA"}. #' #' Currently supported method options are: -#' \code{c('DCA', 'CCA', 'RDA', 'CAP', 'DPCoA', 'NMDS', 'MDS', 'PCoA')} +#' \code{c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA")} #' #' \describe{ #' \item{DCA}{Performs detrended correspondence analysis using\code{\link{decorana}}} @@ -54,7 +54,7 @@ #' argument to \code{metaMDS} if it is among the #' supported \code{vegdist} methods. However, all distance methods #' supported by \code{\link{distance}} are supported here, -#' including \code{'unifrac'} (the default) and \code{'DPCoA'}.} +#' including \code{"unifrac"} (the default) and \code{"DPCoA"}.} #' \item{MDS/PCoA}{Performs principal coordinate analysis #' (also called principle coordinate decomposition, #' multidimensional scaling (MDS), or classical scaling) @@ -62,10 +62,10 @@ #' including two correction methods for negative eigenvalues. #' See #' \code{\link[ape]{pcoa}} for further details. -#' }\t -#'\t} +#' } +#' } #' -#' @param distance (Optional). A character string. Default is \code{'bray'}. +#' @param distance (Optional). A character string. Default is \code{"bray"}. #' The name of a supported \code{\link{distance}} method; #' or, alternatively, #' a pre-computed \code{\link{dist}}-class object. @@ -75,9 +75,9 @@ #' #' Any supported \code{\link{distance}} methods #' are supported arguments to \code{distance} here. -#' Try \code{distance('list')} for a explicitly supported distance method +#' Try \code{distance("list")} for a explicitly supported distance method #' abbreviations. User-specified custom distance equations should also work, -#' e.g. \code{'(A+B-2*J)/(A+B)'}. +#' e.g. \code{"(A+B-2*J)/(A+B)"}. #' See \code{\link{distance}} for more details, examples. #' #' @param formula (Optional). A model \code{\link{formula}}. @@ -92,9 +92,9 @@ #' #' @param ... (Optional). Additional arguments to supporting functions. For #' example, the additional argument \code{weighted=TRUE} would be passed on -#' to \code{\link{UniFrac}} if \code{'unifrac'} were chosen as the -#' \code{distance} option and \code{'MDS'} as the ordination \code{method} -#' option. Alternatively, if \code{'DCA'} were chosen as the +#' to \code{\link{UniFrac}} if \code{"unifrac"} were chosen as the +#' \code{distance} option and \code{"MDS"} as the ordination \code{method} +#' option. Alternatively, if \code{"DCA"} were chosen as the #' ordination \code{method} option, additional arguments would be passed on #' to the relevant ordination function, \code{\link{decorana}}, for example. #' @@ -153,94 +153,94 @@ #' @examples #' # See http://joey711.github.io/phyloseq/plot_ordination-examples #' # for many more examples. -#' # plot_ordination(GP, ordinate(GP, 'DCA'), 'samples', color='SampleType') -ordinate = function(physeq, method = "DCA", distance = "bray", formula = NULL, ...) { - # If `physeq` is a formula, post deprecated notice, attempt to convert and - # dispatch - if (inherits(physeq, "formula")) { - .Deprecated(msg = paste0("First argument, `physeq`, as formula is deprecated.\n", - "There is now an explicit `formula` argument.\n", "Please revise method call accordingly.")) +#' # plot_ordination(GP, ordinate(GP, "DCA"), "samples", color="SampleType") +ordinate = function(physeq, method="DCA", distance="bray", formula=NULL, ...){ + # If `physeq` is a formula, post deprecated notice, attempt to convert and dispatch + if( inherits(physeq, "formula") ){ + .Deprecated(msg=paste0("First argument, `physeq`, as formula is deprecated.\n", + "There is now an explicit `formula` argument.\n", + "Please revise method call accordingly.")) # Create the new formula, RHS-only formchar = as.character(physeq) # Error if only RHS. Formula-first syntax required both sides. - if (length(formchar) < 3) { + if(length(formchar) < 3){ stop("Need both sides of formula in this deprecated syntax... Revisit ordinate() documentation / examples.") } # Replace with (presumed) phyloseq object. physeq <- get(as.character(physeq)[2]) - # Create the new formula, RHS-only. - newFormula = as.formula(paste0("~", formchar[length(formchar)])) + # Create the new formula, RHS-only. + newFormula = as.formula(paste0("~", formchar[length(formchar)])) # Dispatch to (hopefully) ordinate,phyloseq - return(ordinate(physeq, method = method, distance = distance, formula = newFormula, - ...)) - } - # Define table of currently-supported methods - method_table <- c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA") - # List supported method names to user, if requested. - if (inherits(physeq, "character")) { - if (physeq == "help") { - cat("Available arguments to methods:\n") - print(c(method_table)) - cat("Please be exact, partial-matching not supported.\n") - cat("Can alternatively provide a custom distance.\n") - cat("See:\n help(\"distance\") \n") - return() - } else if (physeq == "list") { - return(c(method_table)) - } else { - cat("physeq needs to be a phyloseq-class object, \n") - cat("or a character string matching \"help\" or \"list\". \n") - } + return(ordinate(physeq, method=method, distance=distance, formula=newFormula, ...)) } + # Define table of currently-supported methods + method_table <- c("DCA", "CCA", "RDA", "CAP", "DPCoA", "NMDS", "MDS", "PCoA") + # List supported method names to user, if requested. + if( inherits(physeq, "character") ){ + if( physeq=="help" ){ + cat("Available arguments to methods:\n") + print(c(method_table)) + cat("Please be exact, partial-matching not supported.\n") + cat("Can alternatively provide a custom distance.\n") + cat("See:\n help(\"distance\") \n") + return() + } else if( physeq=="list" ){ + return(c(method_table)) + } else { + cat("physeq needs to be a phyloseq-class object, \n") + cat("or a character string matching \"help\" or \"list\". \n") + } + } # Final check that `physeq` is a phyloseq or otu_table class - if (!inherits(physeq, "phyloseq") & !inherits(physeq, "otu_table")) { + if( !inherits(physeq, "phyloseq") & !inherits(physeq, "otu_table") ){ stop("Expected a phyloseq object or otu_table object.") } - # # Start with methods that don't require # additional distance calculation. - # (distance argument ignored) DCA - if (method == "DCA") { - return(decorana(veganifyOTU(physeq), ...)) - } - # CCA / RDA - if (method %in% c("CCA", "RDA")) { - return(cca.phyloseq(physeq, formula, method, ...)) - } - # CAP - if (method == "CAP") { + # # Start with methods that don't require + # # additional distance calculation. (distance argument ignored) + # DCA + if( method == "DCA" ){ + return( decorana(veganifyOTU(physeq), ...) ) + } + # CCA / RDA + if( method %in% c("CCA", "RDA") ){ + return(cca.phyloseq(physeq, formula, method, ...)) + } + # CAP + if( method == "CAP" ){ # Call/return with do.call - return(capscale.phyloseq(physeq, formula, distance, ...)) - } - # DPCoA - if (method == "DPCoA") { - return(DPCoA(physeq, ...)) - } - # # Now resort to methods that do require a separate distance/dist-calc Define - # ps.dist. Check the class of distance argument is character or dist - if (inherits(distance, "dist")) { - ps.dist <- distance - } else if (class(distance) == "character") { - # There are some special options for NMDS/metaMDS if distance-method is supported - # by vegdist, so check first. If not, just calculate distance - vegdist_methods <- c("manhattan", "euclidean", "canberra", "bray", "kulczynski", - "jaccard", "gower", "altGower", "morisita", "horn", "mountford", "raup", - "binomial", "chao") - # NMDS with vegdist-method to include species - if (method == "NMDS" & distance %in% vegdist_methods) { - return(metaMDS(veganifyOTU(physeq), distance, ...)) - } - # Calculate distance with handoff to distance() - ps.dist <- distance(physeq, distance, ...) - } - # Vanilla MDS/PCoA - if (method %in% c("PCoA", "MDS")) { - return(pcoa(ps.dist)) - } - # NMDS with non-vegdist-method - if (method == "NMDS") { - return(metaMDS(ps.dist)) - } + return(capscale.phyloseq(physeq, formula, distance, ...)) + } + # DPCoA + if( method == "DPCoA" ){ + return( DPCoA(physeq, ...) ) + } + # # Now resort to methods that do require a separate distance/dist-calc + # Define ps.dist. Check the class of distance argument is character or dist + if( inherits(distance, "dist") ){ + ps.dist <- distance + } else if( class(distance) == "character" ){ + # There are some special options for NMDS/metaMDS if distance-method + # is supported by vegdist, so check first. If not, just calculate distance + vegdist_methods <- c("manhattan", "euclidean", "canberra", "bray", + "kulczynski", "jaccard", "gower", "altGower", "morisita", "horn", + "mountford", "raup" , "binomial", "chao") + # NMDS with vegdist-method to include species + if(method == "NMDS" & distance %in% vegdist_methods){ + return(metaMDS(veganifyOTU(physeq), distance, ...)) + } + # Calculate distance with handoff to distance() + ps.dist <- distance(physeq, distance, ...) + } + # Vanilla MDS/PCoA + if( method %in% c("PCoA", "MDS")){ + return(pcoa(ps.dist)) + } + # NMDS with non-vegdist-method + if(method == "NMDS"){ + return(metaMDS(ps.dist)) + } } -################################################################################ +################################################################################ #' Calculate Double Principle Coordinate Analysis (DPCoA) #' using phylogenetic distance #' @@ -310,9 +310,9 @@ ordinate = function(physeq, method = "DCA", distance = "bray", formula = NULL, . #' data(esophagus) #' eso.dpcoa <- DPCoA(esophagus) #' eso.dpcoa -#' plot_ordination(esophagus, eso.dpcoa, 'samples') -#' plot_ordination(esophagus, eso.dpcoa, 'species') -#' plot_ordination(esophagus, eso.dpcoa, 'biplot') +#' plot_ordination(esophagus, eso.dpcoa, "samples") +#' plot_ordination(esophagus, eso.dpcoa, "species") +#' plot_ordination(esophagus, eso.dpcoa, "biplot") #' # #' # #' # # # # # # GlobalPatterns @@ -322,47 +322,49 @@ ordinate = function(physeq, method = "DCA", distance = "bray", formula = NULL, . #' GP <- prune_taxa(keepTaxa, GlobalPatterns) #' # Perform DPCoA #' GP.dpcoa <- DPCoA(GP) -#' plot_ordination(GP, GP.dpcoa, color='SampleType') -DPCoA <- function(physeq, correction = cailliez, scannf = FALSE, ...) { - # Check that physeq is a phyloseq-class - if (!class(physeq) == "phyloseq") { - stop("physeq must be phyloseq-class") - } - - # Remove any OTUs that are absent from all the samples. - physeq <- prune_taxa((taxa_sums(physeq) > 0), physeq) - - # Access components for handing-off - OTU <- otu_table(physeq) - tree <- phy_tree(physeq) +#' plot_ordination(GP, GP.dpcoa, color="SampleType") +DPCoA <- function(physeq, correction=cailliez, scannf=FALSE, ...){ + # Check that physeq is a phyloseq-class + if(!class(physeq)=="phyloseq"){stop("physeq must be phyloseq-class")} + + # Remove any OTUs that are absent from all the samples. + physeq <- prune_taxa((taxa_sums(physeq) > 0), physeq) + + # Access components for handing-off + OTU <- otu_table(physeq) + tree <- phy_tree(physeq) + + # Enforce that OTU is in samples-by-species orientation + if(taxa_are_rows(OTU) ){ OTU <- t(OTU) } - # Enforce that OTU is in samples-by-species orientation - if (taxa_are_rows(OTU)) { - OTU <- t(OTU) - } - - # get the patristic distances between the species from the tree - patristicDist <- as.dist(cophenetic.phylo(tree)) - - # if the patristic distances are not Euclidean, then correct them or throw - # meaningful error. - if (!is.euclid(patristicDist)) { - patristicDist <- correction(patristicDist) - - # Check that this is now Euclidean. - if (!is.euclid(patristicDist)) { - stop("Corrected distance still not Euclidean \n", "please provide a different correction method") - } - } - - # NOTE: the dpcoa function in ade4 requires a data.frame - return(dpcoa(data.frame(OTU), patristicDist, scannf, ...)) + # get the patristic distances between the species from the tree + patristicDist <- as.dist(cophenetic.phylo(tree)) + + # if the patristic distances are not Euclidean, + # then correct them or throw meaningful error. + if( !is.euclid(patristicDist) ){ + patristicDist <- correction(patristicDist) + + # Check that this is now Euclidean. + if( !is.euclid(patristicDist) ){ + stop('Corrected distance still not Euclidean \n', + "please provide a different correction method") + } + } + + # NOTE: the dpcoa function in ade4 requires a data.frame + return( dpcoa(data.frame(OTU), patristicDist, scannf, ...) ) } -################################################################################ vegan::cca 'extension'. formula is main input to this function. This -################################################################################ complicates signature handling. A new method with a separate name is defined -################################################################################ instead. Must transpose the phyloseq otu_table to fit the vegan::cca -################################################################################ convention Whether-or-not to transpose needs to be a check, based on the -################################################################################ 'taxa_are_rows' slot value +################################################################################ +################################################################################ +# vegan::cca "extension". +# formula is main input to this function. This complicates signature handling. +# A new method with a separate name is defined instead. +# +# Must transpose the phyloseq otu_table to fit the vegan::cca convention +# Whether-or-not to transpose needs to be a check, based on the +# "taxa_are_rows" slot value +################################################################################ #' Constrained Correspondence Analysis and Redundancy Analysis. #' #' This is the internal function that simplifies getting phyloseq data @@ -381,7 +383,7 @@ DPCoA <- function(physeq, correction = cailliez, scannf = FALSE, ...) { #' from within \code{physeq}. #' #' @param method (Optional). A single \code{\link{character}} string, -#' specifying \code{'RDA'} or \code{'CCA'}. Default is \code{'CCA'}. +#' specifying \code{"RDA"} or \code{"CCA"}. Default is \code{"CCA"}. #' #' @param ... (Optional). Additional named arguments passed to #' \code{\link[vegan]{capscale}}. @@ -399,29 +401,29 @@ DPCoA <- function(physeq, correction = cailliez, scannf = FALSE, ...) { #' @keywords internal #' @examples # #' # cca.phyloseq(physeq, formula, method, ...) -setGeneric("cca.phyloseq", function(physeq, formula = NULL, method = "CCA", ...) { +setGeneric("cca.phyloseq", function(physeq, formula=NULL, method="CCA", ...){ standardGeneric("cca.phyloseq") }) #' @importFrom vegan cca #' @importFrom vegan rda #' @aliases cca.phyloseq,phyloseq,formula-method #' @rdname cca-rda-phyloseq-methods -setMethod("cca.phyloseq", signature = c("phyloseq", "formula"), function(physeq, - formula, method = "CCA", ...) { - data = data.frame(sample_data(physeq, FALSE), stringsAsFactors = FALSE) - if (length(data) < 1) { +setMethod("cca.phyloseq", signature=c("phyloseq", "formula"), +function(physeq, formula, method="CCA", ...){ + data = data.frame(sample_data(physeq, FALSE), stringsAsFactors=FALSE) + if( length(data) < 1 ){ stop("`physeq` argument must include non-empty `sample_data`") } - OTU = veganifyOTU(physeq) - # Create new formula. Left-hand side is ignored. - formchar = as.character(formula) - newFormula = as.formula(paste0("OTU ~ ", formchar[length(formchar)])) - # Note that ade4 also has a conflicting 'cca' function. You don't import - # ade4::cca to avoid the conflict. - if (method == "CCA") { - return(cca(newFormula, data = data)) - } else if (method == "RDA") { - return(rda(newFormula, data = data)) + OTU = veganifyOTU(physeq) + # Create new formula. Left-hand side is ignored. + formchar = as.character(formula) + newFormula = as.formula(paste0("OTU ~ ", formchar[length(formchar)])) + # Note that ade4 also has a conflicting "cca" function. + # You don't import ade4::cca to avoid the conflict. + if(method=="CCA"){ + return(cca(newFormula, data=data)) + } else if(method=="RDA"){ + return(rda(newFormula, data=data)) } else { warning("Unsupported `method` argument. Must be 'RDA' or 'CCA'") return(NULL) @@ -430,29 +432,29 @@ setMethod("cca.phyloseq", signature = c("phyloseq", "formula"), function(physeq, #' @importFrom vegan cca #' @aliases cca.phyloseq,otu_table-method #' @rdname cca-rda-phyloseq-methods -setMethod("cca.phyloseq", signature = "otu_table", function(physeq, formula = NULL, - method = "CCA", ...) { +setMethod("cca.phyloseq", signature="otu_table", + function(physeq, formula=NULL, method="CCA", ...){ # OTU table by itself indicates an unconstrained ordination is requested. # Formula argument is ignored. - if (method == "CCA") { - return(cca(veganifyOTU(physeq))) - } else if (method == "RDA") { - return(rda(veganifyOTU(physeq))) - } else { - warning("Unsupported `method` argument. Must be 'RDA' or 'CCA'") + if(method=="CCA"){ + return(cca(veganifyOTU(physeq))) + } else if(method=="RDA"){ + return(rda(veganifyOTU(physeq))) + } else { + warning("Unsupported `method` argument. Must be 'RDA' or 'CCA'") return(NULL) - } + } }) #' @importFrom vegan cca #' @aliases cca.phyloseq,phyloseq,NULL-method #' @rdname cca-rda-phyloseq-methods -setMethod("cca.phyloseq", signature = c("phyloseq", "NULL"), function(physeq, formula, - method = "CCA", ...) { - # Absence of a formula (NULL) indicates unconstrained ordination. Access - # otu_table, and dispatch. +setMethod("cca.phyloseq", signature=c("phyloseq", "NULL"), +function(physeq, formula, method="CCA", ...){ + # Absence of a formula (NULL) indicates unconstrained ordination. + # Access otu_table, and dispatch. return(cca.phyloseq(otu_table(physeq), NULL, method, ...)) }) -################################################################################ +################################################################################ #' Estimate the gap statistic on an ordination result #' #' This is a wrapper for the \code{\link[cluster]{clusGap}} function, @@ -463,10 +465,10 @@ setMethod("cca.phyloseq", signature = c("phyloseq", "NULL"), function(physeq, fo #' should work, ultimately by passing to the \code{\link[vegan]{scores}} function #' or its internal extensions in phyloseq. #' @param axes (Optional). The ordination axes that you want to include. -#' @param type (Optional). One of \code{'sites'} +#' @param type (Optional). One of \code{"sites"} #' (the vegan package label for samples) or -#' \code{'species'} (the vegan package label for OTUs/taxa). -#' Default is \code{'sites'}. +#' \code{"species"} (the vegan package label for OTUs/taxa). +#' Default is \code{"sites"}. #' @param FUNcluster (Optional). This is passed to \code{\link[cluster]{clusGap}}. #' The documentation is copied here for convenience: #' a function which accepts as first argument a (data) matrix like \code{x}, @@ -482,7 +484,7 @@ setMethod("cca.phyloseq", signature = c("phyloseq", "NULL"), function(physeq, fo #' Any function that has these input/output properties (performing a clustering) #' will suffice. The more appropriate the clustering method, the better chance #' your gap statistic results will be useful. -#' @param K.max\t(Optional). A single positive integer value. +#' @param K.max (Optional). A single positive integer value. #' It indicates the maximum number of clusters that will be considered. #' Value must be at least two. #' This is passed to \code{\link[cluster]{clusGap}}. @@ -494,7 +496,7 @@ setMethod("cca.phyloseq", signature = c("phyloseq", "NULL"), function(physeq, fo #' See the \code{\link[cluster]{clusGap}} documentation for more details. #' #' @return -#' An object of S3 class \code{'clusGap'}, basically a list with components. +#' An object of S3 class \code{"clusGap"}, basically a list with components. #' See the \code{\link[cluster]{clusGap}} documentation for more details. #' #' @importFrom vegan scores @@ -502,38 +504,38 @@ setMethod("cca.phyloseq", signature = c("phyloseq", "NULL"), function(physeq, fo #' @importFrom cluster pam #' @export #' @examples -#' data('soilrep') -#' sord = ordinate(soilrep, 'PCoA', 'bray') +#' data("soilrep") +#' sord = ordinate(soilrep, "PCoA", "bray") #' # Evaluate axes with scree plot #' plot_scree(sord) #' # Gap Statistic #' gs = gapstat_ord(sord, axes=1:3, verbose=FALSE) -#' # plot_ordination(soilrep, sord, color='Treatment') +#' # plot_ordination(soilrep, sord, color="Treatment") #' plot_clusgap(gs) -#' print(gs, method='Tibs2001SEmax') -gapstat_ord = function(ord, axes = c(1:2), type = "sites", FUNcluster = function(x, - k) { - list(cluster = pam(x, k, cluster.only = TRUE)) -}, K.max = 8, ...) { - # Use the scores function to get the ordination coordinates - x = scores(ord, display = type) - # If axes not explicitly defined (NULL), then use all of them - if (is.null(axes)) { - axes = 1:ncol(x) - } - # Finally, perform, and return, the gap statistic calculation using - # cluster::clusGap - return(clusGap(x[, axes], FUNcluster, K.max, ...)) +#' print(gs, method="Tibs2001SEmax") +gapstat_ord = function(ord, axes=c(1:2), type="sites", + FUNcluster=function(x, k){list(cluster = pam(x, k, cluster.only=TRUE))}, + K.max=8, ...){ + # + # Use the scores function to get the ordination coordinates + x = scores(ord, display=type) + # If axes not explicitly defined (NULL), then use all of them + if(is.null(axes)){ + axes = 1:ncol(x) + } + # Finally, perform, and return, the gap statistic calculation using + # cluster::clusGap + return(clusGap(x[, axes], FUNcluster, K.max, ...)) } -################################################################################ Define an internal function for accessing and orienting the OTU table in a -################################################################################ fashion suitable for vegan functions @keywords internal -veganifyOTU <- function(physeq) { - if (taxa_are_rows(physeq)) { - physeq <- t(physeq) - } +################################################################################ +# Define an internal function for accessing and orienting the OTU table +# in a fashion suitable for vegan functions +# @keywords internal +veganifyOTU <- function(physeq){ + if(taxa_are_rows(physeq)){physeq <- t(physeq)} return(as(otu_table(physeq), "matrix")) } -################################################################################ +################################################################################ #' Constrained Analysis of Principal Coordinates, \code{\link[vegan]{capscale}}. #' #' See \code{\link[vegan]{capscale}} for details. A formula is main input. @@ -560,7 +562,7 @@ veganifyOTU <- function(physeq) { #' (except for possible sign reversal). However, it makes no sense to use #' \code{\link[vegan]{capscale}} with Euclidean distances, #' since direct use of \code{\link[vegan]{rda}} is much more efficient -#' (and supported in the \code{\link{ordinate}} function with \code{method=='RDA'}) +#' (and supported in the \code{\link{ordinate}} function with \code{method=="RDA"}) #' Even with non-Euclidean dissimilarities, #' the rest of the analysis will be metric and linear. #' @@ -586,11 +588,11 @@ veganifyOTU <- function(physeq) { #' # http://joey711.github.io/phyloseq/plot_ordination-examples #' data(GlobalPatterns) #' GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns) -#' ordcap = ordinate(GP, 'CAP', 'bray', ~SampleType) -#' plot_ordination(GP, ordcap, 'samples', color='SampleType') -setGeneric("capscale.phyloseq", function(physeq, formula, distance, ...) { - data = data.frame(sample_data(physeq, FALSE), stringsAsFactors = FALSE) - if (length(data) < 1) { +#' ordcap = ordinate(GP, "CAP", "bray", ~SampleType) +#' plot_ordination(GP, ordcap, "samples", color="SampleType") +setGeneric("capscale.phyloseq", function(physeq, formula, distance, ...){ + data = data.frame(sample_data(physeq, FALSE), stringsAsFactors=FALSE) + if( length(data) < 1 ){ stop("`physeq` argument must include non-empty `sample_data`") } standardGeneric("capscale.phyloseq") @@ -598,43 +600,45 @@ setGeneric("capscale.phyloseq", function(physeq, formula, distance, ...) { #' @importFrom vegan capscale #' @aliases capscale.phyloseq,phyloseq,formula,dist-method #' @rdname capscale-phyloseq-methods -setMethod("capscale.phyloseq", c("phyloseq", "formula", "dist"), function(physeq, - formula, distance, ...) { - data = data.frame(sample_data(physeq), stringsAsFactors = FALSE) +setMethod("capscale.phyloseq", c("phyloseq", "formula", "dist"), +function(physeq, formula, distance, ...){ + data = data.frame(sample_data(physeq), stringsAsFactors=FALSE) # Convert formula to character vector, compute on language. - formchar = as.character(formula) + formchar = as.character(formula) newFormula = as.formula(paste0("distance ~ ", formchar[length(formchar)])) - return(capscale(formula = newFormula, data = data, ...)) + return(capscale(formula=newFormula, data=data, ...)) }) #' @importFrom vegan capscale #' @aliases capscale.phyloseq,phyloseq,formula,character-method #' @rdname capscale-phyloseq-methods -setMethod("capscale.phyloseq", c("phyloseq", "formula", "character"), function(physeq, - formula, distance, ...) { - data = data.frame(sample_data(physeq), stringsAsFactors = FALSE) - # The goal here is to process the distance identifier string and dispatch - # accordingly. - if (length(distance) != 1) { - warning("`distance` was unexpected length. \n", " `distance` argument should be a single character string", - " or dist matrix. \n", "Attempting to use first element only.") +setMethod("capscale.phyloseq", c("phyloseq", "formula", "character"), +function(physeq, formula, distance, ...){ + data = data.frame(sample_data(physeq), stringsAsFactors=FALSE) + # The goal here is to process the distance identifier string + # and dispatch accordingly. + if( length(distance) != 1 ){ + warning("`distance` was unexpected length. \n", + " `distance` argument should be a single character string", + " or dist matrix. \n", + "Attempting to use first element only.") } distance <- distance[1] - if (!distance %in% unlist(distance("list"))) { - # distance must be among the supported distance options (which is a superset of - # vegdist). + if(!distance %in% unlist(distance("list"))){ + # distance must be among the supported distance options + # (which is a superset of vegdist). stop("The distance method you specified is not supported by phyloseq") } # Convert formula to character vector, compute on language. formchar = as.character(formula) - if (distance %in% distance("list")$vegdist) { + if(distance %in% distance("list")$vegdist){ # If it is among the vegdist distances, pass it along to vegan::capscale OTU = veganifyOTU(physeq) newFormula = as.formula(paste0("OTU ~ ", formchar[length(formchar)])) - return(capscale(formula = newFormula, data = data, distance = distance, ...)) + return(capscale(formula=newFormula, data=data, distance=distance, ...)) } else { # Else calculate the distance matrix here, and dispatch. - distance <- distance(physeq = physeq, method = distance, type = "samples") + distance <- distance(physeq=physeq, method=distance, type="samples") return(capscale.phyloseq(physeq, formula, distance, ...)) } }) -################################################################################ +################################################################################ \ No newline at end of file diff --git a/R/otuTable-class.R b/R/otuTable-class.R index dceda0ba..f202de95 100644 --- a/R/otuTable-class.R +++ b/R/otuTable-class.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Build or access the otu_table. #' #' This is the suggested method for both constructing and accessing @@ -34,58 +34,56 @@ #' @examples # #' # data(GlobalPatterns) #' # otu_table(GlobalPatterns) -setGeneric("otu_table", function(object, taxa_are_rows, errorIfNULL = TRUE) { - standardGeneric("otu_table") +setGeneric("otu_table", function(object, taxa_are_rows, errorIfNULL=TRUE){ + standardGeneric("otu_table") }) # Access the otu_table slot. #' @aliases otu_table,phyloseq-method #' @rdname otu_table-methods -setMethod("otu_table", "phyloseq", function(object, errorIfNULL = TRUE) { - access(object, "otu_table", errorIfNULL) +setMethod("otu_table", "phyloseq", function(object, errorIfNULL=TRUE){ + access(object, "otu_table", errorIfNULL) }) # return the otu_table as-is. #' @aliases otu_table,otu_table-method #' @rdname otu_table-methods -setMethod("otu_table", "otu_table", function(object, errorIfNULL = TRUE) { - return(object) -}) +setMethod("otu_table", "otu_table", function(object, errorIfNULL=TRUE){ return(object) }) # Instantiate an otu_table from a raw abundance matrix. #' @aliases otu_table,matrix-method #' @rdname otu_table-methods -setMethod("otu_table", "matrix", function(object, taxa_are_rows) { - # instantiate first to check validity - otutab <- new("otu_table", object, taxa_are_rows = taxa_are_rows) - # Want dummy species/sample index names if missing - if (taxa_are_rows) { - if (is.null(rownames(otutab))) { - rownames(otutab) <- paste("sp", 1:nrow(otutab), sep = "") - } - if (is.null(colnames(otutab))) { - colnames(otutab) <- paste("sa", 1:ncol(otutab), sep = "") - } - } else { - if (is.null(rownames(otutab))) { - rownames(otutab) <- paste("sa", 1:nrow(otutab), sep = "") - } - if (is.null(colnames(otutab))) { - colnames(otutab) <- paste("sp", 1:ncol(otutab), sep = "") - } - } - return(otutab) +setMethod("otu_table", "matrix", function(object, taxa_are_rows){ + # instantiate first to check validity + otutab <- new("otu_table", object, taxa_are_rows=taxa_are_rows) + # Want dummy species/sample index names if missing + if(taxa_are_rows){ + if(is.null(rownames(otutab))){ + rownames(otutab) <- paste("sp", 1:nrow(otutab), sep="") + } + if(is.null(colnames(otutab))){ + colnames(otutab) <- paste("sa", 1:ncol(otutab), sep="") + } + } else { + if(is.null(rownames(otutab))){ + rownames(otutab) <- paste("sa",1:nrow(otutab),sep="") + } + if(is.null(colnames(otutab))){ + colnames(otutab) <- paste("sp",1:ncol(otutab),sep="") + } + } + return(otutab) }) # # # Convert to matrix, then dispatch. #' @aliases otu_table,data.frame-method #' @rdname otu_table-methods -setMethod("otu_table", "data.frame", function(object, taxa_are_rows) { - otu_table(as(object, "matrix"), taxa_are_rows) +setMethod("otu_table", "data.frame", function(object, taxa_are_rows){ + otu_table(as(object, "matrix"), taxa_are_rows) }) # Any less-specific class, not inherited by those above. #' @aliases otu_table,ANY-method #' @rdname otu_table-methods -setMethod("otu_table", "ANY", function(object, errorIfNULL = TRUE) { - access(object, "otu_table", errorIfNULL) +setMethod("otu_table", "ANY", function(object, errorIfNULL=TRUE){ + access(object, "otu_table", errorIfNULL) }) -################################################################################ +################################################################################ #' Returns the total number of individuals observed from each species/taxa/OTU. #' #' A convenience function equivalent to rowSums or colSums, but where @@ -106,15 +104,15 @@ setMethod("otu_table", "ANY", function(object, errorIfNULL = TRUE) { #' taxa_sums(enterotype) #' data(esophagus) #' taxa_sums(esophagus) -taxa_sums <- function(x) { - x <- otu_table(x) - if (taxa_are_rows(x)) { - rowSums(x) - } else { - colSums(x) - } +taxa_sums <- function(x){ + x <- otu_table(x) + if( taxa_are_rows(x) ){ + rowSums(x) + } else { + colSums(x) + } } -################################################################################ +################################################################################ #' Returns the total number of individuals observed from each sample. #' #' A convenience function equivalent to rowSums or colSums, but where @@ -136,12 +134,12 @@ taxa_sums <- function(x) { #' sample_sums(enterotype) #' data(esophagus) #' sample_sums(esophagus) -sample_sums <- function(x) { - x <- otu_table(x) - if (taxa_are_rows(x)) { - colSums(x) - } else { - rowSums(x) - } +sample_sums <- function(x){ + x <- otu_table(x) + if( taxa_are_rows(x) ){ + colSums(x) + } else { + rowSums(x) + } } -################################################################################ +################################################################################ diff --git a/R/phylo-class.R b/R/phylo-class.R index ce7f5886..ef8e7af3 100644 --- a/R/phylo-class.R +++ b/R/phylo-class.R @@ -1,23 +1,26 @@ -# Methods related to using phylo in phyloseq, including phyloseq-internal calls -# to ape internals. +# Methods related to using phylo in phyloseq, including +# phyloseq-internal calls to ape internals. +################################################################################ #' Method for fixing problems with phylo-class trees in phyloseq #' #' For now this only entails replacing each missing (\code{NA}) branch-length #' value with 0.0. #' #' @keywords internal -setGeneric("fix_phylo", function(tree) standardGeneric("fix_phylo")) +setGeneric("fix_phylo", function(tree) standardGeneric("fix_phylo") ) #' @rdname fix_phylo #' @aliases fix_phylo,phylo-method -setMethod("fix_phylo", "phylo", function(tree) { +setMethod("fix_phylo", "phylo", function(tree){ tree$edge.length[which(is.na(tree$edge.length))] <- 0 return(tree) }) -################################################################################ Define horizontal position / node-ages by depth to root For instance, `xx` in -################################################################################ `plot_tree` and `tipAges` in `fastUniFrac` +################################################################################ +# Define horizontal position / node-ages by depth to root +# For instance, `xx` in `plot_tree` and `tipAges` in `fastUniFrac` #' @keywords internal -ape_node_depth_edge_length <- function(Ntip, Nnode, edge, Nedge, edge.length) { - .C(ape:::node_depth_edgelength, PACKAGE = "ape", as.integer(Ntip), as.integer(Nnode), - as.integer(edge[, 1]), as.integer(edge[, 2]), as.integer(Nedge), as.double(edge.length), - double(Ntip + Nnode))[[7]] -} +ape_node_depth_edge_length <- function(Ntip, Nnode, edge, Nedge, edge.length){ + .C(ape:::node_depth_edgelength, PACKAGE="ape", as.integer(Ntip), + as.integer(Nnode), as.integer(edge[, 1]), + as.integer(edge[, 2]), as.integer(Nedge), + as.double(edge.length), double(Ntip + Nnode))[[7]] +} \ No newline at end of file diff --git a/R/phyloseq-class.R b/R/phyloseq-class.R index f16e74cc..ad125d26 100644 --- a/R/phyloseq-class.R +++ b/R/phyloseq-class.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Build phyloseq-class objects from their components. #' #' \code{phyloseq()} is a constructor method, This is the main method @@ -38,80 +38,93 @@ #' # # phyloseq(phy_tree(GP), otu_table(GP), sample_data(GP)) #' # # phyloseq(otu_table(GP), tax_table(GP), sample_data(GP)) #' # # phyloseq(otu_table(GP), phy_tree(GP), tax_table(GP), sample_data(GP)) -phyloseq <- function(...) { +phyloseq <- function(...){ + + arglist <- list(...) + + # Remove names from arglist. Will replace them based on their class + names(arglist) <- NULL + + # ignore all but component data classes. + arglist <- arglist[sapply(arglist, is.component.class)] + + # Make the name-replaced, splatted list + splatlist <- sapply(arglist, splat.phyloseq.objects) + + # rm any forbidden chars in index names (e.g. quotes - phylogenetic tree). + # Right now, only extra quotes are forbidden. + splatlist = lapply(splatlist, function(x){ + taxa_names(x) <- gsub("\"", "", taxa_names(x), fixed=TRUE) + taxa_names(x) <- gsub("\'", "", taxa_names(x), fixed=TRUE) + return(x) + }) + + #################### + ## Need to determine whether to + # (A) instantiate a new raw/uncleaned phyloseq object, or + # (B) return a single component, or + # (C) to stop with an error because of incorrect argument types. + if( length(splatlist) > length(get.component.classes()) ){ + stop("Too many components provided\n") + } else if( length(names(splatlist)) > length(unique(names(splatlist))) ){ + stop("Only one of each component type allowed.\n", + "For merging multiple objects of the same type/class, try merge_phyloseq(...)\n") + } else if( length(splatlist) == 1){ + return(arglist[[1]]) + } else { + # Instantiate the phyloseq-class object, ps. + ps <- do.call("new", c(list(Class="phyloseq"), splatlist) ) + } + + #################### + ## Reconcile the taxa and sample index names between components + ## in the newly-minted phyloseq object + shared_taxa = intersect_taxa(ps) + shared_samples = intersect_samples(ps) + + if( length(shared_taxa) < 1 ){ + stop("Problem with OTU/taxa indices among those you provided.\n", + "Check using intersect() and taxa_names()\n" + ) + } + if( length(shared_samples) < 1 ){ + stop("Problem with sample indices among those you provided.\n", + "Check using intersect() and taxa_names()\n" + ) + } + + # Start with OTU indices + ps = prune_taxa(shared_taxa, ps) + + # Verify there is more than one component + # that describes samples before attempting to reconcile. + ps = prune_samples(shared_samples, ps) + + # Force both samples and taxa indices to be in the same order. + ps = index_reorder(ps, "both") + + # Replace any NA branch-length values in the tree with zero. + if( !is.null(phy_tree(ps, FALSE)) ){ + ps@phy_tree <- fix_phylo(ps@phy_tree) + } - arglist <- list(...) - - # Remove names from arglist. Will replace them based on their class - names(arglist) <- NULL - - # ignore all but component data classes. - arglist <- arglist[sapply(arglist, is.component.class)] - - # Make the name-replaced, splatted list - splatlist <- sapply(arglist, splat.phyloseq.objects) - - # rm any forbidden chars in index names (e.g. quotes - phylogenetic tree). Right - # now, only extra quotes are forbidden. - splatlist = lapply(splatlist, function(x) { - taxa_names(x) <- gsub("\"", "", taxa_names(x), fixed = TRUE) - taxa_names(x) <- gsub("'", "", taxa_names(x), fixed = TRUE) - return(x) - }) - - #################### Need to determine whether to (A) instantiate a new raw/uncleaned phyloseq - #################### object, or (B) return a single component, or (C) to stop with an error because - #################### of incorrect argument types. - if (length(splatlist) > length(get.component.classes())) { - stop("Too many components provided\n") - } else if (length(names(splatlist)) > length(unique(names(splatlist)))) { - stop("Only one of each component type allowed.\n", "For merging multiple objects of the same type/class, try merge_phyloseq(...)\n") - } else if (length(splatlist) == 1) { - return(arglist[[1]]) - } else { - # Instantiate the phyloseq-class object, ps. - ps <- do.call("new", c(list(Class = "phyloseq"), splatlist)) - } - - #################### Reconcile the taxa and sample index names between components in the - #################### newly-minted phyloseq object - shared_taxa = intersect_taxa(ps) - shared_samples = intersect_samples(ps) - - if (length(shared_taxa) < 1) { - stop("Problem with OTU/taxa indices among those you provided.\n", "Check using intersect() and taxa_names()\n") - } - if (length(shared_samples) < 1) { - stop("Problem with sample indices among those you provided.\n", "Check using intersect() and taxa_names()\n") - } - - # Start with OTU indices - ps = prune_taxa(shared_taxa, ps) - - # Verify there is more than one component that describes samples before - # attempting to reconcile. - ps = prune_samples(shared_samples, ps) - - # Force both samples and taxa indices to be in the same order. - ps = index_reorder(ps, "both") - - # Replace any NA branch-length values in the tree with zero. - if (!is.null(phy_tree(ps, FALSE))) { - ps@phy_tree <- fix_phylo(ps@phy_tree) - } - - return(ps) + return(ps) } -################################################################################ A relatively fast way to access from phyloseq object components f - function -################################################################################ name as character string physeq - a phyloseq object (phyloseq-class instance) +################################################################################ +# A relatively fast way to access from phyloseq object components +# f - function name as character string +# physeq - a phyloseq object (phyloseq-class instance) #' @keywords internal -f_comp_ps = function(f, physeq) { - sapply(names(getSlots("phyloseq")), function(i, ps) { - eval(parse(text = paste(f, "(ps@", i, ")", sep = ""))) - }, physeq) +f_comp_ps = function(f, physeq){ + sapply(names(getSlots("phyloseq")), function(i, ps){ + eval(parse(text=paste(f, "(ps@", i, ")", sep=""))) + }, physeq) } -# f_comp_ps('taxa_names', ps) f_comp_ps('ntaxa', ps) Reduce('union', -# f_comp_ps('taxa_names', ps)) Reduce('intersect', f_comp_ps('taxa_names', ps)) +# f_comp_ps("taxa_names", ps) +# f_comp_ps("ntaxa", ps) +# Reduce("union", f_comp_ps("taxa_names", ps)) +# Reduce("intersect", f_comp_ps("taxa_names", ps)) +################################################################################ #' Show the component objects classes and slot names. #' #' There are no arguments to this function. It returns a named character @@ -126,43 +139,38 @@ f_comp_ps = function(f, physeq) { #' #' @examples # #' #get.component.classes() -get.component.classes <- function() { - # define classes vector - component.classes <- c("otu_table", "sample_data", "phylo", "taxonomyTable", - "XStringSet") - # the names of component.classes needs to be the slot names to match getSlots / - # splat - names(component.classes) <- c("otu_table", "sam_data", "phy_tree", "tax_table", - "refseq") - return(component.classes) +get.component.classes <- function(){ + # define classes vector + component.classes <- c("otu_table", "sample_data", "phylo", "taxonomyTable", "XStringSet") + # the names of component.classes needs to be the slot names to match getSlots / splat + names(component.classes) <- c("otu_table", "sam_data", "phy_tree", "tax_table", "refseq") + return(component.classes) } # Explicitly define components/slots that describe taxa. #' @keywords internal -taxa.components = function() { - # define classes vector - component.classes <- c("otu_table", "phylo", "taxonomyTable", "XStringSet") - # the names of component.classes needs to be the slot names to match getSlots / - # splat - names(component.classes) <- c("otu_table", "phy_tree", "tax_table", "refseq") - return(component.classes) +taxa.components = function(){ + # define classes vector + component.classes <- c("otu_table", "phylo", "taxonomyTable", "XStringSet") + # the names of component.classes needs to be the slot names to match getSlots / splat + names(component.classes) <- c("otu_table", "phy_tree", "tax_table", "refseq") + return(component.classes) } # Explicitly define components/slots that describe samples. #' @keywords internal -sample.components = function() { - # define classes vector - component.classes <- c("otu_table", "sample_data") - # the names of component.classes needs to be the slot names to match getSlots / - # splat - names(component.classes) <- c("otu_table", "sam_data") - return(component.classes) +sample.components = function(){ + # define classes vector + component.classes <- c("otu_table", "sample_data") + # the names of component.classes needs to be the slot names to match getSlots / splat + names(component.classes) <- c("otu_table", "sam_data") + return(component.classes) } -# Returns TRUE if x is a component class, FALSE otherwise. This shows up over -# and over again in data infrastructure +# Returns TRUE if x is a component class, FALSE otherwise. +# This shows up over and over again in data infrastructure #' @keywords internal -is.component.class = function(x) { - inherits(x, get.component.classes()) +is.component.class = function(x){ + inherits(x, get.component.classes()) } -################################################################################ +################################################################################ #' Convert \code{\link{phyloseq-class}} into a named list of its non-empty components. #' #' This is used in internal handling functions, and one of its key features @@ -187,26 +195,23 @@ is.component.class = function(x) { #' @seealso merge_phyloseq #' @keywords internal #' @examples # -splat.phyloseq.objects <- function(x) { - if (is.component.class(x)) { - # Check if class of x is among the component classes already (not phyloseq-class) - splatx <- list(x) - names(splatx) <- names(which(sapply(get.component.classes(), function(cclass, - x) inherits(x, cclass), x))) - } else if (inherits(x, "phyloseq")) { - # Else, check if it inherits from phyloseq, and if-so splat - slotnames = names(getSlots("phyloseq")) - allslots = sapply(slotnames, function(i, x) { - access(x, i, FALSE) - }, x) - splatx = allslots[!sapply(allslots, is.null)] - } else { - # Otherwise, who knows what it is, silently return NULL. - return(NULL) - } - return(splatx) +splat.phyloseq.objects <- function(x){ + if( is.component.class(x) ){ + # Check if class of x is among the component classes already (not phyloseq-class) + splatx <- list(x) + names(splatx) <- names(which(sapply(get.component.classes(), function(cclass, x) inherits(x, cclass), x))) + } else if( inherits(x, "phyloseq") ){ + # Else, check if it inherits from phyloseq, and if-so splat + slotnames = names(getSlots("phyloseq")) + allslots = sapply(slotnames, function(i, x){access(x, i, FALSE)}, x) + splatx = allslots[!sapply(allslots, is.null)] + } else { + # Otherwise, who knows what it is, silently return NULL. + return(NULL) + } + return(splatx) } -################################################################################ +################################################################################ #' Return the non-empty slot names of a phyloseq object. #' #' Like \code{\link{getSlots}}, but returns the class name if argument @@ -230,10 +235,10 @@ splat.phyloseq.objects <- function(x) { #' getslots.phyloseq(GlobalPatterns) #' data(esophagus) #' getslots.phyloseq(esophagus) -getslots.phyloseq = function(physeq) { - names(splat.phyloseq.objects(physeq)) +getslots.phyloseq = function(physeq){ + names(splat.phyloseq.objects(physeq)) } -################################################################################ +################################################################################ #' Universal slot accessor function for phyloseq-class. #' #' This function is used internally by many accessors and in @@ -265,37 +270,37 @@ getslots.phyloseq = function(physeq) { #' @export #' @examples # #' ## data(GlobalPatterns) -#' ## access(GlobalPatterns, 'tax_table') -#' ## access(GlobalPatterns, 'phy_tree') -#' ## access(otu_table(GlobalPatterns), 'otu_table') +#' ## access(GlobalPatterns, "tax_table") +#' ## access(GlobalPatterns, "phy_tree") +#' ## access(otu_table(GlobalPatterns), "otu_table") #' ## # Should return NULL: -#' ## access(otu_table(GlobalPatterns), 'sample_data') -#' ## access(otuTree(GlobalPatterns), 'sample_data') -#' ## access(otuSam(GlobalPatterns), 'phy_tree') -access <- function(physeq, slot, errorIfNULL = FALSE) { - if (is.component.class(physeq)) { - # If physeq is a component class, might return as-is. Depends on slot. - if (inherits(physeq, get.component.classes()[slot])) { - # if slot-name matches, return physeq as-is. - out = physeq - } else { - # If slot/component mismatch, set out to NULL. Test later if this is an error. - out = NULL - } - } else if (!slot %in% slotNames(physeq)) { - # If slot is invalid, set out to NULL. Test later if this is an error. - out = NULL - } else { - # By elimination, must be valid. Access slot - out = eval(parse(text = paste("physeq@", slot, sep = ""))) - } - if (errorIfNULL & is.null(out)) { - # Only error regarding a NULL return value if errorIfNULL is TRUE. - stop(slot, " slot is empty.") - } - return(out) +#' ## access(otu_table(GlobalPatterns), "sample_data") +#' ## access(otuTree(GlobalPatterns), "sample_data") +#' ## access(otuSam(GlobalPatterns), "phy_tree") +access <- function(physeq, slot, errorIfNULL=FALSE){ + if( is.component.class(physeq) ){ + # If physeq is a component class, might return as-is. Depends on slot. + if( inherits(physeq, get.component.classes()[slot]) ){ + # if slot-name matches, return physeq as-is. + out = physeq + } else { + # If slot/component mismatch, set out to NULL. Test later if this is an error. + out = NULL + } + } else if(!slot %in% slotNames(physeq) ){ + # If slot is invalid, set out to NULL. Test later if this is an error. + out = NULL + } else { + # By elimination, must be valid. Access slot + out = eval(parse(text=paste("physeq@", slot, sep=""))) + } + if( errorIfNULL & is.null(out) ){ + # Only error regarding a NULL return value if errorIfNULL is TRUE. + stop(slot, " slot is empty.") + } + return(out) } -################################################################################ +################################################################################ #' Returns the intersection of species and samples for the components of x #' #' This function is used internally as part of the infrastructure to ensure that @@ -317,18 +322,18 @@ access <- function(physeq, slot, errorIfNULL = FALSE) { #' @examples # #' ## data(GlobalPatterns) #' ## head(intersect_taxa(GlobalPatterns), 10) -intersect_taxa <- function(x) { - taxa_vectors = f_comp_ps("taxa_names", x) - taxa_vectors = taxa_vectors[!sapply(taxa_vectors, is.null)] - return(Reduce("intersect", taxa_vectors)) +intersect_taxa <- function(x){ + taxa_vectors = f_comp_ps("taxa_names", x) + taxa_vectors = taxa_vectors[!sapply(taxa_vectors, is.null)] + return( Reduce("intersect", taxa_vectors) ) } #' @keywords internal -intersect_samples <- function(x) { - sample_vectors = f_comp_ps("sample_names", x) - sample_vectors = sample_vectors[!sapply(sample_vectors, is.null)] - return(Reduce("intersect", sample_vectors)) +intersect_samples <- function(x){ + sample_vectors = f_comp_ps("sample_names", x) + sample_vectors = sample_vectors[!sapply(sample_vectors, is.null)] + return( Reduce("intersect", sample_vectors) ) } -################################################################################ +################################################################################ #' Force index order of phyloseq objects #' #' @usage index_reorder(ps, index_type) @@ -336,59 +341,60 @@ intersect_samples <- function(x) { #' @param ps (Required). A \code{\link{phyloseq-class}} instance. #' @param index_type (Optional). A character string #' specifying the indices to properly order. -#' Supported values are \code{c('both', 'taxa', 'samples')}. -#' Default is \code{'both'}, meaning samples and taxa indices +#' Supported values are \code{c("both", "taxa", "samples")}. +#' Default is \code{"both"}, meaning samples and taxa indices #' will be checked/re-ordered. #' #' @keywords internal #' @docType methods #' #' @examples -#' ## data('GlobalPatterns') +#' ## data("GlobalPatterns") #' ## GP = index_reorder(GlobalPatterns) -setGeneric("index_reorder", function(ps, index_type) standardGeneric("index_reorder")) +setGeneric("index_reorder", function(ps, index_type) standardGeneric("index_reorder") ) #' @rdname index_reorder #' @aliases index_reorder,phyloseq-method -setMethod("index_reorder", "phyloseq", function(ps, index_type = "both") { - if (index_type %in% c("both", "taxa")) { - ## ENFORCE CONSISTENT ORDER OF TAXA INDICES. - if (!is.null(phy_tree(ps, FALSE))) { - # If there is a phylogenetic tree included, re-order based on that, and reorder - # the otu_table The new taxa order, torder, will also trickle down to the - # taxonomyTable or XStringSet if present. - torder = taxa_names(phy_tree(ps)) - # Re-order the OTU table - if (taxa_are_rows(ps)) { - ps@otu_table = otu_table(ps)[torder, ] - } else { - ps@otu_table = otu_table(ps)[, torder] - } - } else { - # Else, re-order anything/everything else based on the OTU-table order - torder = taxa_names(otu_table(ps)) - } - if (!is.null(tax_table(ps, FALSE))) { - # If there is a taxonomyTable, re-order that too. - ps@tax_table = tax_table(ps)[torder, ] - } - if (!is.null(refseq(ps, FALSE))) { - # If there is a XStringSet, re-order that too. - ps@refseq = refseq(ps)[torder] - } - } - - if (index_type %in% c("both", "samples")) { - ## ENFORCE CONSISTENT ORDER OF SAMPLE INDICES Errors can creep when sample indices - ## do not match. - if (!is.null(sample_data(ps, FALSE))) { - # check first that ps has sample_data - if (!all(sample_names(otu_table(ps)) == rownames(sample_data(ps)))) { - # Reorder the sample_data rows so that they match the otu_table order. - ps@sam_data <- sample_data(ps)[sample_names(otu_table(ps)), ] - } - } - } - - return(ps) +setMethod("index_reorder", "phyloseq", function(ps, index_type="both"){ + if( index_type %in% c("both", "taxa") ){ + ## ENFORCE CONSISTENT ORDER OF TAXA INDICES. + if( !is.null(phy_tree(ps, FALSE)) ){ + # If there is a phylogenetic tree included, + # re-order based on that, and reorder the otu_table + # The new taxa order, torder, will also trickle down to + # the taxonomyTable or XStringSet if present. + torder = taxa_names(phy_tree(ps)) + # Re-order the OTU table + if( taxa_are_rows(ps) ){ + ps@otu_table = otu_table(ps)[torder, ] + } else { + ps@otu_table = otu_table(ps)[, torder] + } + } else { + # Else, re-order anything/everything else based on the OTU-table order + torder = taxa_names(otu_table(ps)) + } + if( !is.null(tax_table(ps, FALSE)) ){ + # If there is a taxonomyTable, re-order that too. + ps@tax_table = tax_table(ps)[torder, ] + } + if( !is.null(refseq(ps, FALSE)) ){ + # If there is a XStringSet, re-order that too. + ps@refseq = refseq(ps)[torder] + } + } + + if( index_type %in% c("both", "samples") ){ + ## ENFORCE CONSISTENT ORDER OF SAMPLE INDICES + # Errors can creep when sample indices do not match. + if( !is.null(sample_data(ps, FALSE)) ){ + # check first that ps has sample_data + if( !all(sample_names(otu_table(ps)) == rownames(sample_data(ps))) ){ + # Reorder the sample_data rows so that they match the otu_table order. + ps@sam_data <- sample_data(ps)[sample_names(otu_table(ps)), ] + } + } + } + + return(ps) }) -################################################################################ +################################################################################ \ No newline at end of file diff --git a/R/plot-methods.R b/R/plot-methods.R index 84440212..9e770653 100644 --- a/R/plot-methods.R +++ b/R/plot-methods.R @@ -1,4 +1,10 @@ +# # extension of plot methods for phyloseq object. +# +################################################################################ +################################################################################ +################################################################################ +################################################################################ #' Generic plot defaults for phyloseq. #' #' There are many useful examples of phyloseq graphics functions in the @@ -38,23 +44,22 @@ #' @examples #' data(esophagus) #' plot_phyloseq(esophagus) -setGeneric("plot_phyloseq", function(physeq, ...) { - standardGeneric("plot_phyloseq") -}) +setGeneric("plot_phyloseq", function(physeq, ...){ standardGeneric("plot_phyloseq") }) #' @aliases plot_phyloseq,phyloseq-method #' @rdname plot_phyloseq-methods -setMethod("plot_phyloseq", "phyloseq", function(physeq, ...) { - if (all(c("otu_table", "sample_data", "phy_tree") %in% getslots.phyloseq(physeq))) { - plot_tree(esophagus, color = "samples") - } else if (all(c("otu_table", "sample_data", "tax_table") %in% getslots.phyloseq(physeq))) { - plot_bar(physeq, ...) - } else if (all(c("otu_table", "phy_tree") %in% getslots.phyloseq(physeq))) { - plot_tree(esophagus, color = "samples") - } else { - plot_richness(physeq) - } +setMethod("plot_phyloseq", "phyloseq", function(physeq, ...){ + if( all(c("otu_table", "sample_data", "phy_tree") %in% getslots.phyloseq(physeq)) ){ + plot_tree(esophagus, color="samples") + } else if( all(c("otu_table", "sample_data", "tax_table") %in% getslots.phyloseq(physeq) ) ){ + plot_bar(physeq, ...) + } else if( all(c("otu_table", "phy_tree") %in% getslots.phyloseq(physeq)) ){ + plot_tree(esophagus, color="samples") + } else { + plot_richness(physeq) + } }) -################################################################################ +################################################################################ +################################################################################ #' Microbiome Network Plot using ggplot2 #' #' There are many useful examples of phyloseq network graphics in the @@ -74,11 +79,11 @@ setMethod("plot_phyloseq", "phyloseq", function(physeq, ...) { #' nodes is below a potentially arbitrary threshold, #' and special care should be given to considering the choice of this threshold. #' -#' @usage plot_network(g, physeq=NULL, type='samples', -#' \tcolor=NULL, shape=NULL, point_size=4, alpha=1, -#' \tlabel='value', hjust = 1.35, -#' \tline_weight=0.5, line_color=color, line_alpha=0.4, -#' \tlayout.method=layout.fruchterman.reingold, title=NULL) +#' @usage plot_network(g, physeq=NULL, type="samples", +#' color=NULL, shape=NULL, point_size=4, alpha=1, +#' label="value", hjust = 1.35, +#' line_weight=0.5, line_color=color, line_alpha=0.4, +#' layout.method=layout.fruchterman.reingold, title=NULL) #' #' @param g (Required). An \code{igraph}-class object created #' either by the convenience wrapper \code{\link{make_network}}, @@ -87,11 +92,11 @@ setMethod("plot_phyloseq", "phyloseq", function(physeq, ...) { #' @param physeq (Optional). Default \code{NULL}. #' A \code{\link{phyloseq-class}} object on which \code{g} is based. #' -#' @param type (Optional). Default \code{'samples'}. +#' @param type (Optional). Default \code{"samples"}. #' Whether the network represented in the primary argument, \code{g}, #' is samples or taxa/OTUs. -#' Supported arguments are \code{'samples'}, \code{'taxa'}, -#' where \code{'taxa'} indicates using the taxa indices, +#' Supported arguments are \code{"samples"}, \code{"taxa"}, +#' where \code{"taxa"} indicates using the taxa indices, #' whether they actually represent species or some other taxonomic rank. #' #' @param color (Optional). Default \code{NULL}. @@ -108,7 +113,7 @@ setMethod("plot_phyloseq", "phyloseq", function(physeq, ...) { #' @param alpha (Optional). Default \code{1}. #' A value between 0 and 1 for the alpha transparency of the vertex points. #' -#' @param label (Optional). Default \code{'value'}. +#' @param label (Optional). Default \code{"value"}. #' The name of the sample variable in \code{physeq} to use for #' labelling the vertex points. #' @@ -160,83 +165,92 @@ setMethod("plot_phyloseq", "phyloseq", function(physeq, ...) { #' #' data(enterotype) #' ig <- make_network(enterotype, max.dist=0.3) -#' plot_network(ig, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) +#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) #' # Change distance parameter #' ig <- make_network(enterotype, max.dist=0.2) -#' plot_network(ig, enterotype, color='SeqTech', shape='Enterotype', line_weight=0.3, label=NULL) -plot_network <- function(g, physeq = NULL, type = "samples", color = NULL, shape = NULL, - point_size = 4, alpha = 1, label = "value", hjust = 1.35, line_weight = 0.5, - line_color = color, line_alpha = 0.4, layout.method = layout.fruchterman.reingold, - title = NULL) { - - if (vcount(g) < 2) { +#' plot_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL) +plot_network <- function(g, physeq=NULL, type="samples", + color=NULL, shape=NULL, point_size=4, alpha=1, + label="value", hjust = 1.35, + line_weight=0.5, line_color=color, line_alpha=0.4, + layout.method=layout.fruchterman.reingold, title=NULL){ + + if( vcount(g) < 2 ){ # Report a warning if the graph is empty - stop("The graph you provided, `g`, has too few vertices. \n Check your graph, or the output of `make_network` and try again.") - } - - # disambiguate species/OTU/taxa as argument type... - if (type %in% c("taxa", "species", "OTUs", "otus", "otu")) { - type <- "taxa" - } - - # Make the edge-coordinates data.frame - edgeDF <- data.frame(get.edgelist(g)) - edgeDF$id <- 1:length(edgeDF[, 1]) - - # Make the vertices-coordinates data.frame - vertDF <- layout.method(g) - colnames(vertDF) <- c("x", "y") - vertDF <- data.frame(value = get.vertex.attribute(g, "name"), vertDF) - - # If phyloseq object provided, AND it has the relevant additional data THEN add - # it to vertDF - if (!is.null(physeq)) { - extraData <- NULL - if (type == "samples" & !is.null(sample_data(physeq, FALSE))) { - extraData = data.frame(sample_data(physeq))[as.character(vertDF$value), - , drop = FALSE] - } else if (type == "taxa" & !is.null(tax_table(physeq, FALSE))) { - extraData = data.frame(tax_table(physeq))[as.character(vertDF$value), - , drop = FALSE] - } - # Only mod vertDF if extraData exists - if (!is.null(extraData)) { - vertDF <- data.frame(vertDF, extraData) - } + stop("The graph you provided, `g`, has too few vertices. + Check your graph, or the output of `make_network` and try again.") } - # Combine vertex and edge coordinate data.frames - graphDF <- merge(reshape2::melt(edgeDF, id = "id"), vertDF, by = "value") - - # Initialize the ggplot - p <- ggplot(vertDF, aes(x, y)) - - # Strip all the typical annotations from the plot, leave the legend - p <- p + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - axis.text.x = element_blank(), axis.text.y = element_blank(), axis.title.x = element_blank(), - axis.title.y = element_blank(), axis.ticks = element_blank(), panel.border = element_blank()) - - # Add the graph vertices as points - p <- p + geom_point(aes_string(color = color, shape = shape), size = point_size, - na.rm = TRUE) - - # Add the text labels - if (!is.null(label)) { - p <- p + geom_text(aes_string(label = label), size = 2, hjust = hjust, na.rm = TRUE) - } - - # Add the edges: - p <- p + geom_line(aes_string(group = "id", color = line_color), graphDF, size = line_weight, - alpha = line_alpha, na.rm = TRUE) - - # Optionally add a title to the plot - if (!is.null(title)) { - p <- p + ggtitle(title) - } - - return(p) + # disambiguate species/OTU/taxa as argument type... + if( type %in% c("taxa", "species", "OTUs", "otus", "otu") ){ + type <- "taxa" + } + + # Make the edge-coordinates data.frame + edgeDF <- data.frame(get.edgelist(g)) + edgeDF$id <- 1:length(edgeDF[, 1]) + + # Make the vertices-coordinates data.frame + vertDF <- layout.method(g) + colnames(vertDF) <- c("x", "y") + vertDF <- data.frame(value=get.vertex.attribute(g, "name"), vertDF) + + # If phyloseq object provided, + # AND it has the relevant additional data + # THEN add it to vertDF + if( !is.null(physeq) ){ + extraData <- NULL + if( type == "samples" & !is.null(sample_data(physeq, FALSE)) ){ + extraData = data.frame(sample_data(physeq))[as.character(vertDF$value), , drop=FALSE] + } else if( type == "taxa" & !is.null(tax_table(physeq, FALSE)) ){ + extraData = data.frame(tax_table(physeq))[as.character(vertDF$value), , drop=FALSE] + } + # Only mod vertDF if extraData exists + if( !is.null(extraData) ){ + vertDF <- data.frame(vertDF, extraData) + } + } + + # Combine vertex and edge coordinate data.frames + graphDF <- merge(reshape2::melt(edgeDF, id="id"), vertDF, by = "value") + + # Initialize the ggplot + p <- ggplot(vertDF, aes(x, y)) + + # Strip all the typical annotations from the plot, leave the legend + p <- p + theme_bw() + + theme( + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.text.x = element_blank(), + axis.text.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.ticks = element_blank(), + panel.border = element_blank() + ) + + # Add the graph vertices as points + p <- p + geom_point(aes_string(color=color, shape=shape), size=point_size, na.rm=TRUE) + + # Add the text labels + if( !is.null(label) ){ + p <- p + geom_text(aes_string(label=label), size = 2, hjust=hjust, na.rm=TRUE) + } + + # Add the edges: + p <- p + geom_line(aes_string(group="id", color=line_color), + graphDF, size=line_weight, alpha=line_alpha, na.rm=TRUE) + + # Optionally add a title to the plot + if( !is.null(title) ){ + p <- p + ggtitle(title) + } + + return(p) } -################################################################################ +################################################################################ +################################################################################ #' Microbiome Network Plot using ggplot2 #' #' There are many useful examples of phyloseq network graphics in the @@ -263,7 +277,7 @@ plot_network <- function(g, physeq = NULL, type = "samples", color = NULL, shape #' @param physeq (Required). #' The \code{\link{phyloseq-class}} object that you want to represent as a network. #' -#' @param distance (Optional). Default is \code{'bray'}. +#' @param distance (Optional). Default is \code{"bray"}. #' Can be either a distance method supported by \code{\link[phyloseq]{distance}}, #' or an already-computed \code{\link{dist}}-class with labels that match #' the indices implied by both the \code{physeq} and \code{type} arguments @@ -275,14 +289,14 @@ plot_network <- function(g, physeq = NULL, type = "samples", color = NULL, shape #' The maximum distance value between two vertices #' to connect with an edge in the graphic. #' -#' @param type (Optional). Default \code{'samples'}. +#' @param type (Optional). Default \code{"samples"}. #' Whether the network represented in the primary argument, \code{g}, #' is samples or taxa/OTUs. -#' Supported arguments are \code{'samples'}, \code{'taxa'}, -#' where \code{'taxa'} indicates using the taxa indices, +#' Supported arguments are \code{"samples"}, \code{"taxa"}, +#' where \code{"taxa"} indicates using the taxa indices, #' whether they actually represent species or some other taxonomic rank. #' -#' @param laymeth (Optional). Default \code{'fruchterman.reingold'}. +#' @param laymeth (Optional). Default \code{"fruchterman.reingold"}. #' A character string that indicates the method that will determine #' the placement of vertices, typically based on conectedness of vertices #' and the number of vertices. @@ -291,8 +305,8 @@ plot_network <- function(g, physeq = NULL, type = "samples", color = NULL, shape #' and see \code{\link[igraph]{layout.auto}} for descriptions of various #' alternative layout method options supported here. #' The character string argument should match exactly the -#' layout function name with the \code{'layout.'} omitted. -#' Try \code{laymeth='list'} to see a list of options. +#' layout function name with the \code{"layout."} omitted. +#' Try \code{laymeth="list"} to see a list of options. #' #' @param color (Optional). Default \code{NULL}. #' The name of the sample variable in \code{physeq} to use for color mapping @@ -353,86 +367,95 @@ plot_network <- function(g, physeq = NULL, type = "samples", color = NULL, shape #' @export #' @examples #' data(enterotype) -#' plot_net(enterotype, color='SeqTech', maxdist = 0.3) -#' plot_net(enterotype, color='SeqTech', maxdist = 0.3, laymeth = 'auto') -#' plot_net(enterotype, color='SeqTech', maxdist = 0.3, laymeth = 'svd') -#' plot_net(enterotype, color='SeqTech', maxdist = 0.3, laymeth = 'circle') -#' plot_net(enterotype, color='SeqTech', shape='Enterotype', maxdist = 0.3, laymeth = 'circle') -plot_net <- function(physeq, distance = "bray", type = "samples", maxdist = 0.7, - laymeth = "fruchterman.reingold", color = NULL, shape = NULL, rescale = FALSE, - point_size = 5, point_alpha = 1, point_label = NULL, hjust = 1.35, title = NULL) { +#' plot_net(enterotype, color="SeqTech", maxdist = 0.3) +#' plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "auto") +#' plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "svd") +#' plot_net(enterotype, color="SeqTech", maxdist = 0.3, laymeth = "circle") +#' plot_net(enterotype, color="SeqTech", shape="Enterotype", maxdist = 0.3, laymeth = "circle") +plot_net <- function(physeq, distance="bray", type="samples", maxdist = 0.7, + laymeth="fruchterman.reingold", color=NULL, shape=NULL, rescale=FALSE, + point_size=5, point_alpha=1, point_label=NULL, hjust = 1.35, title=NULL){ # Supported layout methods - available_layouts = list(auto = layout.auto, random = layout.random, circle = layout.circle, - sphere = layout.sphere, fruchterman.reingold = layout.fruchterman.reingold, - kamada.kawai = layout.kamada.kawai, spring = layout.spring, reingold.tilford = layout.reingold.tilford, - fruchterman.reingold.grid = layout.fruchterman.reingold.grid, lgl = layout.lgl, - graphopt = layout.graphopt, svd = layout.svd) - if (laymeth == "list") { + available_layouts = list( + auto = layout.auto, + random = layout.random, + circle = layout.circle, + sphere = layout.sphere, + fruchterman.reingold = layout.fruchterman.reingold, + kamada.kawai = layout.kamada.kawai, + spring = layout.spring, + reingold.tilford = layout.reingold.tilford, + fruchterman.reingold.grid = layout.fruchterman.reingold.grid, + lgl = layout.lgl, + graphopt = layout.graphopt, + svd = layout.svd + ) + if(laymeth=="list"){ return(names(available_layouts)) } - if (!laymeth %in% names(available_layouts)) { + if(!laymeth %in% names(available_layouts)){ stop("Unsupported argument to `laymeth` option. Please use an option returned by `plot_net(laymeth='list')`") } - # 1. Calculate Distance - if (inherits(distance, "dist")) { + # 1. + # Calculate Distance + if( inherits(distance, "dist") ){ # If distance a distance object, use it rather than re-calculate Distance <- distance # Check that it at least has (a subset of) the correct labels - possibleVertexLabels = switch(type, taxa = taxa_names(physeq), samples = sample_names(physeq)) - if (!all(attributes(distance)$Labels %in% possibleVertexLabels)) { + possibleVertexLabels = switch(type, taxa=taxa_names(physeq), samples=sample_names(physeq)) + if( !all(attributes(distance)$Labels %in% possibleVertexLabels) ){ stop("Some or all `distance` index labels do not match ", type, " names in `physeq`") } } else { # Coerce to character and attempt distance calculation - scaled_distance = function(physeq, method, type, rescale = TRUE) { + scaled_distance = function(physeq, method, type, rescale=TRUE){ Dist = distance(physeq, method, type) - if (rescale) { + if(rescale){ # rescale the distance matrix to be [0, 1] - Dist <- Dist/max(Dist, na.rm = TRUE) - Dist <- Dist - min(Dist, na.rm = TRUE) + Dist <- Dist / max(Dist, na.rm=TRUE) + Dist <- Dist - min(Dist, na.rm=TRUE) } return(Dist) } distance <- as(distance[1], "character") Distance = scaled_distance(physeq, distance, type, rescale) } - # 2. Create edge data.table - dist_to_edge_table = function(Dist, MaxDistance = NULL, vnames = c("v1", "v2")) { + # 2. + # Create edge data.table + dist_to_edge_table = function(Dist, MaxDistance=NULL, vnames = c("v1", "v2")){ dmat <- as.matrix(Dist) # Set duplicate entries and self-links to Inf dmat[upper.tri(dmat, diag = TRUE)] <- Inf - LinksData = data.table(reshape2::melt(dmat, varnames = vnames, as.is = TRUE)) + LinksData = data.table(reshape2::melt(dmat, varnames=vnames, as.is = TRUE)) setnames(LinksData, old = "value", new = "Distance") # Remove self-links and duplicate links LinksData <- LinksData[is.finite(Distance), ] # Remove entries above the threshold, MaxDistance - if (!is.null(MaxDistance)) { + if(!is.null(MaxDistance)){ LinksData <- LinksData[Distance < MaxDistance, ] } return(LinksData) } LinksData0 = dist_to_edge_table(Distance, maxdist) - # 3. Create vertex layout Make the vertices-coordinates data.table - vertex_layout = function(LinksData, physeq = NULL, type = "samples", laymeth = igraph::layout.fruchterman.reingold, - ...) { - # `physeq` can be anything, only has effect when non-NULL returned by sample_data - # or tax_table - g = igraph::graph.data.frame(LinksData, directed = FALSE) - vertexDT = data.table(laymeth(g, ...), vertex = get.vertex.attribute(g, "name")) + # 3. Create vertex layout + # Make the vertices-coordinates data.table + vertex_layout = function(LinksData, physeq=NULL, type="samples", + laymeth=igraph::layout.fruchterman.reingold, ...){ + # `physeq` can be anything, only has effect when non-NULL returned by sample_data or tax_table + g = igraph::graph.data.frame(LinksData, directed=FALSE) + vertexDT = data.table(laymeth(g, ...), + vertex=get.vertex.attribute(g, "name")) setkey(vertexDT, vertex) setnames(vertexDT, old = c(1, 2), new = c("x", "y")) extraData = NULL - if (type == "samples" & !is.null(sample_data(physeq, FALSE))) { - extraData <- data.table(data.frame(sample_data(physeq)), key = "rn", - keep.rownames = TRUE) - } else if (type == "taxa" & !is.null(tax_table(physeq, FALSE))) { - extraData <- data.table(as(tax_table(physeq), "matrix"), key = "rn", - keep.rownames = TRUE) + if( type == "samples" & !is.null(sample_data(physeq, FALSE)) ){ + extraData <- data.table(data.frame(sample_data(physeq)), key = "rn", keep.rownames = TRUE) + } else if( type == "taxa" & !is.null(tax_table(physeq, FALSE)) ){ + extraData <- data.table(as(tax_table(physeq), "matrix"), key = "rn", keep.rownames = TRUE) } # Only mod vertexDT if extraData exists - if (!is.null(extraData)) { - # Join vertexDT, extraData using data.table syntax. Presumes `vertex` is key in - # both. + if(!is.null(extraData)){ + # Join vertexDT, extraData using data.table syntax. Presumes `vertex` is key in both. setnames(extraData, old = "rn", new = "vertex") vertexDT <- vertexDT[extraData] vertexDT <- vertexDT[!is.na(x), ] @@ -440,38 +463,49 @@ plot_net <- function(physeq, distance = "bray", type = "samples", maxdist = 0.7, return(vertexDT) } vertexDT = vertex_layout(LinksData0, physeq, type, available_layouts[[laymeth]]) - # 4. Update the links layout for ggplot: x, y, xend, yend - link_layout = function(LinksData, vertexDT) { + # 4. + # Update the links layout for ggplot: x, y, xend, yend + link_layout = function(LinksData, vertexDT){ linkstart = vertexDT[LinksData$v1, x, y] linkend = vertexDT[LinksData$v2, x, y] setnames(linkend, old = c("y", "x"), new = c("yend", "xend")) LinksData <- cbind(LinksData, linkstart, linkend) - return(LinksData) + return(LinksData) } LinksData = link_layout(LinksData0, vertexDT) - # 5. Define ggplot2 network plot - links_to_ggplot = function(LinksData, vertexDT, vertmap = aes(x, y)) { - p0 = ggplot(data = LinksData) + geom_segment(aes(x, y, xend = xend, yend = yend, - size = Distance, alpha = Distance)) + geom_point(mapping = vertmap, data = vertexDT, - size = 5, na.rm = TRUE) + scale_alpha(range = c(1, 0.1)) + scale_size(range = c(2, - 0.25)) + # 5. + # Define ggplot2 network plot + links_to_ggplot = function(LinksData, vertexDT, vertmap=aes(x, y)){ + p0 = ggplot(data=LinksData) + + geom_segment(aes(x, y, xend=xend, yend=yend, size=Distance, alpha=Distance)) + + geom_point(mapping = vertmap, data=vertexDT, size=5, na.rm = TRUE) + + scale_alpha(range = c(1, 0.1)) + + scale_size(range = c(2, 0.25)) return(p0) } - p = links_to_ggplot(LinksData, vertexDT, vertmap = aes_string(x = "x", y = "y", - color = color, shape = shape)) + p = links_to_ggplot(LinksData, vertexDT, + vertmap = aes_string(x="x", y="y", color=color, shape=shape)) # Add labels - if (!is.null(point_label)) { - p <- p + geom_text(aes_string(x = "x", y = "y", label = point_label), data = vertexDT, - size = 2, hjust = hjust, na.rm = TRUE) + if(!is.null(point_label)){ + p <- p + geom_text(aes_string(x="x", y="y", label=point_label), + data = vertexDT, size = 2, hjust = hjust, na.rm = TRUE) } # Add default theme - net_theme = theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - axis.text.x = element_blank(), axis.text.y = element_blank(), axis.title.x = element_blank(), - axis.title.y = element_blank(), axis.ticks = element_blank(), panel.border = element_blank()) + net_theme = theme( + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + axis.text.x = element_blank(), + axis.text.y = element_blank(), + axis.title.x = element_blank(), + axis.title.y = element_blank(), + axis.ticks = element_blank(), + panel.border = element_blank() + ) p <- p + theme_bw() + net_theme return(p) } -################################################################################ +################################################################################ +################################################################################ #' Plot alpha diversity, flexibly with ggplot2 #' #' There are many useful examples of alpha-diversity graphics in the @@ -494,7 +528,7 @@ plot_net <- function(physeq, distance = "bray", type = "samples", maxdist = 0.7, #' plot might be kindof strange, and not the intended behavior of this function). #' The following are the names you will want to avoid using in \code{x} or \code{color}: #' -#' \code{c('Observed', 'Chao1', 'ACE', 'Shannon', 'Simpson', 'InvSimpson', 'Fisher')}. +#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}. #' #' @param physeq (Required). \code{\link{phyloseq-class}}, or alternatively, #' an \code{\link{otu_table-class}}. The data about which you want to estimate. @@ -508,7 +542,7 @@ plot_net <- function(physeq, distance = "bray", type = "samples", maxdist = 0.7, #' or a custom supplied vector with length equal to the number of samples #' in the dataset (nsamples(physeq)). #' -#' The default value is \code{'samples'}, which will map each sample's name +#' The default value is \code{"samples"}, which will map each sample's name #' to a separate horizontal position in the plot. #' #' @param color (Optional). Default \code{NULL}. @@ -537,11 +571,11 @@ plot_net <- function(physeq, distance = "bray", type = "samples", maxdist = 0.7, #' @param title (Optional). Default \code{NULL}. Character string. #' The main title for the graphic. #' -#' @param scales (Optional). Default \code{'free_y'}. +#' @param scales (Optional). Default \code{"free_y"}. #' Whether to let vertical axis have free scale that adjusts to #' the data in each panel. #' This argument is passed to \code{\link[ggplot2]{facet_wrap}}. -#' If set to \code{'fixed'}, a single vertical scale will +#' If set to \code{"fixed"}, a single vertical scale will #' be used in all panels. This can obscure values if the #' \code{measures} argument includes both #' richness estimates and diversity indices, for example. @@ -561,7 +595,7 @@ plot_net <- function(physeq, distance = "bray", type = "samples", maxdist = 0.7, #' Alternatively, you can specify one or more measures #' as a character vector of measure names. #' Values must be among those supported: -#' \code{c('Observed', 'Chao1', 'ACE', 'Shannon', 'Simpson', 'InvSimpson', 'Fisher')}. +#' \code{c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")}. #' #' @param sortby (Optional). A character string subset of \code{measures} argument. #' Sort x-indices by the mean of one or more \code{measures}, @@ -589,71 +623,72 @@ plot_net <- function(physeq, distance = "bray", type = "samples", maxdist = 0.7, #' @examples #' ## There are many more interesting examples at the phyloseq online tutorials. #' ## http://joey711.github.io/phyloseq/plot_richness-examples -#' data('soilrep') -#' plot_richness(soilrep, measures=c('InvSimpson', 'Fisher')) -#' plot_richness(soilrep, 'Treatment', 'warmed', measures=c('Chao1', 'ACE', 'InvSimpson'), nrow=3) -#' data('GlobalPatterns') -#' plot_richness(GlobalPatterns, x='SampleType', measures=c('InvSimpson')) -#' plot_richness(GlobalPatterns, x='SampleType', measures=c('Chao1', 'ACE', 'InvSimpson'), nrow=3) -#' plot_richness(GlobalPatterns, x='SampleType', measures=c('Chao1', 'ACE', 'InvSimpson'), nrow=3, sortby = 'Chao1') -plot_richness = function(physeq, x = "samples", color = NULL, shape = NULL, title = NULL, - scales = "free_y", nrow = 1, shsi = NULL, measures = NULL, sortby = NULL) { +#' data("soilrep") +#' plot_richness(soilrep, measures=c("InvSimpson", "Fisher")) +#' plot_richness(soilrep, "Treatment", "warmed", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3) +#' data("GlobalPatterns") +#' plot_richness(GlobalPatterns, x="SampleType", measures=c("InvSimpson")) +#' plot_richness(GlobalPatterns, x="SampleType", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3) +#' plot_richness(GlobalPatterns, x="SampleType", measures=c("Chao1", "ACE", "InvSimpson"), nrow=3, sortby = "Chao1") +plot_richness = function(physeq, x="samples", color=NULL, shape=NULL, title=NULL, + scales="free_y", nrow=1, shsi=NULL, measures=NULL, sortby=NULL){ # Calculate the relevant alpha-diversity measures - erDF = estimate_richness(physeq, split = TRUE, measures = measures) + erDF = estimate_richness(physeq, split=TRUE, measures=measures) # Measures may have been renamed in `erDF`. Replace it with the name from erDF measures = colnames(erDF) - # Define 'measure' variables and s.e. labels, for melting. + # Define "measure" variables and s.e. labels, for melting. ses = colnames(erDF)[grep("^se\\.", colnames(erDF))] # Remove any S.E. from `measures` measures = measures[!measures %in% ses] - # Make the plotting data.frame. This coerces to data.frame, required for - # reliable output from reshape2::melt() - if (!is.null(sample_data(physeq, errorIfNULL = FALSE))) { + # Make the plotting data.frame. + # This coerces to data.frame, required for reliable output from reshape2::melt() + if( !is.null(sample_data(physeq, errorIfNULL=FALSE)) ){ # Include the sample data, if it is there. - DF <- data.frame(erDF, sample_data(physeq)) + DF <- data.frame(erDF, sample_data(physeq)) } else { # If no sample data, leave it out. DF <- data.frame(erDF) } - if (!"samples" %in% colnames(DF)) { - # If there is no 'samples' variable in DF, add it - DF$samples <- sample_names(physeq) - } - # sample_names used to be default, and should also work. #backwardcompatibility - if (!is.null(x)) { - if (x %in% c("sample", "samples", "sample_names", "sample.names")) { - x <- "samples" - } - } else { - # If x was NULL for some reason, set it to 'samples' - x <- "samples" - } - # melt to display different alpha-measures separately - mdf = reshape2::melt(DF, measure.vars = measures) + if( !"samples" %in% colnames(DF) ){ + # If there is no "samples" variable in DF, add it + DF$samples <- sample_names(physeq) + } + # sample_names used to be default, and should also work. + # #backwardcompatibility + if( !is.null(x) ){ + if( x %in% c("sample", "samples", "sample_names", "sample.names") ){ + x <- "samples" + } + } else { + # If x was NULL for some reason, set it to "samples" + x <- "samples" + } + # melt to display different alpha-measures separately + mdf = reshape2::melt(DF, measure.vars=measures) # Initialize the se column. Helpful even if not used. mdf$se <- NA_integer_ - if (length(ses) > 0) { - ## Merge s.e. into one 'se' column Define conversion vector, `selabs` + if( length(ses) > 0 ){ + ## Merge s.e. into one "se" column + # Define conversion vector, `selabs` selabs = ses - # Trim the 'se.' from the names + # Trim the "se." from the names names(selabs) <- substr(selabs, 4, 100) # Make first letter of selabs' names uppercase substr(names(selabs), 1, 1) <- toupper(substr(names(selabs), 1, 1)) # use selabs conversion vector to process `mdf` - mdf$wse <- sapply(as.character(mdf$variable), function(i, selabs) { - selabs[i] - }, selabs) - for (i in 1:nrow(mdf)) { - if (!is.na(mdf[i, "wse"])) { + mdf$wse <- sapply(as.character(mdf$variable), function(i, selabs){selabs[i]}, selabs) + for( i in 1:nrow(mdf) ){ + if( !is.na(mdf[i, "wse"]) ){ mdf[i, "se"] <- mdf[i, (mdf[i, "wse"])] } } # prune the redundant columns mdf <- mdf[, -which(colnames(mdf) %in% c(selabs, "wse"))] } - ## Interpret measures If not provided (default), keep all - if (!is.null(measures)) { - if (any(measures %in% as.character(mdf$variable))) { + ## Interpret measures + # If not provided (default), keep all + if( !is.null(measures) ){ + if( any(measures %in% as.character(mdf$variable)) ){ # If any measures were in mdf, then subset to just those. mdf <- mdf[as.character(mdf$variable) %in% measures, ] } else { @@ -661,49 +696,53 @@ plot_richness = function(physeq, x = "samples", color = NULL, shape = NULL, titl warning("Argument to `measures` not supported. All alpha-diversity measures (should be) included in plot.") } } - if (!is.null(shsi)) { - # Deprecated: If shsi is anything but NULL, print a warning about its being - # deprecated - warning("shsi no longer supported option in plot_richness. Please use `measures` instead") - } + if( !is.null(shsi) ){ + # Deprecated: + # If shsi is anything but NULL, print a warning about its being deprecated + warning("shsi no longer supported option in plot_richness. Please use `measures` instead") + } # Address `sortby` argument - if (!is.null(sortby)) { - if (!all(sortby %in% levels(mdf$variable))) { + if(!is.null(sortby)){ + if(!all(sortby %in% levels(mdf$variable))){ warning("`sortby` argument not among `measures`. Ignored.") } - if (!is.discrete(mdf[, x])) { + if(!is.discrete(mdf[, x])){ warning("`sortby` argument provided, but `x` not a discrete variable. `sortby` is ignored.") } - if (all(sortby %in% levels(mdf$variable)) & is.discrete(mdf[, x])) { - # Replace x-factor with same factor that has levels re-ordered according to - # `sortby` + if(all(sortby %in% levels(mdf$variable)) & is.discrete(mdf[, x])){ + # Replace x-factor with same factor that has levels re-ordered according to `sortby` wh.sortby = which(mdf$variable %in% sortby) - mdf[, x] <- factor(mdf[, x], levels = names(sort(tapply(X = mdf[wh.sortby, - "value"], INDEX = mdf[wh.sortby, x], mean, na.rm = TRUE, simplify = TRUE)))) + mdf[, x] <- factor(mdf[, x], + levels = names(sort(tapply(X = mdf[wh.sortby, "value"], + INDEX = mdf[wh.sortby, x], + mean, + na.rm=TRUE, simplify = TRUE)))) } } # Define variable mapping - richness_map = aes_string(x = x, y = "value", colour = color, shape = shape) + richness_map = aes_string(x=x, y="value", colour=color, shape=shape) # Make the ggplot. - p = ggplot(mdf, richness_map) + geom_point(na.rm = TRUE) + p = ggplot(mdf, richness_map) + geom_point(na.rm=TRUE) # Add error bars if mdf$se is not all NA - if (any(!is.na(mdf[, "se"]))) { - p = p + geom_errorbar(aes(ymax = value + se, ymin = value - se), width = 0.1) + if( any(!is.na(mdf[, "se"])) ){ + p = p + geom_errorbar(aes(ymax=value + se, ymin=value - se), width=0.1) } # Rotate horizontal axis labels, and adjust - p = p + theme(axis.text.x = element_text(angle = -90, vjust = 0.5, hjust = 0)) - # Add y-label - p = p + ylab("Alpha Diversity Measure") + p = p + theme(axis.text.x=element_text(angle=-90, vjust=0.5, hjust=0)) + # Add y-label + p = p + ylab('Alpha Diversity Measure') # Facet wrap using user-options - p = p + facet_wrap(~variable, nrow = nrow, scales = scales) - # Optionally add a title to the plot - if (!is.null(title)) { - p <- p + ggtitle(title) - } - return(p) + p = p + facet_wrap(~variable, nrow=nrow, scales=scales) + # Optionally add a title to the plot + if( !is.null(title) ){ + p <- p + ggtitle(title) + } + return(p) } -################################################################################ The general case, could plot samples, taxa, or both (biplot/split). Default -################################################################################ samples. +################################################################################ +################################################################################ +# The general case, could plot samples, taxa, or both (biplot/split). Default samples. +################################################################################ #' General ordination plotter based on ggplot2. #' #' There are many useful examples of phyloseq ordination graphics in the @@ -723,20 +762,20 @@ plot_richness = function(physeq, x = "samples", color = NULL, shape = NULL, titl #' supported here. There is no default, as the expectation is that the #' ordination will be performed and saved prior to calling this plot function. #' -#' @param type (Optional). The plot type. Default is \code{'samples'}. The +#' @param type (Optional). The plot type. Default is \code{"samples"}. The #' currently supported options are -#' \code{c('samples', 'sites', 'species', 'taxa', 'biplot', 'split', 'scree')}. +#' \code{c("samples", "sites", "species", "taxa", "biplot", "split", "scree")}. #' The option #' ``taxa'' is equivalent to ``species'' in this case, and similarly, #' ``samples'' is equivalent to ``sites''. #' The options -#' \code{'sites'} and \code{'species'} result in a single-plot of just the +#' \code{"sites"} and \code{"species"} result in a single-plot of just the #' sites/samples or species/taxa of the ordination, respectively. -#' The \code{'biplot'} and \code{'split'} options result in a combined +#' The \code{"biplot"} and \code{"split"} options result in a combined #' plot with both taxa and samples, either combined into one plot (``biplot'') #' or #' separated in two facet panels (``split''), respectively. -#' The \code{'scree'} option results in a call to \code{\link{plot_scree}}, +#' The \code{"scree"} option results in a call to \code{\link{plot_scree}}, #' which produces an ordered bar plot of the normalized eigenvalues #' associated with each ordination axis. #' @@ -806,280 +845,270 @@ plot_richness = function(physeq, x = "samples", color = NULL, shape = NULL, titl #' # http://joey711.github.io/phyloseq/plot_ordination-examples #' data(GlobalPatterns) #' GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns) -#' gp_bray_pcoa = ordinate(GP, 'CCA', 'bray') -#' plot_ordination(GP, gp_bray_pcoa, 'samples', color='SampleType') -plot_ordination = function(physeq, ordination, type = "samples", axes = 1:2, color = NULL, - shape = NULL, label = NULL, title = NULL, justDF = FALSE) { - if (length(type) > 1) { - warning("`type` can only be a single option,\n but more than one provided. Using only the first.") +#' gp_bray_pcoa = ordinate(GP, "CCA", "bray") +#' plot_ordination(GP, gp_bray_pcoa, "samples", color="SampleType") +plot_ordination = function(physeq, ordination, type="samples", axes=1:2, + color=NULL, shape=NULL, label=NULL, title=NULL, justDF=FALSE){ + if(length(type) > 1){ + warning("`type` can only be a single option, + but more than one provided. Using only the first.") type <- type[[1]] } - if (length(color) > 1) { - warning("The `color` variable argument should have length equal to 1.", "Taking first value.") + if(length(color) > 1){ + warning("The `color` variable argument should have length equal to 1.", + "Taking first value.") color = color[[1]][1] } - if (length(shape) > 1) { - warning("The `shape` variable argument should have length equal to 1.", "Taking first value.") + if(length(shape) > 1){ + warning("The `shape` variable argument should have length equal to 1.", + "Taking first value.") shape = shape[[1]][1] } - if (length(label) > 1) { - warning("The `label` variable argument should have length equal to 1.", "Taking first value.") + if(length(label) > 1){ + warning("The `label` variable argument should have length equal to 1.", + "Taking first value.") label = label[[1]][1] } official_types = c("sites", "species", "biplot", "split", "scree") - if (!inherits(physeq, "phyloseq")) { - if (inherits(physeq, "character")) { - if (physeq == "list") { + if(!inherits(physeq, "phyloseq")){ + if(inherits(physeq, "character")){ + if(physeq=="list"){ return(official_types) } - } - warning("Full functionality requires `physeq` be phyloseq-class ", "with multiple components.") + } + warning("Full functionality requires `physeq` be phyloseq-class ", + "with multiple components.") } # Catch typos and synonyms - type = gsub("^.*site[s]*.*$", "sites", type, ignore.case = TRUE) - type = gsub("^.*sample[s]*.*$", "sites", type, ignore.case = TRUE) - type = gsub("^.*species.*$", "species", type, ignore.case = TRUE) - type = gsub("^.*taxa.*$", "species", type, ignore.case = TRUE) - type = gsub("^.*OTU[s]*.*$", "species", type, ignore.case = TRUE) - type = gsub("^.*biplot[s]*.*$", "biplot", type, ignore.case = TRUE) - type = gsub("^.*split[s]*.*$", "split", type, ignore.case = TRUE) - type = gsub("^.*scree[s]*.*$", "scree", type, ignore.case = TRUE) + type = gsub("^.*site[s]*.*$", "sites", type, ignore.case=TRUE) + type = gsub("^.*sample[s]*.*$", "sites", type, ignore.case=TRUE) + type = gsub("^.*species.*$", "species", type, ignore.case=TRUE) + type = gsub("^.*taxa.*$", "species", type, ignore.case=TRUE) + type = gsub("^.*OTU[s]*.*$", "species", type, ignore.case=TRUE) + type = gsub("^.*biplot[s]*.*$", "biplot", type, ignore.case=TRUE) + type = gsub("^.*split[s]*.*$", "split", type, ignore.case=TRUE) + type = gsub("^.*scree[s]*.*$", "scree", type, ignore.case=TRUE) # If type argument is not supported... - if (!type %in% official_types) { - warning("type argument not supported. `type` set to 'samples'.\n", "See `plot_ordination('list')`") + if( !type %in% official_types ){ + warning("type argument not supported. `type` set to 'samples'.\n", + "See `plot_ordination('list')`") type <- "sites" } - if (type %in% c("scree")) { - # Stop early by passing to plot_scree() if 'scree' was chosen as a type - return(plot_scree(ordination, title = title)) + if( type %in% c("scree") ){ + # Stop early by passing to plot_scree() if "scree" was chosen as a type + return( plot_scree(ordination, title=title) ) } # Define a function to check if a data.frame is empty - is_empty = function(x) { + is_empty = function(x){ length(x) < 2 | suppressWarnings(all(is.na(x))) } - # The plotting data frames. Call scores to get coordinates. Silently returns - # only the coordinate systems available. e.g. sites-only, even if species - # requested. + # The plotting data frames. + # Call scores to get coordinates. + # Silently returns only the coordinate systems available. + # e.g. sites-only, even if species requested. specDF = siteDF = NULL - trash1 = try({ - siteDF <- scores(ordination, choices = axes, display = "sites", physeq = physeq) - }, silent = TRUE) - trash2 = try({ - specDF <- scores(ordination, choices = axes, display = "species", physeq = physeq) - }, silent = TRUE) + trash1 = try({siteDF <- scores(ordination, choices = axes, + display="sites", physeq=physeq)}, + silent = TRUE) + trash2 = try({specDF <- scores(ordination, choices = axes, + display="species", physeq=physeq)}, + silent = TRUE) # Check that have assigned coordinates to the correct object siteSampIntx = length(intersect(rownames(siteDF), sample_names(physeq))) siteTaxaIntx = length(intersect(rownames(siteDF), taxa_names(physeq))) specSampIntx = length(intersect(rownames(specDF), sample_names(physeq))) specTaxaIntx = length(intersect(rownames(specDF), taxa_names(physeq))) - if (siteSampIntx < specSampIntx & specTaxaIntx < siteTaxaIntx) { + if(siteSampIntx < specSampIntx & specTaxaIntx < siteTaxaIntx){ # Double-swap co = specDF specDF <- siteDF siteDF <- co rm(co) } else { - if (siteSampIntx < specSampIntx) { + if(siteSampIntx < specSampIntx){ # Single swap siteDF <- specDF specDF <- NULL } - if (specTaxaIntx < siteTaxaIntx) { - # Single swap + if(specTaxaIntx < siteTaxaIntx){ + # Single swap specDF <- siteDF siteDF <- NULL } } # If both empty, warn and return NULL - if (is_empty(siteDF) & is_empty(specDF)) { - warning("Could not obtain coordinates from the provided `ordination`. \n", - "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.") + if(is_empty(siteDF) & is_empty(specDF)){ + warning("Could not obtain coordinates from the provided `ordination`. \n", + "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.") return(NULL) } # If either is missing, do weighted average - if (is_empty(specDF) & type != "sites") { + if(is_empty(specDF) & type != "sites"){ message("Species coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)") - specDF <- data.frame(wascores(siteDF, w = veganifyOTU(physeq)), stringsAsFactors = FALSE) + specDF <- data.frame(wascores(siteDF, w = veganifyOTU(physeq)), stringsAsFactors=FALSE) } - if (is_empty(siteDF) & type != "species") { + if(is_empty(siteDF) & type != "species"){ message("Species coordinates not found directly in ordination object. Attempting weighted average (`vegan::wascores`)") - siteDF <- data.frame(wascores(specDF, w = t(veganifyOTU(physeq))), stringsAsFactors = FALSE) + siteDF <- data.frame(wascores(specDF, w = t(veganifyOTU(physeq))), stringsAsFactors=FALSE) } # Double-check that have assigned coordinates to the correct object specTaxaIntx <- siteSampIntx <- NULL siteSampIntx <- length(intersect(rownames(siteDF), sample_names(physeq))) specTaxaIntx <- length(intersect(rownames(specDF), taxa_names(physeq))) - if (siteSampIntx < 1L & !is_empty(siteDF)) { - # If siteDF is not empty, but it doesn't intersect the sample_names in physeq, - # warn and set to NULL + if(siteSampIntx < 1L & !is_empty(siteDF)){ + # If siteDF is not empty, but it doesn't intersect the sample_names in physeq, warn and set to NULL warning("`Ordination site/sample coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.") siteDF <- NULL } - if (specTaxaIntx < 1L & !is_empty(specDF)) { - # If specDF is not empty, but it doesn't intersect the taxa_names in physeq, warn - # and set to NULL + if(specTaxaIntx < 1L & !is_empty(specDF)){ + # If specDF is not empty, but it doesn't intersect the taxa_names in physeq, warn and set to NULL warning("`Ordination species/OTU/taxa coordinate indices did not match `physeq` index names. Setting corresponding coordinates to NULL.") specDF <- NULL } # If you made it this far and both NULL, return NULL and throw a warning - if (is_empty(siteDF) & is_empty(specDF)) { - warning("Could not obtain coordinates from the provided `ordination`. \n", - "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.") + if(is_empty(siteDF) & is_empty(specDF)){ + warning("Could not obtain coordinates from the provided `ordination`. \n", + "Please check your ordination method, and whether it is supported by `scores` or listed by phyloseq-package.") return(NULL) } - if (type %in% c("biplot", "split") & (is_empty(siteDF) | is_empty(specDF))) { - # biplot and split require both coordinates systems available. Both were - # attempted, or even evaluated by weighted average. If still empty, warn and - # switch to relevant type. - if (is_empty(siteDF)) { + if(type %in% c("biplot", "split") & (is_empty(siteDF) | is_empty(specDF)) ){ + # biplot and split require both coordinates systems available. + # Both were attempted, or even evaluated by weighted average. + # If still empty, warn and switch to relevant type. + if(is_empty(siteDF)){ warning("Could not access/evaluate site/sample coordinates. Switching type to 'species'") type <- "species" } - if (is_empty(specDF)) { + if(is_empty(specDF)){ warning("Could not access/evaluate species/taxa/OTU coordinates. Switching type to 'sites'") type <- "sites" } } - if (type != "species") { + if(type != "species"){ # samples covariate data frame, `sdf` sdf = NULL - sdf = data.frame(access(physeq, slot = "sam_data"), stringsAsFactors = FALSE) - if (!is_empty(sdf) & !is_empty(siteDF)) { + sdf = data.frame(access(physeq, slot="sam_data"), stringsAsFactors=FALSE) + if( !is_empty(sdf) & !is_empty(siteDF) ){ # The first two axes should always be x and y, the ordination axes. siteDF <- cbind(siteDF, sdf[rownames(siteDF), ]) } } - if (type != "sites") { + if(type != "sites"){ # taxonomy data frame `tdf` tdf = NULL - tdf = data.frame(access(physeq, slot = "tax_table"), stringsAsFactors = FALSE) - if (!is_empty(tdf) & !is_empty(specDF)) { + tdf = data.frame(access(physeq, slot="tax_table"), stringsAsFactors=FALSE) + if( !is_empty(tdf) & !is_empty(specDF) ){ # The first two axes should always be x and y, the ordination axes. specDF = cbind(specDF, tdf[rownames(specDF), ]) } } - # In 'naked' OTU-table cases, `siteDF` or `specDF` could be matrix. - if (!inherits(siteDF, "data.frame")) { - # warning('Sample Co-variables apparently missing in provided `physeq` for this - # plot-type. Coercing coord matrix to data.frame.') + # In "naked" OTU-table cases, `siteDF` or `specDF` could be matrix. + if(!inherits(siteDF, "data.frame")){ + #warning("Sample Co-variables apparently missing in provided `physeq` for this plot-type. Coercing coord matrix to data.frame.") siteDF <- as.data.frame(siteDF, stringsAsFactors = FALSE) - } - if (!inherits(specDF, "data.frame")) { - # warning('Taxonomy apparently missing in provided `physeq` for this plot-type. - # Coercing coord matrix to data.frame.') + } + if(!inherits(specDF, "data.frame")){ + #warning("Taxonomy apparently missing in provided `physeq` for this plot-type. Coercing coord matrix to data.frame.") specDF <- as.data.frame(specDF, stringsAsFactors = FALSE) } # Define the main plot data frame, `DF` DF = NULL DF <- switch(EXPR = type, sites = siteDF, species = specDF, { - # Anything else. In practice, type should be 'biplot' or 'split' here. Add - # id.type label + # Anything else. In practice, type should be "biplot" or "split" here. + # Add id.type label specDF$id.type <- "Taxa" siteDF$id.type <- "Samples" - # But what if the axis variables differ b/w them? Coerce specDF to match samples - # (siteDF) axis names + # But what if the axis variables differ b/w them? + # Coerce specDF to match samples (siteDF) axis names colnames(specDF)[1:2] <- colnames(siteDF)[1:2] # Merge the two data frames together for joint plotting. - DF = merge(specDF, siteDF, all = TRUE) - # Replace NA with 'samples' or 'taxa', where appropriate (factor/character) - if (!is.null(shape)) { - DF <- rp.joint.fill(DF, shape, "Samples") - } - if (!is.null(shape)) { - DF <- rp.joint.fill(DF, shape, "Taxa") - } - if (!is.null(color)) { - DF <- rp.joint.fill(DF, color, "Samples") - } - if (!is.null(color)) { - DF <- rp.joint.fill(DF, color, "Taxa") - } + DF = merge(specDF, siteDF, all=TRUE) + # Replace NA with "samples" or "taxa", where appropriate (factor/character) + if(!is.null(shape)){ DF <- rp.joint.fill(DF, shape, "Samples") } + if(!is.null(shape)){ DF <- rp.joint.fill(DF, shape, "Taxa") } + if(!is.null(color)){ DF <- rp.joint.fill(DF, color, "Samples") } + if(!is.null(color)){ DF <- rp.joint.fill(DF, color, "Taxa") } DF }) # In case user wants the plot-DF for some other purpose, return early - if (justDF) { - return(DF) - } + if(justDF){return(DF)} # Check variable availability before defining mapping. - if (!is.null(color)) { - if (!color %in% names(DF)) { - warning("Color variable was not found in the available data you provided.", - "No color mapped.") + if(!is.null(color)){ + if(!color %in% names(DF)){ + warning("Color variable was not found in the available data you provided.", + "No color mapped.") color <- NULL } } - if (!is.null(shape)) { - if (!shape %in% names(DF)) { - warning("Shape variable was not found in the available data you provided.", - "No shape mapped.") + if(!is.null(shape)){ + if(!shape %in% names(DF)){ + warning("Shape variable was not found in the available data you provided.", + "No shape mapped.") shape <- NULL } } - if (!is.null(label)) { - if (!label %in% names(DF)) { - warning("Label variable was not found in the available data you provided.", - "No label mapped.") + if(!is.null(label)){ + if(!label %in% names(DF)){ + warning("Label variable was not found in the available data you provided.", + "No label mapped.") label <- NULL } } # Grab the ordination axis names from the plot data frame (as strings) x = colnames(DF)[1] - y = colnames(DF)[2] + y = colnames(DF)[2] # Mapping section - if (ncol(DF) <= 2) { + if( ncol(DF) <= 2){ # If there is nothing to map, enforce simple mapping. message("No available covariate data to map on the points for this plot `type`") - ord_map = aes_string(x = x, y = y) - } else if (type %in% c("sites", "species", "split")) { - ord_map = aes_string(x = x, y = y, color = color, shape = shape, na.rm = TRUE) - } else if (type == "biplot") { - # biplot, `id.type` should try to map to color and size. Only size if color - # specified. - if (is.null(color)) { - ord_map = aes_string(x = x, y = y, size = "id.type", color = "id.type", - shape = shape, na.rm = TRUE) + ord_map = aes_string(x=x, y=y) + } else if( type %in% c("sites", "species", "split") ){ + ord_map = aes_string(x=x, y=y, color=color, shape=shape, na.rm=TRUE) + } else if(type=="biplot"){ + # biplot, `id.type` should try to map to color and size. Only size if color specified. + if( is.null(color) ){ + ord_map = aes_string(x=x, y=y, size="id.type", color="id.type", shape=shape, na.rm=TRUE) } else { - ord_map = aes_string(x = x, y = y, size = "id.type", color = color, shape = shape, - na.rm = TRUE) + ord_map = aes_string(x=x, y=y, size="id.type", color=color, shape=shape, na.rm=TRUE) } } # Plot-building section - p <- ggplot(DF, ord_map) + geom_point(na.rm = TRUE) + p <- ggplot(DF, ord_map) + geom_point(na.rm=TRUE) # split/facet color and shape can be anything in one or other. - if (type == "split") { + if( type=="split" ){ # split-option requires a facet_wrap - p <- p + facet_wrap(~id.type, nrow = 1) + p <- p + facet_wrap(~id.type, nrow=1) } # If biplot, adjust scales - if (type == "biplot") { - if (is.null(color)) { + if( type=="biplot" ){ + if( is.null(color) ){ # Rename color title in legend. - p <- update_labels(p, list(colour = "Ordination Type")) - } + p <- update_labels(p, list(colour="Ordination Type")) + } # Adjust size so that samples are bigger than taxa by default. - p <- p + scale_size_manual("type", values = c(Samples = 5, Taxa = 2)) + p <- p + scale_size_manual("type", values=c(Samples=5, Taxa=2)) } # Add text labels to points - if (!is.null(label)) { - label_map <- aes_string(x = x, y = y, label = label, na.rm = TRUE) - p = p + geom_text(label_map, data = rm.na.phyloseq(DF, label), size = 2, - vjust = 1.5, na.rm = TRUE) + if( !is.null(label) ){ + label_map <- aes_string(x=x, y=y, label=label, na.rm=TRUE) + p = p + geom_text(label_map, data=rm.na.phyloseq(DF, label), + size=2, vjust=1.5, na.rm=TRUE) } # Optionally add a title to the plot - if (!is.null(title)) { + if( !is.null(title) ){ p = p + ggtitle(title) } # Add fraction variability to axis labels, if available - if (length(extract_eigenvalue(ordination)[axes]) > 0) { - # Only attempt to add fraction variability if extract_eigenvalue returns - # something + if( length(extract_eigenvalue(ordination)[axes]) > 0 ){ + # Only attempt to add fraction variability + # if extract_eigenvalue returns something eigvec = extract_eigenvalue(ordination) # Fraction variability, fracvar - fracvar = eigvec[axes]/sum(eigvec) + fracvar = eigvec[axes] / sum(eigvec) # Percent variability, percvar - percvar = round(100 * fracvar, 1) - # The string to add to each axis label, strivar Start with the curent axis labels - # in the plot + percvar = round(100*fracvar, 1) + # The string to add to each axis label, strivar + # Start with the curent axis labels in the plot strivar = as(c(p$label$x, p$label$y), "character") # paste the percent variability string at the end strivar = paste0(strivar, " [", percvar, "%]") @@ -1089,35 +1118,38 @@ plot_ordination = function(physeq, ordination, type = "samples", axes = 1:2, col # Return the ggplot object return(p) } -################################################################################ Remove NA elements from data.frame prior to plotting Remove NA level from -################################################################################ factor +################################################################################ +# Remove NA elements from data.frame prior to plotting +# Remove NA level from factor +################################################################################ #' @keywords internal -rm.na.phyloseq <- function(DF, key.var) { - # (1) Remove elements from DF if key.var has NA DF[!is.na(DF[, key.var]), ] - DF <- subset(DF, !is.na(eval(parse(text = key.var)))) - # (2) Remove NA from the factor level, if a factor. - if (class(DF[, key.var]) == "factor") { - DF[, key.var] <- factor(as(DF[, key.var], "character")) - } - return(DF) +rm.na.phyloseq <- function(DF, key.var){ + # (1) Remove elements from DF if key.var has NA + # DF[!is.na(DF[, key.var]), ] + DF <- subset(DF, !is.na(eval(parse(text=key.var)))) + # (2) Remove NA from the factor level, if a factor. + if( class(DF[, key.var]) == "factor" ){ + DF[, key.var] <- factor(as(DF[, key.var], "character")) + } + return(DF) } -################################################################################ +################################################################################ +################################################################################ #' @keywords internal #' @importFrom plyr is.discrete -rp.joint.fill <- function(DF, map.var, id.type.rp = "samples") { - # If all of the map.var values for samples/species are NA, replace with - # id.type.rp - if (all(is.na(DF[DF$id.type == id.type.rp, map.var]))) { - # If discrete, coerce to character, convert to factor, replace, relevel. - if (is.discrete(DF[, map.var])) { - temp.vec <- as(DF[, map.var], "character") - temp.vec[is.na(temp.vec)] <- id.type.rp - DF[, map.var] <- relevel(factor(temp.vec), id.type.rp) - } - } - return(DF) +rp.joint.fill <- function(DF, map.var, id.type.rp="samples"){ + # If all of the map.var values for samples/species are NA, replace with id.type.rp + if( all(is.na(DF[DF$id.type==id.type.rp, map.var])) ){ + # If discrete, coerce to character, convert to factor, replace, relevel. + if( is.discrete(DF[, map.var]) ){ + temp.vec <- as(DF[, map.var], "character") + temp.vec[is.na(temp.vec)] <- id.type.rp + DF[, map.var] <- relevel(factor(temp.vec), id.type.rp) + } + } + return(DF) } -################################################################################ +################################################################################ #' Subset points from an ordination-derived ggplot #' #' Easily retrieve a plot-derived \code{data.frame} with a subset of points @@ -1126,7 +1158,7 @@ rp.joint.fill <- function(DF, map.var, id.type.rp = "samples") { #' There are many useful examples of phyloseq ordination graphics in the #' \href{http://joey711.github.io/phyloseq/subset_ord_plot-examples}{phyloseq online tutorials}. #' -#' @usage subset_ord_plot(p, threshold=0.05, method='farthest') +#' @usage subset_ord_plot(p, threshold=0.05, method="farthest") #' #' @param p (Required). A \code{\link{ggplot}} object created by #' \code{\link{plot_ordination}}. It contains the complete data that you @@ -1138,7 +1170,7 @@ rp.joint.fill <- function(DF, map.var, id.type.rp = "samples") { #' determining which points are included in returned \code{data.frame}. #' #' @param method (Optional). A character string. One of -#' \code{c('farthest', 'radial', 'square')}. Default is \code{'farthest'}. +#' \code{c("farthest", "radial", "square")}. Default is \code{"farthest"}. #' This determines how threshold will be interpreted. #' #' \describe{ @@ -1154,7 +1186,7 @@ rp.joint.fill <- function(DF, map.var, id.type.rp = "samples") { #' } #' #' \item{radial}{ -#'\t Keep only those points that are beyond \code{threshold} +#' Keep only those points that are beyond \code{threshold} #' radial distance from the origin. Has the effect of removing a #' circle of points from the plot, centered at the origin. #' } @@ -1181,36 +1213,36 @@ rp.joint.fill <- function(DF, map.var, id.type.rp = "samples") { #' @examples #' ## See the online tutorials. #' ## http://joey711.github.io/phyloseq/subset_ord_plot-examples -subset_ord_plot <- function(p, threshold = 0.05, method = "farthest") { - threshold <- threshold[1] # ignore all but first threshold value. - method <- method[1] # ignore all but first string. - method.names <- c("farthest", "radial", "square") - # Subset to only some small fraction of points with furthest distance from origin - df <- p$data[, c(1, 2)] - d <- sqrt(df[, 1]^2 + df[, 2]^2) - names(d) <- rownames(df) - if (method.names[pmatch(method, method.names)] == "farthest") { - if (threshold >= 1) { - show.names <- names(sort(d, TRUE)[1:threshold]) - } else if (threshold < 1) { - show.names <- names(sort(d, TRUE)[1:round(threshold * length(d))]) - } else { - stop("threshold not a valid positive numeric scalar") - } - } else if (method.names[pmatch(method, method.names)] == "radial") { - show.names <- names(d[d > threshold]) - } else if (method.names[pmatch(method, method.names)] == "square") { - # show.names <- rownames(df)[as.logical((abs(df[, 1]) > threshold) + (abs(df[, - # 2]) > threshold))] - show.names <- rownames(df)[((abs(df[, 1]) > threshold) | (abs(df[, 2]) > - threshold))] - } else { - stop("method name not supported. Please select a valid method") - } - - return(p$data[show.names, ]) +subset_ord_plot <- function(p, threshold=0.05, method="farthest"){ + threshold <- threshold[1] # ignore all but first threshold value. + method <- method[1] # ignore all but first string. + method.names <- c("farthest", "radial", "square") + # Subset to only some small fraction of points + # with furthest distance from origin + df <- p$data[, c(1, 2)] + d <- sqrt(df[, 1]^2 + df[, 2]^2) + names(d) <- rownames(df) + if( method.names[pmatch(method, method.names)] == "farthest"){ + if( threshold >= 1){ + show.names <- names(sort(d, TRUE)[1:threshold]) + } else if( threshold < 1 ){ + show.names <- names(sort(d, TRUE)[1:round(threshold*length(d))]) + } else { + stop("threshold not a valid positive numeric scalar") + } + } else if( method.names[pmatch(method, method.names)] == "radial"){ + show.names <- names(d[d > threshold]) + } else if( method.names[pmatch(method, method.names)] == "square"){ + # show.names <- rownames(df)[as.logical((abs(df[, 1]) > threshold) + (abs(df[, 2]) > threshold))] + show.names <- rownames(df)[((abs(df[, 1]) > threshold) | (abs(df[, 2]) > threshold))] + } else { + stop("method name not supported. Please select a valid method") + } + + return(p$data[show.names, ]) } -################################################################################ +################################################################################ +################################################################################ #' General ordination eigenvalue plotter using ggplot2. #' #' Convenience wrapper for plotting ordination eigenvalues (if available) @@ -1243,61 +1275,61 @@ subset_ord_plot <- function(p, threshold = 0.05, method = "farthest") { #' @export #' @examples #' # First load and trim a dataset -#' data('GlobalPatterns') +#' data("GlobalPatterns") #' GP = prune_taxa(names(sort(taxa_sums(GlobalPatterns), TRUE)[1:50]), GlobalPatterns) #' # Test plots (preforms ordination in-line, then makes scree plot) -#' plot_scree(ordinate(GP, 'DPCoA', 'bray')) -#' plot_scree(ordinate(GP, 'PCoA', 'bray')) +#' plot_scree(ordinate(GP, "DPCoA", "bray")) +#' plot_scree(ordinate(GP, "PCoA", "bray")) #' # Empty return with message -#' plot_scree(ordinate(GP, 'NMDS', 'bray')) +#' plot_scree(ordinate(GP, "NMDS", "bray")) #' # Constrained ordinations -#' plot_scree(ordinate(GP, 'CCA', formula=~SampleType)) -#' plot_scree(ordinate(GP, 'RDA', formula=~SampleType)) -#' plot_scree(ordinate(GP, 'CAP', formula=~SampleType)) +#' plot_scree(ordinate(GP, "CCA", formula=~SampleType)) +#' plot_scree(ordinate(GP, "RDA", formula=~SampleType)) +#' plot_scree(ordinate(GP, "CAP", formula=~SampleType)) #' # Deprecated example of constrained ordination (emits a warning) -#' #plot_scree(ordinate(GP ~ SampleType, 'RDA')) -#' plot_scree(ordinate(GP, 'DCA')) -#' plot_ordination(GP, ordinate(GP, 'DCA'), type='scree') -plot_scree = function(ordination, title = NULL) { - # Use get_eigenvalue method dispatch. It always returns a numeric vector. - x = extract_eigenvalue(ordination) - # Were eigenvalues found? If not, return NULL - if (is.null(x)) { - cat("No eigenvalues found in ordination\n") - return(NULL) - } else { - # If no names, add them arbitrarily 'axis1, axis2, ..., axisN' - if (is.null(names(x))) - names(x) = 1:length(x) - # For scree plot, want to show the fraction of total eigenvalues - x = x/sum(x) - # Set negative values to zero - x[x <= 0] = 0 - # Create the ggplot2 data.frame, and basic ggplot2 plot - gdf = data.frame(axis = names(x), eigenvalue = x) - p = ggplot(gdf, aes(x = axis, y = eigenvalue)) + geom_bar(stat = "identity") - # Force the order to be same as original in x - p = p + scale_x_discrete(limits = names(x)) - # Orient the x-labels for space. - p = p + theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) - # Optionally add a title to the plot - if (!is.null(title)) { - p <- p + ggtitle(title) - } - return(p) - } +#' #plot_scree(ordinate(GP ~ SampleType, "RDA")) +#' plot_scree(ordinate(GP, "DCA")) +#' plot_ordination(GP, ordinate(GP, "DCA"), type="scree") +plot_scree = function(ordination, title=NULL){ + # Use get_eigenvalue method dispatch. It always returns a numeric vector. + x = extract_eigenvalue(ordination) + # Were eigenvalues found? If not, return NULL + if( is.null(x) ){ + cat("No eigenvalues found in ordination\n") + return(NULL) + } else { + # If no names, add them arbitrarily "axis1, axis2, ..., axisN" + if( is.null(names(x)) ) names(x) = 1:length(x) + # For scree plot, want to show the fraction of total eigenvalues + x = x/sum(x) + # Set negative values to zero + x[x <= 0.0] = 0.0 + # Create the ggplot2 data.frame, and basic ggplot2 plot + gdf = data.frame(axis=names(x), eigenvalue = x) + p = ggplot(gdf, aes(x=axis, y=eigenvalue)) + geom_bar(stat="identity") + # Force the order to be same as original in x + p = p + scale_x_discrete(limits = names(x)) + # Orient the x-labels for space. + p = p + theme(axis.text.x=element_text(angle=90, vjust=0.5)) + # Optionally add a title to the plot + if( !is.null(title) ){ + p <- p + ggtitle(title) + } + return(p) + } } -################################################################################ Define S3 generic extract_eigenvalue function; formerly S4 generic -################################################################################ get_eigenvalue() Function is used by `plot_scree` to get the eigenvalue vector -################################################################################ from different types of ordination objects. Used S3 generic in this case -################################################################################ because many ordination objects, the input, are not formally-defined S4 -################################################################################ classes, but vaguely-/un-defined S3. This throws warnings during package build -################################################################################ if extract_eigenvalue were S4 generic method, because the ordination classes -################################################################################ don't appear to have any definition in phyloseq or dependencies. +################################################################################ +# Define S3 generic extract_eigenvalue function; formerly S4 generic get_eigenvalue() +# Function is used by `plot_scree` to get the eigenvalue vector from different +# types of ordination objects. +# Used S3 generic in this case because many ordination objects, the input, are +# not formally-defined S4 classes, but vaguely-/un-defined S3. This throws +# warnings during package build if extract_eigenvalue were S4 generic method, +# because the ordination classes don't appear to have any definition in phyloseq +# or dependencies. #' @keywords internal extract_eigenvalue = function(ordination) UseMethod("extract_eigenvalue", ordination) -# Default is to return NULL (e.g. for NMDS, or non-supported -# ordinations/classes). +# Default is to return NULL (e.g. for NMDS, or non-supported ordinations/classes). extract_eigenvalue.default = function(ordination) NULL # for pcoa objects extract_eigenvalue.pcoa = function(ordination) ordination$values$Relative_eig @@ -1309,7 +1341,7 @@ extract_eigenvalue.rda = function(ordination) c(ordination$CCA$eig, ordination$C extract_eigenvalue.dpcoa = function(ordination) ordination$eig # for decorana (dca) objects extract_eigenvalue.decorana = function(ordination) ordination$evals -################################################################################ +################################################################################ #' Melt phyloseq data object into large data.frame #' #' The psmelt function is a specialized melt function for melting phyloseq objects @@ -1320,7 +1352,7 @@ extract_eigenvalue.decorana = function(ordination) ordination$evals #' have reserved the following variable names that should not be used #' as the names of \code{\link{sample_variables}} #' or taxonomic \code{\link{rank_names}}. -#' These reserved names are \code{c('Sample', 'Abundance', 'OTU')}. +#' These reserved names are \code{c("Sample", "Abundance", "OTU")}. #' Also, you should not have identical names for #' sample variables and taxonomic ranks. #' That is, the intersection of the output of the following two functions @@ -1364,107 +1396,116 @@ extract_eigenvalue.decorana = function(ordination) ordination$evals #' @export #' #' @examples -#' data('GlobalPatterns') -#' gp.ch = subset_taxa(GlobalPatterns, Phylum == 'Chlamydiae') +#' data("GlobalPatterns") +#' gp.ch = subset_taxa(GlobalPatterns, Phylum == "Chlamydiae") #' mdf = psmelt(gp.ch) #' nrow(mdf) #' ncol(mdf) #' colnames(mdf) #' head(rownames(mdf)) #' # Create a ggplot similar to -#' library('ggplot2') +#' library("ggplot2") #' p = ggplot(mdf, aes(x=SampleType, y=Abundance, fill=Genus)) -#' p = p + geom_bar(color='black', stat='identity', position='stack') +#' p = p + geom_bar(color="black", stat="identity", position="stack") #' print(p) -psmelt = function(physeq) { +psmelt = function(physeq){ # Access covariate names from object, if present - if (!inherits(physeq, "phyloseq")) { + if(!inherits(physeq, "phyloseq")){ rankNames = NULL sampleVars = NULL } else { # Still might be NULL, but attempt access rankNames = rank_names(physeq, FALSE) - sampleVars = sample_variables(physeq, FALSE) + sampleVars = sample_variables(physeq, FALSE) } # Define reserved names - reservedVarnames = c("Sample", "Abundance", "OTU") - # type-1a conflict: between sample_data and reserved psmelt variable names + reservedVarnames = c("Sample", "Abundance", "OTU") + # type-1a conflict: between sample_data + # and reserved psmelt variable names type1aconflict = intersect(reservedVarnames, sampleVars) - if (length(type1aconflict) > 0) { + if(length(type1aconflict) > 0){ wh1a = which(sampleVars %in% type1aconflict) new1a = paste0("sample_", sampleVars[wh1a]) # First warn about the change - warning("The sample variables: \n", paste(sampleVars[wh1a], collapse = ", "), - "\n have been renamed to: \n", paste0(new1a, collapse = ", "), "\n", - "to avoid conflicts with special phyloseq plot attribute names.") + warning("The sample variables: \n", + paste(sampleVars[wh1a], collapse=", "), + "\n have been renamed to: \n", + paste0(new1a, collapse=", "), "\n", + "to avoid conflicts with special phyloseq plot attribute names.") # Rename the sample variables. colnames(sample_data(physeq))[wh1a] <- new1a } - # type-1b conflict: between tax_table and reserved psmelt variable names + # type-1b conflict: between tax_table + # and reserved psmelt variable names type1bconflict = intersect(reservedVarnames, rankNames) - if (length(type1bconflict) > 0) { + if(length(type1bconflict) > 0){ wh1b = which(rankNames %in% type1bconflict) new1b = paste0("taxa_", rankNames[wh1b]) # First warn about the change - warning("The rank names: \n", paste(rankNames[wh1b], collapse = ", "), "\n have been renamed to: \n", - paste0(new1b, collapse = ", "), "\n", "to avoid conflicts with special phyloseq plot attribute names.") + warning("The rank names: \n", + paste(rankNames[wh1b], collapse=", "), + "\n have been renamed to: \n", + paste0(new1b, collapse=", "), "\n", + "to avoid conflicts with special phyloseq plot attribute names.") # Rename the conflicting taxonomic ranks colnames(tax_table(physeq))[wh1b] <- new1b } # type-2 conflict: internal between tax_table and sample_data type2conflict = intersect(sampleVars, rankNames) - if (length(type2conflict) > 0) { + if(length(type2conflict) > 0){ wh2 = which(sampleVars %in% type2conflict) new2 = paste0("sample_", sampleVars[wh2]) # First warn about the change - warning("The sample variables: \n", paste0(sampleVars[wh2], collapse = ", "), - "\n have been renamed to: \n", paste0(new2, collapse = ", "), "\n", "to avoid conflicts with taxonomic rank names.") + warning("The sample variables: \n", + paste0(sampleVars[wh2], collapse=", "), + "\n have been renamed to: \n", + paste0(new2, collapse=", "), "\n", + "to avoid conflicts with taxonomic rank names.") # Rename the sample variables colnames(sample_data(physeq))[wh2] <- new2 } - # Enforce OTU table orientation. Redundant-looking step supports 'naked' - # otu_table as `physeq` input. + # Enforce OTU table orientation. Redundant-looking step + # supports "naked" otu_table as `physeq` input. otutab = otu_table(physeq) - if (!taxa_are_rows(otutab)) { - otutab <- t(otutab) - } + if(!taxa_are_rows(otutab)){otutab <- t(otutab)} # Melt the OTU table: wide form to long form table mdf = reshape2::melt(as(otutab, "matrix")) colnames(mdf)[1] <- "OTU" colnames(mdf)[2] <- "Sample" colnames(mdf)[3] <- "Abundance" - # Row and Col names are coerced to integer or factor if possible. Do not want - # this. Coerce these to character. e.g. `OTU` should always be discrete, even if - # OTU ID values can be coerced to integer + # Row and Col names are coerced to integer or factor if possible. + # Do not want this. Coerce these to character. + # e.g. `OTU` should always be discrete, even if OTU ID values can be coerced to integer mdf$OTU <- as.character(mdf$OTU) mdf$Sample <- as.character(mdf$Sample) # Merge the sample data.frame if present - if (!is.null(sampleVars)) { - sdf = data.frame(sample_data(physeq), stringsAsFactors = FALSE) + if(!is.null(sampleVars)){ + sdf = data.frame(sample_data(physeq), stringsAsFactors=FALSE) sdf$Sample <- sample_names(physeq) # merge the sample-data and the melted otu table - mdf <- merge(mdf, sdf, by.x = "Sample") + mdf <- merge(mdf, sdf, by.x="Sample") } # Next merge taxonomy data, if present - if (!is.null(rankNames)) { + if(!is.null(rankNames)){ TT = access(physeq, "tax_table") # First, check for empty TT columns (all NA) keepTTcols <- colSums(is.na(TT)) < ntaxa(TT) # Protect against all-empty columns, or col-less matrix - if (length(which(keepTTcols)) > 0 & ncol(TT) > 0) { + if(length(which(keepTTcols)) > 0 & ncol(TT) > 0){ # Remove the empty columns TT <- TT[, keepTTcols] - # Add TT to the 'psmelt' data.frame - tdf = data.frame(TT, OTU = taxa_names(physeq)) - # Now add to the 'psmelt' output data.frame, `mdf` - mdf <- merge(mdf, tdf, by.x = "OTU") + # Add TT to the "psmelt" data.frame + tdf = data.frame(TT, OTU=taxa_names(physeq)) + # Now add to the "psmelt" output data.frame, `mdf` + mdf <- merge(mdf, tdf, by.x="OTU") } } # Sort the entries by abundance - mdf = mdf[order(mdf$Abundance, decreasing = TRUE), ] + mdf = mdf[order(mdf$Abundance, decreasing=TRUE), ] return(mdf) } -################################################################################ +################################################################################ +################################################################################ #' A flexible, informative barplot phyloseq data #' #' There are many useful examples of phyloseq barplot graphics in the @@ -1476,7 +1517,7 @@ psmelt = function(physeq) { #' summary graphics of the differences in taxa abundance between samples in #' an experiment. #' -#' @usage plot_bar(physeq, x='Sample', y='Abundance', fill=NULL, +#' @usage plot_bar(physeq, x="Sample", y="Abundance", fill=NULL, #' title=NULL, facet_grid=NULL) #' #' @param physeq (Required). An \code{\link{otu_table-class}} or @@ -1490,7 +1531,7 @@ psmelt = function(physeq) { #' #' @param y (Optional). A character string. #' The variable in the melted-data that should be mapped to the y-axis. -#' Typically this will be \code{'Abundance'}, in order to +#' Typically this will be \code{"Abundance"}, in order to #' quantitatively display the abundance values for each OTU/group. #' However, alternative variables could be used instead, #' producing a very different, though possibly still informative, plot. @@ -1526,42 +1567,46 @@ psmelt = function(physeq) { #' @export #' #' @examples -#' data('GlobalPatterns') -#' gp.ch = subset_taxa(GlobalPatterns, Phylum == 'Chlamydiae') +#' data("GlobalPatterns") +#' gp.ch = subset_taxa(GlobalPatterns, Phylum == "Chlamydiae") #' plot_bar(gp.ch) -#' plot_bar(gp.ch, fill='Genus') -#' plot_bar(gp.ch, x='SampleType', fill='Genus') -#' plot_bar(gp.ch, 'SampleType', fill='Genus', facet_grid=~Family) +#' plot_bar(gp.ch, fill="Genus") +#' plot_bar(gp.ch, x="SampleType", fill="Genus") +#' plot_bar(gp.ch, "SampleType", fill="Genus", facet_grid=~Family) #' # See additional examples in the plot_bar online tutorial. Link above. -plot_bar = function(physeq, x = "Sample", y = "Abundance", fill = NULL, title = NULL, - facet_grid = NULL) { - - # Start by melting the data in the 'standard' way using psmelt. - mdf = psmelt(physeq) - - # Build the plot data structure - p = ggplot(mdf, aes_string(x = x, y = y, fill = fill)) - - # Add the bar geometric object. Creates a basic graphic. Basis for the rest. - # Test weather additional - p = p + geom_bar(stat = "identity", position = "stack", color = "black") - - # By default, rotate the x-axis labels (they might be long) - p = p + theme(axis.text.x = element_text(angle = -90, hjust = 0)) - - # Add faceting, if given - if (!is.null(facet_grid)) { - p <- p + facet_grid(facet_grid) - } - - # Optionally add a title to the plot - if (!is.null(title)) { - p <- p + ggtitle(title) - } - - return(p) +plot_bar = function(physeq, x="Sample", y="Abundance", fill=NULL, + title=NULL, facet_grid=NULL){ + + # Start by melting the data in the "standard" way using psmelt. + mdf = psmelt(physeq) + + # Build the plot data structure + p = ggplot(mdf, aes_string(x=x, y=y, fill=fill)) + + # Add the bar geometric object. Creates a basic graphic. Basis for the rest. + # Test weather additional + p = p + geom_bar(stat="identity", position="stack", color="black") + + # By default, rotate the x-axis labels (they might be long) + p = p + theme(axis.text.x=element_text(angle=-90, hjust=0)) + + # Add faceting, if given + if( !is.null(facet_grid) ){ + p <- p + facet_grid(facet_grid) + } + + # Optionally add a title to the plot + if( !is.null(title) ){ + p <- p + ggtitle(title) + } + + return(p) } -################################################################################ plot_tree section. +################################################################################ +################################################################################ +# plot_tree section. +################################################################################ +################################################################################ #' Returns a data table defining the line segments of a phylogenetic tree. #' #' This function takes a \code{\link{phylo}} or \code{\link{phyloseq-class}} object @@ -1574,16 +1619,16 @@ plot_bar = function(physeq, x = "Sample", y = "Abundance", fill = NULL, title = #' suitable for plotting with \code{\link[ggplot2]{ggplot}}2. #' #' @param ladderize (Optional). Boolean or character string (either -#' \code{FALSE}, \code{TRUE}, or \code{'left'}). +#' \code{FALSE}, \code{TRUE}, or \code{"left"}). #' Default is \code{FALSE} (no ladderization). #' This parameter specifies whether or not to \code{\link[ape]{ladderize}} the tree #' (i.e., reorder nodes according to the depth of their enclosed #' subtrees) prior to plotting. #' This tends to make trees more aesthetically pleasing and legible in #' a graphical display. -#' When \code{TRUE} or \code{'right'}, ``right'' ladderization is used. +#' When \code{TRUE} or \code{"right"}, ``right'' ladderization is used. #' When set to \code{FALSE}, no ladderization is applied. -#' When set to \code{'left'}, the reverse direction +#' When set to \code{"left"}, the reverse direction #' (``left'' ladderization) is applied. #' #' @return @@ -1610,10 +1655,10 @@ plot_bar = function(physeq, x = "Sample", y = "Abundance", fill = NULL, title = #' @importFrom data.table setkey #' @export #' @examples -#' library('ggplot2') -#' data('esophagus') +#' library("ggplot2") +#' data("esophagus") #' phy = phy_tree(esophagus) -#' phy <- ape::root(phy, '65_2_5', resolve.root=TRUE) +#' phy <- ape::root(phy, "65_2_5", resolve.root=TRUE) #' treeSegs0 = tree_layout(phy) #' treeSegs1 = tree_layout(esophagus) #' edgeMap = aes(x=xleft, xend=xright, y=y, yend=y) @@ -1622,32 +1667,32 @@ plot_bar = function(physeq, x = "Sample", y = "Abundance", fill = NULL, title = #' p1 = ggplot(treeSegs1$edgeDT, edgeMap) + geom_segment() + geom_segment(vertMap, data=treeSegs1$vertDT) #' print(p0) #' print(p1) -#' plot_tree(esophagus, 'treeonly') -#' plot_tree(esophagus, 'treeonly', ladderize='left') -tree_layout = function(phy, ladderize = FALSE) { - if (inherits(phy, "phyloseq")) { +#' plot_tree(esophagus, "treeonly") +#' plot_tree(esophagus, "treeonly", ladderize="left") +tree_layout = function(phy, ladderize=FALSE){ + if(inherits(phy, "phyloseq")){ phy = phy_tree(phy) } - if (!inherits(phy, "phylo")) { + if(!inherits(phy, "phylo")){ stop("tree missing or invalid. Please check `phy` argument and try again.") } - if (is.null(phy$edge.length)) { + if(is.null(phy$edge.length)){ # If no edge lengths, set them all to value of 1 (dendrogram). - phy$edge.length <- rep(1L, times = nrow(phy$edge)) + phy$edge.length <- rep(1L, times=nrow(phy$edge)) } # Perform ladderizing, if requested - if (ladderize != FALSE) { - if (ladderize == "left") { + if(ladderize != FALSE){ + if(ladderize == "left"){ phy <- ladderize(phy, FALSE) - } else if (ladderize == TRUE | ladderize == "right") { + } else if(ladderize==TRUE | ladderize=="right"){ phy <- ladderize(phy, TRUE) } else { stop("You did not specify a supported option for argument `ladderize`.") } } - # 'z' is the tree in postorder order used in calls to .C Descending order of - # left-hand side of edge (the ancestor to the node) - z = reorder.phylo(phy, order = "postorder") + # 'z' is the tree in postorder order used in calls to .C + # Descending order of left-hand side of edge (the ancestor to the node) + z = reorder.phylo(phy, order="postorder") # Initialize some characteristics of the tree. Nedge = nrow(phy$edge)[1] Nnode = phy$Nnode @@ -1656,83 +1701,85 @@ tree_layout = function(phy, ladderize = FALSE) { TIPS = phy$edge[(phy$edge[, 2] <= Ntip), 2] NODES = (ROOT):(Ntip + Nnode) nodelabels = phy$node.label - # Call phyloseq-internal function that in-turn calls ape's internal horizontal - # position function, in C, using the re-ordered phylo object. + # Call phyloseq-internal function that in-turn calls ape's internal + # horizontal position function, in C, using the re-ordered phylo object. xx = ape_node_depth_edge_length(Ntip, Nnode, z$edge, Nedge, z$edge.length) # Initialize `yy`, before passing to ape internal function in C. yy <- numeric(Ntip + Nnode) yy[TIPS] <- 1:Ntip # Define the ape_node_height wrapping function - ape_node_height <- function(Ntip, Nnode, edge, Nedge, yy) { - .C(ape:::node_height, PACKAGE = "ape", as.integer(Ntip), as.integer(Nnode), - as.integer(edge[, 1]), as.integer(edge[, 2]), as.integer(Nedge), as.double(yy))[[6]] - } - # The call in ape yy <- .nodeHeight(Ntip, Nnode, z$edge, Nedge, yy) + ape_node_height <- function(Ntip, Nnode, edge, Nedge, yy){ + .C(ape:::node_height, PACKAGE="ape", + as.integer(Ntip), as.integer(Nnode), + as.integer(edge[, 1]), as.integer(edge[, 2]), + as.integer(Nedge), as.double(yy))[[6]] + } + # The call in ape + #yy <- .nodeHeight(Ntip, Nnode, z$edge, Nedge, yy) yy <- ape_node_height(Ntip, Nnode, z$edge, Nedge, yy) - # Initialize an edge data.table Don't set key, order matters - edgeDT = data.table(phy$edge, edge.length = phy$edge.length, OTU = NA_character_) + # Initialize an edge data.table + # Don't set key, order matters + edgeDT = data.table(phy$edge, edge.length=phy$edge.length, OTU=NA_character_) # Add tip.labels if present - if (!is.null(phy$tip.label)) { + if(!is.null(phy$tip.label)){ # Initialize OTU, set node (V2) as key, assign taxa_names as OTU label - edgeDT[, `:=`(OTU, NA_character_)] + edgeDT[, OTU:=NA_character_] setkey(edgeDT, V2) - edgeDT[V2 <= Ntip, `:=`(OTU, phy$tip.label)] + edgeDT[V2 <= Ntip, OTU:=phy$tip.label] } - # Add the mapping for each edge defined in `xx` and `yy` - edgeDT[, `:=`(xleft, xx[V1])] - edgeDT[, `:=`(xright, xx[V2])] - edgeDT[, `:=`(y, yy[V2])] + # Add the mapping for each edge defined in `xx` and `yy` + edgeDT[, xleft:=xx[V1]] + edgeDT[, xright:=xx[V2]] + edgeDT[, y:=yy[V2]] # Next define vertical segments - vertDT = edgeDT[, list(x = xleft[1], vmin = min(y), vmax = max(y)), by = V1, - mult = "last"] - if (!is.null(phy$node.label)) { + vertDT = edgeDT[, list(x=xleft[1], vmin=min(y), vmax=max(y)), by=V1, mult="last"] + if(!is.null(phy$node.label)){ # Add non-root node labels to edgeDT - edgeDT[V2 > ROOT, `:=`(x, xright)] - edgeDT[V2 > ROOT, `:=`(label, phy$node.label[-1])] + edgeDT[V2 > ROOT, x:=xright] + edgeDT[V2 > ROOT, label:=phy$node.label[-1]] # Add root label (first node label) to vertDT setkey(vertDT, V1) - vertDT[J(ROOT), `:=`(y, mean(c(vmin, vmax)))] - vertDT[J(ROOT), `:=`(label, phy$node.label[1])] + vertDT[J(ROOT), y:=mean(c(vmin, vmax))] + vertDT[J(ROOT), label:=phy$node.label[1]] } - return(list(edgeDT = edgeDT, vertDT = vertDT)) + return(list(edgeDT=edgeDT, vertDT=vertDT)) } -################################################################################ Define an internal function for determining what the text-size should be +################################################################################ +# Define an internal function for determining what the text-size should be #' @keywords internal -manytextsize <- function(n, mins = 0.5, maxs = 4, B = 6, D = 100) { - # empirically selected size-value calculator. - s <- B * exp(-n/D) - # enforce a floor. - s <- ifelse(s > mins, s, mins) - # enforce a max - s <- ifelse(s < maxs, s, maxs) - return(s) +manytextsize <- function(n, mins=0.5, maxs=4, B=6, D=100){ + # empirically selected size-value calculator. + s <- B * exp(-n/D) + # enforce a floor. + s <- ifelse(s > mins, s, mins) + # enforce a max + s <- ifelse(s < maxs, s, maxs) + return(s) } -################################################################################ Return TRUE if the nodes of the tree in the phyloseq object provided are -################################################################################ unlabeled. +################################################################################ +# Return TRUE if the nodes of the tree in the phyloseq object provided are unlabeled. #' @keywords internal -nodesnotlabeled = function(physeq) { - if (is.null(phy_tree(physeq, FALSE))) { - warning("There is no phylogenetic tree in the object you have provided. Try `phy_tree(physeq)` to see.") - return(TRUE) - } else { - return(is.null(phy_tree(physeq)$node.label) | length(phy_tree(physeq)$node.label) == - 0L) - } +nodesnotlabeled = function(physeq){ + if(is.null(phy_tree(physeq, FALSE))){ + warning("There is no phylogenetic tree in the object you have provided. Try `phy_tree(physeq)` to see.") + return(TRUE) + } else { + return(is.null(phy_tree(physeq)$node.label) | length(phy_tree(physeq)$node.label)==0L) + } } -# A quick test function to decide how nodes should be labeled by default, if at -# all. +# A quick test function to decide how nodes should be labeled by default, if at all. +# #' @keywords internal -howtolabnodes = function(physeq) { - if (!nodesnotlabeled(physeq)) { - # If the nodes are labeled, use a version of this function, taking into account - # `ntaxa`. - return(nodeplotdefault(manytextsize(ntaxa(physeq)))) - } else { +howtolabnodes = function(physeq){ + if(!nodesnotlabeled(physeq)){ + # If the nodes are labeled, use a version of this function, taking into account `ntaxa`. + return(nodeplotdefault(manytextsize(ntaxa(physeq)))) + } else { # Else, use `nodeplotblank`, which returns the ggplot object as-is. - return(nodeplotblank) - } + return(nodeplotblank) + } } -################################################################################ +################################################################################ #' Function to avoid plotting node labels #' #' Unlike, \code{\link{nodeplotdefault}} and \code{\link{nodeplotboot}}, @@ -1763,13 +1810,13 @@ howtolabnodes = function(physeq) { #' @import ggplot2 #' @export #' @examples -#' data('esophagus') +#' data("esophagus") #' plot_tree(esophagus) #' plot_tree(esophagus, nodelabf=nodeplotblank) -nodeplotblank = function(p, nodelabdf) { - return(p) +nodeplotblank = function(p, nodelabdf){ + return(p) } -################################################################################ +################################################################################ #' Generates a function for labeling bootstrap values on a phylogenetic tree. #' #' Is not a labeling function itself, but returns one. @@ -1824,36 +1871,36 @@ nodeplotblank = function(p, nodelabdf) { #' @examples #' nodeplotboot() #' nodeplotboot(3, -0.4) -nodeplotboot = function(highthresh = 95L, lowcthresh = 50L, size = 2L, hjust = -0.2) { - function(p, nodelabdf) { - # For bootstrap, check that the node labels can be coerced to numeric - try(boot <- as(as(nodelabdf$label, "character"), "numeric"), TRUE) - # Want NAs/NaN to propagate, but still need to test remainder - goodboot = boot[complete.cases(boot)] - if (!is(goodboot, "numeric") & length(goodboot) > 0) { - stop("The node labels, phy_tree(physeq)$node.label, are not coercable to a numeric vector with any elements.") - } - # So they look even more like bootstraps and display well, force them to be - # between 0 and 100, rounded to 2 digits. - if (all(goodboot >= 0 & goodboot <= 1)) { - boot = round(boot, 2) * 100L - } - nodelabdf$boot = boot - boottop = subset(nodelabdf, boot >= highthresh) - bootmid = subset(nodelabdf, boot > lowcthresh & boot < highthresh) - # Label the high-confidence nodes with a point. - if (nrow(boottop) > 0L) { - p = p + geom_point(mapping = aes(x = x, y = y), data = boottop, na.rm = TRUE) - } - # Label the remaining bootstrap values as text at the nodes. - if (nrow(bootmid) > 0L) { - bootmid$label = bootmid$boot - p = nodeplotdefault(size, hjust)(p, bootmid) - } - return(p) - } +nodeplotboot = function(highthresh=95L, lowcthresh=50L, size=2L, hjust=-0.2){ + function(p, nodelabdf){ + # For bootstrap, check that the node labels can be coerced to numeric + try(boot <- as(as(nodelabdf$label, "character"), "numeric"), TRUE) + # Want NAs/NaN to propagate, but still need to test remainder + goodboot = boot[complete.cases(boot)] + if( !is(goodboot, "numeric") & length(goodboot) > 0 ){ + stop("The node labels, phy_tree(physeq)$node.label, are not coercable to a numeric vector with any elements.") + } + # So they look even more like bootstraps and display well, + # force them to be between 0 and 100, rounded to 2 digits. + if( all( goodboot >= 0.0 & goodboot <= 1.0 ) ){ + boot = round(boot, 2)*100L + } + nodelabdf$boot = boot + boottop = subset(nodelabdf, boot >= highthresh) + bootmid = subset(nodelabdf, boot > lowcthresh & boot < highthresh) + # Label the high-confidence nodes with a point. + if( nrow(boottop)>0L ){ + p = p + geom_point(mapping=aes(x=x, y=y), data=boottop, na.rm=TRUE) + } + # Label the remaining bootstrap values as text at the nodes. + if( nrow(bootmid)>0L ){ + bootmid$label = bootmid$boot + p = nodeplotdefault(size, hjust)(p, bootmid) + } + return(p) + } } -################################################################################ +################################################################################ #' Generates a default node-label function #' #' Is not a labeling function itself, but returns one. @@ -1895,14 +1942,14 @@ nodeplotboot = function(highthresh = 95L, lowcthresh = 50L, size = 2L, hjust = - #' @examples #' nodeplotdefault() #' nodeplotdefault(3, -0.4) -nodeplotdefault = function(size = 2L, hjust = -0.2) { - function(p, nodelabdf) { - p = p + geom_text(mapping = aes(x = x, y = y, label = label), data = nodelabdf, - size = size, hjust = hjust, na.rm = TRUE) - return(p) - } +nodeplotdefault = function(size=2L, hjust=-0.2){ + function(p, nodelabdf){ + p = p + geom_text(mapping=aes(x=x, y=y, label=label), data=nodelabdf, + size=size, hjust=hjust, na.rm=TRUE) + return(p) + } } -################################################################################ +################################################################################ #' Plot a phylogenetic tree with optional annotations #' #' There are many useful examples of phyloseq tree graphics in the @@ -1935,11 +1982,11 @@ nodeplotdefault = function(size = 2L, hjust = -0.2) { #' the \code{physeq} argument should also have a \code{\link{sample_data}} #' and/or \code{\link{tax_table}} component(s). #' -#' @param method (Optional). Character string. Default \code{'sampledodge'}. +#' @param method (Optional). Character string. Default \code{"sampledodge"}. #' The name of the annotation method to use. #' This will be expanded in future versions. -#' Currently only \code{'sampledodge'} and \code{'treeonly'} are supported. -#' The \code{'sampledodge'} option results in points +#' Currently only \code{"sampledodge"} and \code{"treeonly"} are supported. +#' The \code{"sampledodge"} option results in points #' drawn next to leaves if individuals from that taxa were observed, #' and a separate point is drawn for each sample. #' @@ -1966,7 +2013,7 @@ nodeplotdefault = function(size = 2L, hjust = -0.2) { #' #' @param size (Optional). Character string. Default \code{NULL}. #' The name of the variable in \code{physeq} to map to point size. -#' A special argument \code{'abundance'} is reserved here and scales +#' A special argument \code{"abundance"} is reserved here and scales #' point size using abundance in each sample on a log scale. #' Supported options here also include the reserved special variables #' of \code{\link{psmelt}}. @@ -1980,7 +2027,7 @@ nodeplotdefault = function(size = 2L, hjust = -0.2) { #' #' @param label.tips (Optional). Character string. Default is \code{NULL}, #' indicating that no tip labels will be printed. -#' If \code{'taxa_names'}, then the name of the taxa will be added +#' If \code{"taxa_names"}, then the name of the taxa will be added #' to the tree; either next to the leaves, or next to #' the set of points that label the leaves. Alternatively, #' if this is one of the rank names (from \code{rank_names(physeq)}), @@ -2012,16 +2059,16 @@ nodeplotdefault = function(size = 2L, hjust = -0.2) { #' shrink this value. #' #' @param ladderize (Optional). Boolean or character string (either -#' \code{FALSE}, \code{TRUE}, or \code{'left'}). +#' \code{FALSE}, \code{TRUE}, or \code{"left"}). #' Default is \code{FALSE}. #' This parameter specifies whether or not to \code{\link[ape]{ladderize}} the tree #' (i.e., reorder nodes according to the depth of their enclosed #' subtrees) prior to plotting. #' This tends to make trees more aesthetically pleasing and legible in #' a graphical display. -#' When \code{TRUE} or \code{'right'}, ``right'' ladderization is used. +#' When \code{TRUE} or \code{"right"}, ``right'' ladderization is used. #' When set to \code{FALSE}, no ladderization is applied. -#' When set to \code{'left'}, the reverse direction +#' When set to \code{"left"}, the reverse direction #' (``left'' ladderization) is applied. #' This argument is passed on to \code{\link{tree_layout}}. #' @@ -2046,7 +2093,7 @@ nodeplotdefault = function(size = 2L, hjust = -0.2) { #' #' @param justify (Optional). A character string indicating the #' type of justification to use on dodged points and tip labels. -#' A value of \code{'jagged'}, the default, results in +#' A value of \code{"jagged"}, the default, results in #' these tip-mapped elements being spaced as close to the tips as possible #' without gaps. #' Currently, any other value for \code{justify} results in @@ -2068,218 +2115,218 @@ nodeplotdefault = function(size = 2L, hjust = -0.2) { #' @examples #' # # Using plot_tree() with the esophagus dataset. #' # # Please note that many more interesting examples are shown -#' # # in the online tutorials' +#' # # in the online tutorials" #' # # http://joey711.github.io/phyloseq/plot_tree-examples #' data(esophagus) #' # plot_tree(esophagus) -#' # plot_tree(esophagus, color='Sample') -#' # plot_tree(esophagus, size='Abundance') -#' # plot_tree(esophagus, size='Abundance', color='samples') -#' plot_tree(esophagus, size='Abundance', color='Sample', base.spacing=0.03) -################################################################################ -#' plot_tree(esophagus, size='abundance', color='samples', base.spacing=0.03) -plot_tree = function(physeq, method = "sampledodge", nodelabf = NULL, color = NULL, - shape = NULL, size = NULL, min.abundance = Inf, label.tips = NULL, text.size = NULL, - sizebase = 5, base.spacing = 0.02, ladderize = FALSE, plot.margin = 0.2, title = NULL, - treetheme = NULL, justify = "jagged") { - ######################################## Support mis-capitalization of reserved variable names in color, shape, size - ######################################## This helps, for instance, with backward-compatibility where 'abundance' was the - ######################################## reserved variable name for mapping OTU abundance entries - fix_reserved_vars = function(aesvar) { - aesvar <- gsub("^abundance[s]{0,}$", "Abundance", aesvar, ignore.case = TRUE) - aesvar <- gsub("^OTU[s]{0,}$", "OTU", aesvar, ignore.case = TRUE) - aesvar <- gsub("^taxa_name[s]{0,}$", "OTU", aesvar, ignore.case = TRUE) - aesvar <- gsub("^sample[s]{0,}$", "Sample", aesvar, ignore.case = TRUE) +#' # plot_tree(esophagus, color="Sample") +#' # plot_tree(esophagus, size="Abundance") +#' # plot_tree(esophagus, size="Abundance", color="samples") +#' plot_tree(esophagus, size="Abundance", color="Sample", base.spacing=0.03) +################################################################################ +#' plot_tree(esophagus, size="abundance", color="samples", base.spacing=0.03) +plot_tree = function(physeq, method="sampledodge", nodelabf=NULL, + color=NULL, shape=NULL, size=NULL, + min.abundance=Inf, label.tips=NULL, text.size=NULL, + sizebase=5, base.spacing = 0.02, + ladderize=FALSE, plot.margin=0.2, title=NULL, + treetheme=NULL, justify="jagged"){ + ######################################## + # Support mis-capitalization of reserved variable names in color, shape, size + # This helps, for instance, with backward-compatibility where "abundance" + # was the reserved variable name for mapping OTU abundance entries + fix_reserved_vars = function(aesvar){ + aesvar <- gsub("^abundance[s]{0,}$", "Abundance", aesvar, ignore.case=TRUE) + aesvar <- gsub("^OTU[s]{0,}$", "OTU", aesvar, ignore.case=TRUE) + aesvar <- gsub("^taxa_name[s]{0,}$", "OTU", aesvar, ignore.case=TRUE) + aesvar <- gsub("^sample[s]{0,}$", "Sample", aesvar, ignore.case=TRUE) return(aesvar) } - if (!is.null(label.tips)) { - label.tips <- fix_reserved_vars(label.tips) - } - if (!is.null(color)) { - color <- fix_reserved_vars(color) - } - if (!is.null(shape)) { - shape <- fix_reserved_vars(shape) - } - if (!is.null(size)) { - size <- fix_reserved_vars(size) - } - ######################################## - if (is.null(phy_tree(physeq, FALSE))) { - stop("There is no phylogenetic tree in the object you have provided.\n", - "Try phy_tree(physeq) to see for yourself.") + if(!is.null(label.tips)){label.tips <- fix_reserved_vars(label.tips)} + if(!is.null(color)){color <- fix_reserved_vars(color)} + if(!is.null(shape)){shape <- fix_reserved_vars(shape)} + if(!is.null(size) ){size <- fix_reserved_vars(size)} + ######################################## + if( is.null(phy_tree(physeq, FALSE)) ){ + stop("There is no phylogenetic tree in the object you have provided.\n", + "Try phy_tree(physeq) to see for yourself.") } - if (!inherits(physeq, "phyloseq")) { + if(!inherits(physeq, "phyloseq")){ # If only a phylogenetic tree, then only tree available to overlay. method <- "treeonly" } # Create the tree data.table - treeSegs <- tree_layout(phy_tree(physeq), ladderize = ladderize) - edgeMap = aes(x = xleft, xend = xright, y = y, yend = y) - vertMap = aes(x = x, xend = x, y = vmin, yend = vmax) - # Initialize phylogenetic tree. Naked, lines-only, unannotated tree as first - # layers. Edge (horiz) first, then vertical. - p = ggplot(data = treeSegs$edgeDT) + geom_segment(edgeMap) + geom_segment(vertMap, - data = treeSegs$vertDT) - # If no text.size given, calculate it from number of tips ('species', aka taxa) + treeSegs <- tree_layout(phy_tree(physeq), ladderize=ladderize) + edgeMap = aes(x=xleft, xend=xright, y=y, yend=y) + vertMap = aes(x=x, xend=x, y=vmin, yend=vmax) + # Initialize phylogenetic tree. + # Naked, lines-only, unannotated tree as first layers. Edge (horiz) first, then vertical. + p = ggplot(data=treeSegs$edgeDT) + geom_segment(edgeMap) + + geom_segment(vertMap, data=treeSegs$vertDT) + # If no text.size given, calculate it from number of tips ("species", aka taxa) # This is very fast. No need to worry about whether text is printed or not. - if (is.null(text.size)) { + if(is.null(text.size)){ text.size <- manytextsize(ntaxa(physeq)) } # Add the species labels to the right. - if (!is.null(label.tips) & method != "sampledodge") { + if(!is.null(label.tips) & method!="sampledodge"){ # If method is sampledodge, then labels are added to the right of points, later. # Add labels layer to plotting object. labelDT = treeSegs$edgeDT[!is.na(OTU), ] - if (!is.null(tax_table(object = physeq, errorIfNULL = FALSE))) { + if(!is.null(tax_table(object=physeq, errorIfNULL=FALSE))){ # If there is a taxonomy available, merge it with the label data.table - taxDT = data.table(tax_table(physeq), OTU = taxa_names(physeq), key = "OTU") + taxDT = data.table(tax_table(physeq), OTU=taxa_names(physeq), key="OTU") # Merge with taxonomy. - labelDT = merge(x = labelDT, y = taxDT, by = "OTU") + labelDT = merge(x=labelDT, y=taxDT, by="OTU") } - if (justify == "jagged") { - # Tip label aesthetic mapping. Aesthetics can be NULL, and that aesthetic gets - # ignored. - labelMap <- aes_string(x = "xright", y = "y", label = label.tips, color = color) + if(justify=="jagged"){ + # Tip label aesthetic mapping. + # Aesthetics can be NULL, and that aesthetic gets ignored. + labelMap <- aes_string(x="xright", y="y", label=label.tips, color=color) } else { # The left-justified version of tip-labels. - labelMap <- aes_string(x = "max(xright, na.rm=TRUE)", y = "y", label = label.tips, - color = color) + labelMap <- aes_string(x="max(xright, na.rm=TRUE)", y="y", label=label.tips, color=color) } - p <- p + geom_text(labelMap, data = labelDT, size = I(text.size), hjust = -0.1, - na.rm = TRUE) - } - # Node label section. If no nodelabf ('node label function') given, ask internal - # function to pick one. Is NULL by default, meaning will dispatch to - # `howtolabnodes` to select function. For no node labels, the 'dummy' function - # `nodeplotblank` will return tree plot object, p, as-is, unmodified. - if (is.null(nodelabf)) { + p <- p + geom_text(labelMap, data=labelDT, size=I(text.size), hjust=-0.1, na.rm=TRUE) + } + # Node label section. + # + # If no nodelabf ("node label function") given, ask internal function to pick one. + # Is NULL by default, meaning will dispatch to `howtolabnodes` to select function. + # For no node labels, the "dummy" function `nodeplotblank` will return tree plot + # object, p, as-is, unmodified. + if(is.null(nodelabf)){ nodelabf = howtolabnodes(physeq) } - #### set node `y` as the mean of the vertical segment Use the provided/inferred node - #### label function to add the node labels layer(s) Non-root nodes first + #### set node `y` as the mean of the vertical segment + # Use the provided/inferred node label function to add the node labels layer(s) + # Non-root nodes first p = nodelabf(p, treeSegs$edgeDT[!is.na(label), ]) # Add root label (if present) p = nodelabf(p, treeSegs$vertDT[!is.na(label), ]) # Theme specification - if (is.null(treetheme)) { + if(is.null(treetheme)){ # If NULL, then use the default tree theme. - treetheme <- theme(axis.ticks = element_blank(), axis.title.x = element_blank(), - axis.text.x = element_blank(), axis.title.y = element_blank(), axis.text.y = element_blank(), - panel.background = element_blank(), panel.grid.minor = element_blank(), - panel.grid.major = element_blank()) - } - if (inherits(treetheme, "theme")) { - # If a theme, add theme layer to plot. For all other cases, skip this, which - # will cause default theme to be used + treetheme <- theme(axis.ticks = element_blank(), + axis.title.x=element_blank(), axis.text.x=element_blank(), + axis.title.y=element_blank(), axis.text.y=element_blank(), + panel.background = element_blank(), + panel.grid.minor = element_blank(), + panel.grid.major = element_blank()) + } + if(inherits(treetheme, "theme")){ + # If a theme, add theme layer to plot. + # For all other cases, skip this, which will cause default theme to be used p <- p + treetheme } # Optionally add a title to the plot - if (!is.null(title)) { + if(!is.null(title)){ p <- p + ggtitle(title) - } - if (method != "sampledodge") { + } + if(method!="sampledodge"){ # If anything but a sampledodge tree, return now without further decorations. return(p) } - ######################################## Sample Dodge Section Special words, c('Sample', 'Abundance', 'OTU') See - ######################################## psmelt() Initialize the species/taxa/OTU data.table + ######################################## + # Sample Dodge Section + # Special words, c("Sample", "Abundance", "OTU") + # See psmelt() + ######################################## + # Initialize the species/taxa/OTU data.table dodgeDT = treeSegs$edgeDT[!is.na(OTU), ] # Merge with psmelt() result, to make all co-variables available - dodgeDT = merge(x = dodgeDT, y = data.table(psmelt(physeq), key = "OTU"), by = "OTU") - if (justify == "jagged") { + dodgeDT = merge(x=dodgeDT, y=data.table(psmelt(physeq), key="OTU"), by="OTU") + if(justify=="jagged"){ # Remove 0 Abundance value entries now, not later, for jagged. - dodgeDT <- dodgeDT[Abundance > 0, ] + dodgeDT <- dodgeDT[Abundance > 0, ] } # Set key. Changes `dodgeDT` in place. OTU is first key, always. - if (!is.null(color) | !is.null(shape) | !is.null(size)) { + if( !is.null(color) | !is.null(shape) | !is.null(size) ){ # If color, shape, or size is chosen, setkey by those as well - setkeyv(dodgeDT, cols = c("OTU", color, shape, size)) + setkeyv(dodgeDT, cols=c("OTU", color, shape, size)) } else { - # Else, set key by OTU and sample name. + # Else, set key by OTU and sample name. setkey(dodgeDT, OTU, Sample) } # Add sample-dodge horizontal adjustment index. In-place data.table assignment - dodgeDT[, `:=`(h.adj.index, 1:length(xright)), by = OTU] - # `base.spacing` is a user-input parameter. The sampledodge step size is based - # on this and the max `x` value - if (justify == "jagged") { - dodgeDT[, `:=`(xdodge, (xright + h.adj.index * base.spacing * max(xright, - na.rm = TRUE)))] + dodgeDT[, h.adj.index := 1:length(xright), by=OTU] + # `base.spacing` is a user-input parameter. + # The sampledodge step size is based on this and the max `x` value + if(justify=="jagged"){ + dodgeDT[, xdodge:=(xright + h.adj.index * base.spacing * max(xright, na.rm=TRUE))] } else { - # Left-justified version, `xdodge` always starts at the max of all `xright` - # values. - dodgeDT[, `:=`(xdodge, max(xright, na.rm = TRUE) + h.adj.index * base.spacing * - max(xright, na.rm = TRUE))] + # Left-justified version, `xdodge` always starts at the max of all `xright` values. + dodgeDT[, xdodge := max(xright, na.rm=TRUE) + h.adj.index * base.spacing * max(xright, na.rm=TRUE)] # zeroes removed only after all sample points have been mapped. dodgeDT <- dodgeDT[Abundance > 0, ] } - # The general tip-point map. Objects can be NULL, and that aesthetic gets - # ignored. - dodgeMap <- aes_string(x = "xdodge", y = "y", color = color, fill = color, shape = shape, - size = size) - p <- p + geom_point(dodgeMap, data = dodgeDT, na.rm = TRUE) + # The general tip-point map. Objects can be NULL, and that aesthetic gets ignored. + dodgeMap <- aes_string(x="xdodge", y="y", color=color, fill=color, + shape=shape, size=size) + p <- p + geom_point(dodgeMap, data=dodgeDT, na.rm=TRUE) # Adjust point size transform - if (!is.null(size)) { - p <- p + scale_size_continuous(trans = log_trans(sizebase)) - } - # Optionally-add abundance value label to each point. User controls this by the - # `min.abundance` parameter. A value of `Inf` implies no labels. - if (any(dodgeDT$Abundance >= min.abundance[1])) { - pointlabdf = dodgeDT[Abundance >= min.abundance[1], ] - p <- p + geom_text(mapping = aes(xdodge, y, label = Abundance), data = pointlabdf, - size = text.size, na.rm = TRUE) + if( !is.null(size) ){ + p <- p + scale_size_continuous(trans=log_trans(sizebase)) + } + # Optionally-add abundance value label to each point. + # User controls this by the `min.abundance` parameter. + # A value of `Inf` implies no labels. + if( any(dodgeDT$Abundance >= min.abundance[1]) ){ + pointlabdf = dodgeDT[Abundance>=min.abundance[1], ] + p <- p + geom_text(mapping=aes(xdodge, y, label=Abundance), + data=pointlabdf, size=text.size, na.rm=TRUE) } # If indicated, add the species labels to the right of dodged points. - if (!is.null(label.tips)) { - # `tiplabDT` has only one row per tip, the farthest horizontal adjusted position - # (one for each taxa) + if(!is.null(label.tips)){ + # `tiplabDT` has only one row per tip, the farthest horizontal + # adjusted position (one for each taxa) tiplabDT = dodgeDT - tiplabDT[, `:=`(xfartiplab, max(xdodge)), by = OTU] - tiplabDT <- tiplabDT[h.adj.index == 1, .SD, by = OTU] - if (!is.null(color)) { - if (color %in% sample_variables(physeq, errorIfNULL = FALSE)) { + tiplabDT[, xfartiplab:=max(xdodge), by=OTU] + tiplabDT <- tiplabDT[h.adj.index==1, .SD, by=OTU] + if(!is.null(color)){ + if(color %in% sample_variables(physeq, errorIfNULL=FALSE)){ color <- NULL } } labelMap <- NULL - if (justify == "jagged") { - labelMap <- aes_string(x = "xfartiplab", y = "y", label = label.tips, - color = color) + if(justify=="jagged"){ + labelMap <- aes_string(x="xfartiplab", y="y", label=label.tips, color=color) } else { - labelMap <- aes_string(x = "max(xfartiplab, na.rm=TRUE)", y = "y", label = label.tips, - color = color) + labelMap <- aes_string(x="max(xfartiplab, na.rm=TRUE)", y="y", label=label.tips, color=color) } # Add labels layer to plotting object. - p <- p + geom_text(labelMap, tiplabDT, size = I(text.size), hjust = -0.1, - na.rm = TRUE) - } - # Plot margins. Adjust the tree graphic plot margins. Helps to manually ensure - # that graphic elements aren't clipped, especially when there are long tip - # labels. - min.x <- -0.01 # + dodgeDT[, min(c(xleft))] - max.x <- dodgeDT[, max(xright, na.rm = TRUE)] - if ("xdodge" %in% names(dodgeDT)) { - max.x <- dodgeDT[, max(xright, xdodge, na.rm = TRUE)] - } - if (plot.margin > 0) { - max.x <- max.x * (1 + plot.margin) - } - p <- p + scale_x_continuous(limits = c(min.x, max.x)) + p <- p + geom_text(labelMap, tiplabDT, size=I(text.size), hjust=-0.1, na.rm=TRUE) + } + # Plot margins. + # Adjust the tree graphic plot margins. + # Helps to manually ensure that graphic elements aren't clipped, + # especially when there are long tip labels. + min.x <- -0.01 # + dodgeDT[, min(c(xleft))] + max.x <- dodgeDT[, max(xright, na.rm=TRUE)] + if("xdodge" %in% names(dodgeDT)){ + max.x <- dodgeDT[, max(xright, xdodge, na.rm=TRUE)] + } + if(plot.margin > 0){ + max.x <- max.x * (1.0 + plot.margin) + } + p <- p + scale_x_continuous(limits=c(min.x, max.x)) return(p) } -################################################################################ Adapted from NeatMap-package. Vectorized for speed and simplicity, also only -################################################################################ calculates theta and not r. +################################################################################ +################################################################################ +################################################################################ +# Adapted from NeatMap-package. +# Vectorized for speed and simplicity, also only calculates theta and not r. #' @keywords internal -RadialTheta <- function(pos) { - pos = as(pos, "matrix") - xc = mean(pos[, 1]) - yc = mean(pos[, 2]) - theta = atan2((pos[, 2] - yc), (pos[, 1] - xc)) - names(theta) <- rownames(pos) - return(theta) +RadialTheta <- function(pos){ + pos = as(pos, "matrix") + xc = mean(pos[, 1]) + yc = mean(pos[, 2]) + theta = atan2((pos[, 2] - yc), (pos[, 1] - xc)) + names(theta) <- rownames(pos) + return(theta) } -################################################################################ +################################################################################ #' Create an ecologically-organized heatmap using ggplot2 graphics #' #' There are many useful examples of phyloseq heatmap graphics in the @@ -2353,25 +2400,25 @@ RadialTheta <- function(pos) { #' @param low (Optional). A character string. An R color. #' See \code{?\link{colors}} for options support in R (there are lots). #' The color that represents the lowest non-zero value -#' in the heatmap. Default is a dark blue color, \code{'#000033'}. +#' in the heatmap. Default is a dark blue color, \code{"#000033"}. #' #' @param high (Optional). A character string. An R color. #' See \code{\link{colors}} for options support in R (there are lots). #' The color that will represent the highest -#' value in the heatmap. The default is \code{'#66CCFF'}. -#' Zero-values are treated as \code{NA}, and set to \code{'black'}, to represent +#' value in the heatmap. The default is \code{"#66CCFF"}. +#' Zero-values are treated as \code{NA}, and set to \code{"black"}, to represent #' a background color. #' #' @param na.value (Optional). A character string. An R color. #' See \code{\link{colors}} for options support in R (there are lots). #' The color to represent what is essentially the background of the plot, #' the non-observations that occur as \code{NA} or -#' \code{0} values in the abundance table. The default is \code{'black'}, which +#' \code{0} values in the abundance table. The default is \code{"black"}, which #' works well on computer-screen graphics devices, but may be a poor choice for -#' printers, in which case you might want this value to be \code{'white'}, and +#' printers, in which case you might want this value to be \code{"white"}, and #' reverse the values of \code{high} and \code{low}, above. #' -#' @param trans (Optional). \code{'trans'}-class transformer-definition object. +#' @param trans (Optional). \code{"trans"}-class transformer-definition object. #' A numerical transformer to use in #' the continuous color scale. See \code{\link[scales]{trans_new}} for details. #' The default is \code{\link{log_trans}(4)}. @@ -2446,203 +2493,209 @@ RadialTheta <- function(pos) { #' #' @export #' @examples -#' data('GlobalPatterns') -#' gpac <- subset_taxa(GlobalPatterns, Phylum=='Crenarchaeota') +#' data("GlobalPatterns") +#' gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota") #' # FYI, the base-R function uses a non-ecological ordering scheme, #' # but does add potentially useful hclust dendrogram to the sides... -#' gpac <- subset_taxa(GlobalPatterns, Phylum=='Crenarchaeota') +#' gpac <- subset_taxa(GlobalPatterns, Phylum=="Crenarchaeota") #' # Remove the nearly-empty samples (e.g. 10 reads or less) #' gpac = prune_samples(sample_sums(gpac) > 50, gpac) #' # Arbitrary order if method set to NULL -#' plot_heatmap(gpac, method=NULL, sample.label='SampleType', taxa.label='Family') +#' plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family") #' # Use ordination -#' plot_heatmap(gpac, sample.label='SampleType', taxa.label='Family') +#' plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family") #' # Use ordination for OTUs, but not sample-order -#' plot_heatmap(gpac, sample.label='SampleType', taxa.label='Family', sample.order='SampleType') +#' plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", sample.order="SampleType") #' # Specifying both orders omits any attempt to use ordination. The following should be the same. -#' p0 = plot_heatmap(gpac, sample.label='SampleType', taxa.label='Family', taxa.order='Phylum', sample.order='SampleType') -#' p1 = plot_heatmap(gpac, method=NULL, sample.label='SampleType', taxa.label='Family', taxa.order='Phylum', sample.order='SampleType') +#' p0 = plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", taxa.order="Phylum", sample.order="SampleType") +#' p1 = plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family", taxa.order="Phylum", sample.order="SampleType") #' #expect_equivalent(p0, p1) #' # Example: Order matters. Random ordering of OTU indices is difficult to interpret, even with structured sample order #' rando = sample(taxa_names(gpac), size=ntaxa(gpac), replace=FALSE) -#' plot_heatmap(gpac, method=NULL, sample.label='SampleType', taxa.label='Family', taxa.order=rando, sample.order='SampleType') +#' plot_heatmap(gpac, method=NULL, sample.label="SampleType", taxa.label="Family", taxa.order=rando, sample.order="SampleType") #' # # Select the edges of each axis. #' # First, arbitrary edge, ordering #' plot_heatmap(gpac, method=NULL) #' # Second, biological-ordering (instead of default ordination-ordering), but arbitrary edge -#' plot_heatmap(gpac, taxa.order='Family', sample.order='SampleType') +#' plot_heatmap(gpac, taxa.order="Family", sample.order="SampleType") #' # Third, biological ordering, selected edges -#' plot_heatmap(gpac, taxa.order='Family', sample.order='SampleType', first.taxa='546313', first.sample='NP2') +#' plot_heatmap(gpac, taxa.order="Family", sample.order="SampleType", first.taxa="546313", first.sample="NP2") #' # Fourth, add meaningful labels -#' plot_heatmap(gpac, sample.label='SampleType', taxa.label='Family', taxa.order='Family', sample.order='SampleType', first.taxa='546313', first.sample='NP2') -plot_heatmap <- function(physeq, method = "NMDS", distance = "bray", sample.label = NULL, - taxa.label = NULL, low = "#000033", high = "#66CCFF", na.value = "black", trans = log_trans(4), - max.label = 250, title = NULL, sample.order = NULL, taxa.order = NULL, first.sample = NULL, - first.taxa = NULL, ...) { - +#' plot_heatmap(gpac, sample.label="SampleType", taxa.label="Family", taxa.order="Family", sample.order="SampleType", first.taxa="546313", first.sample="NP2") +plot_heatmap <- function(physeq, method="NMDS", distance="bray", + sample.label=NULL, taxa.label=NULL, + low="#000033", high="#66CCFF", na.value="black", trans=log_trans(4), + max.label=250, title=NULL, sample.order=NULL, taxa.order=NULL, + first.sample=NULL, first.taxa=NULL, ...){ + # User-override ordering - if (!is.null(taxa.order) & length(taxa.order) == 1) { + if( !is.null(taxa.order) & length(taxa.order)==1 ){ # Assume `taxa.order` is a tax_table variable. Use it for ordering. rankcol = which(rank_names(physeq) %in% taxa.order) taxmat = as(tax_table(physeq)[, 1:rankcol], "matrix") - taxa.order = apply(taxmat, 1, paste, sep = "", collapse = "") + taxa.order = apply(taxmat, 1, paste, sep="", collapse="") names(taxa.order) <- taxa_names(physeq) - taxa.order = names(sort(taxa.order, na.last = TRUE)) + taxa.order = names(sort(taxa.order, na.last=TRUE)) } - if (!is.null(sample.order) & length(sample.order) == 1) { + if( !is.null(sample.order) & length(sample.order)==1 ){ # Assume `sample.order` is a sample variable. Use it for ordering. sample.order = as.character(get_variable(physeq, sample.order)) names(sample.order) <- sample_names(physeq) - sample.order = names(sort(sample.order, na.last = TRUE)) + sample.order = names(sort(sample.order, na.last=TRUE)) } - if (!is.null(method) & (is.null(taxa.order) | is.null(sample.order))) { - # Only attempt NeatMap if method is non-NULL & at least one of taxa.order and - # sample.order is not-yet defined. If both axes orders pre-defined by user, no - # need to perform ordination... + if( !is.null(method) & (is.null(taxa.order) | is.null(sample.order)) ){ + # Only attempt NeatMap if method is non-NULL & at least one of + # taxa.order and sample.order is not-yet defined. + # If both axes orders pre-defined by user, no need to perform ordination... - # Copy the approach from NeatMap by doing ordination on samples, but use - # phyloseq-wrapped distance/ordination procedures. Reorder by the angle in - # radial coordinates on the 2-axis plane. + # Copy the approach from NeatMap by doing ordination on samples, but use + # phyloseq-wrapped distance/ordination procedures. + # Reorder by the angle in radial coordinates on the 2-axis plane. - # In case of NMDS iterations, capture the output so it isn't dumped on - # standard-out - junk = capture.output(ps.ord <- ordinate(physeq, method, distance, ...), - file = NULL) - if (is.null(sample.order)) { + # In case of NMDS iterations, capture the output so it isn't dumped on standard-out + junk = capture.output( ps.ord <- ordinate(physeq, method, distance, ...), file=NULL) + if( is.null(sample.order) ){ siteDF = NULL # Only define new ord-based sample order if user did not define one already - trash1 = try({ - siteDF <- scores(ps.ord, choices = c(1, 2), display = "sites", physeq = physeq) - }, silent = TRUE) - if (inherits(trash1, "try-error")) { + trash1 = try({siteDF <- scores(ps.ord, choices = c(1, 2), display="sites", physeq=physeq)}, + silent = TRUE) + if(inherits(trash1, "try-error")){ # warn that the attempt to get ordination coordinates for ordering failed. - warning("Attempt to access ordination coordinates for sample ordering failed.\n", - "Using default sample ordering.") + warning("Attempt to access ordination coordinates for sample ordering failed.\n", + "Using default sample ordering.") } - if (!is.null(siteDF)) { + if(!is.null(siteDF)){ # If the score accession seemed to work, go ahead and replace sample.order sample.order <- sample_names(physeq)[order(RadialTheta(siteDF))] } } - - if (is.null(taxa.order)) { - # re-order species/taxa/OTUs, if possible, and only if user did not define an - # order already - specDF = NULL - trash2 = try({ - specDF <- scores(ps.ord, choices = c(1, 2), display = "species", - physeq = physeq) - }, silent = TRUE) - if (inherits(trash2, "try-error")) { - # warn that the attempt to get ordination coordinates for ordering failed. - warning("Attempt to access ordination coordinates for feature/species/taxa/OTU ordering failed.\n", - "Using default feature/species/taxa/OTU ordering.") - } - if (!is.null(specDF)) { - # If the score accession seemed to work, go ahead and replace sample.order - taxa.order = taxa_names(physeq)[order(RadialTheta(specDF))] - } - } - } + + if( is.null(taxa.order) ){ + # re-order species/taxa/OTUs, if possible, + # and only if user did not define an order already + specDF = NULL + trash2 = try({specDF <- scores(ps.ord, choices=c(1, 2), display="species", physeq=physeq)}, + silent = TRUE) + if(inherits(trash2, "try-error")){ + # warn that the attempt to get ordination coordinates for ordering failed. + warning("Attempt to access ordination coordinates for feature/species/taxa/OTU ordering failed.\n", + "Using default feature/species/taxa/OTU ordering.") + } + if(!is.null(specDF)){ + # If the score accession seemed to work, go ahead and replace sample.order + taxa.order = taxa_names(physeq)[order(RadialTheta(specDF))] + } + } + } # Now that index orders are determined, check/assign edges of axes, if specified - if (!is.null(first.sample)) { + if( !is.null(first.sample) ){ sample.order = chunkReOrder(sample.order, first.sample) } - if (!is.null(first.taxa)) { + if( !is.null(first.taxa) ){ taxa.order = chunkReOrder(taxa.order, first.taxa) } - - # melt physeq with the standard user-accessible data melting function for - # creating plot-ready data.frames, psmelt. This is also used inside some of the - # other plot_* functions. - adf = psmelt(physeq) - # Coerce the main axis variables to character. Will reset them to factor if - # re-ordering is needed. - adf$OTU = as(adf$OTU, "character") - adf$Sample = as(adf$Sample, "character") - if (!is.null(sample.order)) { - # If sample-order is available, coerce to factor with special level-order - adf$Sample = factor(adf$Sample, levels = sample.order) - } else { - # Make sure it is a factor, but with default order/levels - adf$Sample = factor(adf$Sample) - } - if (!is.null(taxa.order)) { - # If OTU-order is available, coerce to factor with special level-order - adf$OTU = factor(adf$OTU, levels = taxa.order) - } else { - # Make sure it is a factor, but with default order/levels - adf$OTU = factor(adf$OTU) - } - - ## Now the plotting part Initialize p. - p = ggplot(adf, aes(x = Sample, y = OTU, fill = Abundance)) + geom_raster() - - # # Don't render labels if more than max.label Samples - if (nsamples(physeq) <= max.label) { - # Add resize layer for samples if there are fewer than max.label - p <- p + theme(axis.text.x = element_text(size = manytextsize(nsamples(physeq), - 4, 30, 12), angle = -90, vjust = 0.5, hjust = 0)) - } else { - # Remove the labels from any rendering. - p = p + scale_x_discrete("Sample", labels = "") - } - # OTUs - if (ntaxa(physeq) <= max.label) { - # Add resize layer for OTUs if there are fewer than max.label - p <- p + theme(axis.text.y = element_text(size = manytextsize(ntaxa(physeq), - 4, 30, 12))) - } else { - # Remove the labels from any rendering. - p = p + scale_y_discrete("OTU", labels = "") - } - - # # Axis Relabeling (Skipped if more than max.label): Re-write sample-labels to - # some sample variable... - if (!is.null(sample.label) & nsamples(physeq) <= max.label) { - # Make a sample-named char-vector of the values for sample.label - labvec = as(get_variable(physeq, sample.label), "character") - names(labvec) <- sample_names(physeq) - if (!is.null(sample.order)) { - # Re-order according to sample.order - labvec = labvec[sample.order] - } - # Replace any NA (missing) values with '' instead. Avoid recycling labels. - labvec[is.na(labvec)] <- "" - # Add the sample.label re-labeling layer - p = p + scale_x_discrete(sample.label, labels = labvec) - } - if (!is.null(taxa.label) & ntaxa(physeq) <= max.label) { - # Make a OTU-named vector of the values for taxa.label - labvec <- as(tax_table(physeq)[, taxa.label], "character") - names(labvec) <- taxa_names(physeq) - if (!is.null(taxa.order)) { - # Re-order according to taxa.order - labvec <- labvec[taxa.order] - } - # Replace any NA (missing) values with '' instead. Avoid recycling labels. - labvec[is.na(labvec)] <- "" - # Add the taxa.label re-labeling layer - p = p + scale_y_discrete(taxa.label, labels = labvec) - } - - # Color scale transformations - if (!is.null(trans)) { - p = p + scale_fill_gradient(low = low, high = high, trans = trans, na.value = na.value) - } else { - p = p + scale_fill_gradient(low = low, high = high, na.value = na.value) - } - - # Optionally add a title to the plot - if (!is.null(title)) { - p = p + ggtitle(title) - } - - return(p) + + # melt physeq with the standard user-accessible data melting function + # for creating plot-ready data.frames, psmelt. + # This is also used inside some of the other plot_* functions. + adf = psmelt(physeq) + # Coerce the main axis variables to character. + # Will reset them to factor if re-ordering is needed. + adf$OTU = as(adf$OTU, "character") + adf$Sample = as(adf$Sample, "character") + if( !is.null(sample.order) ){ + # If sample-order is available, coerce to factor with special level-order + adf$Sample = factor(adf$Sample, levels=sample.order) + } else { + # Make sure it is a factor, but with default order/levels + adf$Sample = factor(adf$Sample) + } + if( !is.null(taxa.order) ){ + # If OTU-order is available, coerce to factor with special level-order + adf$OTU = factor(adf$OTU, levels=taxa.order) + } else { + # Make sure it is a factor, but with default order/levels + adf$OTU = factor(adf$OTU) + } + + ## Now the plotting part + # Initialize p. + p = ggplot(adf, aes(x = Sample, y = OTU, fill=Abundance)) + + geom_raster() + + # # Don't render labels if more than max.label + # Samples + if( nsamples(physeq) <= max.label ){ + # Add resize layer for samples if there are fewer than max.label + p <- p + theme( + axis.text.x = element_text( + size=manytextsize(nsamples(physeq), 4, 30, 12), + angle=-90, vjust=0.5, hjust=0 + ) + ) + } else { + # Remove the labels from any rendering. + p = p + scale_x_discrete("Sample", labels="") + } + # OTUs + if( ntaxa(physeq) <= max.label ){ + # Add resize layer for OTUs if there are fewer than max.label + p <- p + theme( + axis.text.y = element_text( + size=manytextsize(ntaxa(physeq), 4, 30, 12) + ) + ) + } else { + # Remove the labels from any rendering. + p = p + scale_y_discrete("OTU", labels="") + } + + # # Axis Relabeling (Skipped if more than max.label): + # Re-write sample-labels to some sample variable... + if( !is.null(sample.label) & nsamples(physeq) <= max.label){ + # Make a sample-named char-vector of the values for sample.label + labvec = as(get_variable(physeq, sample.label), "character") + names(labvec) <- sample_names(physeq) + if( !is.null(sample.order) ){ + # Re-order according to sample.order + labvec = labvec[sample.order] + } + # Replace any NA (missing) values with "" instead. Avoid recycling labels. + labvec[is.na(labvec)] <- "" + # Add the sample.label re-labeling layer + p = p + scale_x_discrete(sample.label, labels=labvec) + } + if( !is.null(taxa.label) & ntaxa(physeq) <= max.label){ + # Make a OTU-named vector of the values for taxa.label + labvec <- as(tax_table(physeq)[, taxa.label], "character") + names(labvec) <- taxa_names(physeq) + if( !is.null(taxa.order) ){ + # Re-order according to taxa.order + labvec <- labvec[taxa.order] + } + # Replace any NA (missing) values with "" instead. Avoid recycling labels. + labvec[is.na(labvec)] <- "" + # Add the taxa.label re-labeling layer + p = p + scale_y_discrete(taxa.label, labels=labvec) + } + + # Color scale transformations + if( !is.null(trans) ){ + p = p + scale_fill_gradient(low=low, high=high, trans=trans, na.value=na.value) + } else { + p = p + scale_fill_gradient(low=low, high=high, na.value=na.value) + } + + # Optionally add a title to the plot + if( !is.null(title) ){ + p = p + ggtitle(title) + } + + return(p) } -################################################################################ +################################################################################ #' Chunk re-order a vector so that specified newstart is first. #' #' Different than relevel. @@ -2664,36 +2717,34 @@ plot_heatmap <- function(physeq, method = "NMDS", distance = "bray", sample.labe #' # # This is also the default #' # all(chunkReOrder(10:25) == 10:25) #' # # An example with characters -#' # chunkReOrder(LETTERS, 'G') -#' # chunkReOrder(LETTERS, 'B') -#' # chunkReOrder(LETTERS, 'Z') +#' # chunkReOrder(LETTERS, "G") +#' # chunkReOrder(LETTERS, "B") +#' # chunkReOrder(LETTERS, "Z") #' # # What about when `newstart` is not in `x`? Return x as-is, throw warning. -#' # chunkReOrder(LETTERS, 'g') -chunkReOrder = function(x, newstart = x[[1]]) { +#' # chunkReOrder(LETTERS, "g") +chunkReOrder = function(x, newstart = x[[1]]){ pivot = match(newstart[1], x, nomatch = NA) # If pivot `is.na`, throw warning, return x as-is - if (is.na(pivot)) { + if(is.na(pivot)){ warning("The `newstart` argument was not in `x`. Returning `x` without reordering.") newx = x } else { - newx = c(tail(x, { - length(x) - pivot + 1 - }), head(x, pivot - 1L)) + newx = c(tail(x, {length(x) - pivot + 1}), head(x, pivot - 1L)) } return(newx) } -################################################################################ +################################################################################ #' Create a ggplot summary of gap statistic results #' #' @param clusgap (Required). -#' An object of S3 class \code{'clusGap'}, basically a list with components. +#' An object of S3 class \code{"clusGap"}, basically a list with components. #' See the \code{\link[cluster]{clusGap}} documentation for more details. #' In most cases this will be the output of \code{\link{gapstat_ord}}, #' or \code{\link[cluster]{clusGap}} if you called it directly. #' #' @param title (Optional). Character string. #' The main title for the graphic. -#' Default is \code{'Gap Statistic results'}. +#' Default is \code{"Gap Statistic results"}. #' #' @return #' A \code{\link[ggplot2]{ggplot}} plot object. @@ -2711,32 +2762,32 @@ chunkReOrder = function(x, newstart = x[[1]]) { #' @export #' @examples #' # Load and process data -#' data('soilrep') +#' data("soilrep") #' soilr = rarefy_even_depth(soilrep, rngseed=888) #' print(soilr) #' sample_variables(soilr) #' # Ordination -#' sord = ordinate(soilr, 'DCA') +#' sord = ordinate(soilr, "DCA") #' # Gap Statistic #' gs = gapstat_ord(sord, axes=1:4, verbose=FALSE) #' # Evaluate results with plots, etc. #' plot_scree(sord) -#' plot_ordination(soilr, sord, color='Treatment') +#' plot_ordination(soilr, sord, color="Treatment") #' plot_clusgap(gs) -#' print(gs, method='Tibs2001SEmax') +#' print(gs, method="Tibs2001SEmax") #' # Non-ordination example, use cluster::clusGap function directly -#' library('cluster') +#' library("cluster") #' pam1 = function(x, k){list(cluster = pam(x, k, cluster.only=TRUE))} #' gs.pam.RU = clusGap(ruspini, FUN = pam1, K.max = 8, B = 60) #' gs.pam.RU -#' plot(gs.pam.RU, main = 'Gap statistic for the 'ruspini' data') -#' mtext('k = 4 is best .. and k = 5 pretty close') +#' plot(gs.pam.RU, main = "Gap statistic for the 'ruspini' data") +#' mtext("k = 4 is best .. and k = 5 pretty close") #' plot_clusgap(gs.pam.RU) -plot_clusgap = function(clusgap, title = "Gap Statistic results") { - gstab = data.frame(clusgap$Tab, k = 1:nrow(clusgap$Tab)) - p = ggplot(gstab, aes(k, gap)) + geom_line() + geom_point(size = 5) - p = p + geom_errorbar(aes(ymax = gap + SE.sim, ymin = gap - SE.sim)) - p = p + ggtitle(title) - return(p) +plot_clusgap = function(clusgap, title="Gap Statistic results"){ + gstab = data.frame(clusgap$Tab, k = 1:nrow(clusgap$Tab)) + p = ggplot(gstab, aes(k, gap)) + geom_line() + geom_point(size = 5) + p = p + geom_errorbar(aes(ymax = gap + SE.sim, ymin = gap - SE.sim)) + p = p + ggtitle(title) + return(p) } -################################################################################ +################################################################################ \ No newline at end of file diff --git a/R/sampleData-class.R b/R/sampleData-class.R index b010fcb1..80c49eb8 100644 --- a/R/sampleData-class.R +++ b/R/sampleData-class.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Build or access sample_data. #' #' This is the suggested method for both constructing and accessing a table @@ -40,29 +40,29 @@ #' @examples # #' data(soilrep) #' head(sample_data(soilrep)) -setGeneric("sample_data", function(object, errorIfNULL = TRUE) standardGeneric("sample_data")) +setGeneric("sample_data", function(object, errorIfNULL=TRUE) standardGeneric("sample_data")) #' @rdname sample_data-methods #' @aliases sample_data,ANY-method -setMethod("sample_data", "ANY", function(object, errorIfNULL = TRUE) { - access(object, "sam_data", errorIfNULL) +setMethod("sample_data", "ANY", function(object, errorIfNULL=TRUE){ + access(object, "sam_data", errorIfNULL) }) # constructor; for creating sample_data from a data.frame #' @rdname sample_data-methods #' @aliases sample_data,data.frame-method -setMethod("sample_data", "data.frame", function(object) { - # Make sure there are no phantom levels in categorical variables - object <- reconcile_categories(object) - - # instantiate first to check validity - SM <- new("sample_data", object) - - # Want dummy samples index names if missing - if (all(rownames(SM) == as.character(1:nrow(SM)))) { - rownames(SM) <- paste("sa", 1:nrow(SM), sep = "") - } - return(SM) +setMethod("sample_data", "data.frame", function(object){ + # Make sure there are no phantom levels in categorical variables + object <- reconcile_categories(object) + + # instantiate first to check validity + SM <- new("sample_data", object) + + # Want dummy samples index names if missing + if( all(rownames(SM) == as.character(1:nrow(SM))) ){ + rownames(SM) <- paste("sa", 1:nrow(SM), sep="") + } + return(SM) }) -################################################################################ +################################################################################ #' Cleans absent levels in sample_data/data.frame. #' #' This is used internally by the builder method, \code{\link{sample_data}}, to @@ -85,22 +85,22 @@ setMethod("sample_data", "data.frame", function(object) { #' # # # data(GlobalPatterns) #' # # # SM <- sample_data(GlobalPatterns) #' # # # DF <- data.frame(SM) -#' # # # DF <- data.frame(DF, col1=1:nrow(DF), col2=paste(1:nrow(DF), 't', sep='')) +#' # # # DF <- data.frame(DF, col1=1:nrow(DF), col2=paste(1:nrow(DF), "t", sep="")) #' # # # DF <- reconcile_categories(DF) #' # # # SM <- sample_data(reconcile_categories(SM)) #' # # # sapply(DF, class) #' # # # sapply(SM, class) -reconcile_categories <- function(DFSM) { - DF = as(DFSM, "data.frame") - # variable_classes <- sapply(DF, class) factor_cols <- - # names(variable_classes[variable_classes %in% c('factor', 'character')]) - factor_cols = which(sapply(DF, inherits, what = "factor")) - for (j in factor_cols) { - DF[, j] <- factor(as(DF[, j], "character")) - } - return(DF) +reconcile_categories <- function(DFSM){ + DF = as(DFSM, "data.frame") + #variable_classes <- sapply(DF, class) + #factor_cols <- names(variable_classes[variable_classes %in% c("factor", "character")]) + factor_cols = which(sapply(DF, inherits, what="factor")) + for( j in factor_cols){ + DF[, j] <- factor( as(DF[, j], "character") ) + } + return(DF) } -################################################################################ +################################################################################ #' Subset samples by sample_data expression #' #' This is a convenience wrapper around the \code{\link{subset}} function. @@ -136,20 +136,20 @@ reconcile_categories <- function(DFSM) { #' #' @examples #' # data(GlobalPatterns) -#' # subset_samples(GlobalPatterns, SampleType=='Ocean') -subset_samples <- function(physeq, ...) { - if (is.null(sample_data(physeq))) { - cat("Nothing subset. No sample_data in physeq.\n") - return(physeq) - } else { - oldDF <- as(sample_data(physeq), "data.frame") - newDF <- subset(oldDF, ...) - if (class(physeq) == "sample_data") { - return(sample_data(newDF)) - } else { - sample_data(physeq) <- sample_data(newDF) - return(physeq) - } - } +#' # subset_samples(GlobalPatterns, SampleType=="Ocean") +subset_samples <- function(physeq, ...){ + if( is.null(sample_data(physeq)) ){ + cat("Nothing subset. No sample_data in physeq.\n") + return(physeq) + } else { + oldDF <- as(sample_data(physeq), "data.frame") + newDF <- subset(oldDF, ...) + if( class(physeq) == "sample_data" ){ + return(sample_data(newDF)) + } else { + sample_data(physeq) <- sample_data(newDF) + return(physeq) + } + } } -################################################################################ +################################################################################ diff --git a/R/show-methods.R b/R/show-methods.R index 26ed6c59..d58bd9d7 100644 --- a/R/show-methods.R +++ b/R/show-methods.R @@ -1,31 +1,35 @@ -############################################################################ +############################################################################ #' @rdname show-methods -setMethod("show", "otu_table", function(object) { - # print otu_table (always there). - cat(paste("OTU Table: [", ntaxa(object), " taxa and ", nsamples(object), - " samples]", sep = ""), fill = TRUE) - if (taxa_are_rows(object)) { - cat(" taxa are rows", fill = TRUE) - } else { - cat(" taxa are columns", fill = TRUE) - } - show(as(object, "matrix")) +setMethod("show", "otu_table", function(object){ + # print otu_table (always there). + cat(paste("OTU Table: [", ntaxa(object), " taxa and ", + nsamples(object), " samples]", sep = ""), fill = TRUE) + if( taxa_are_rows(object) ){ + cat(" taxa are rows", fill=TRUE) + } else { + cat(" taxa are columns", fill=TRUE) + } + show(as(object, "matrix")) }) -############################################################################ +############################################################################ #' @rdname show-methods -setMethod("show", "sample_data", function(object) { - cat(paste("Sample Data: [", dim(sample_data(object))[1], " samples by ", - dim(sample_data(object))[2], " sample variables]:", sep = ""), fill = TRUE) - show(as(object, "data.frame")) +setMethod("show", "sample_data", function(object){ + cat(paste("Sample Data: [", dim(sample_data(object))[1], " samples by ", + dim(sample_data(object))[2], + " sample variables]:", sep = ""), + fill = TRUE) + show(as(object, "data.frame")) }) -############################################################################ +############################################################################ #' @rdname show-methods -setMethod("show", "taxonomyTable", function(object) { - cat(paste("Taxonomy Table: [", dim(object)[1], " taxa by ", dim(object)[2], - " taxonomic ranks]:", sep = ""), fill = TRUE) - show(as(object, "matrix")) +setMethod("show", "taxonomyTable", function(object){ + cat(paste("Taxonomy Table: [", dim(object)[1], " taxa by ", + dim(object)[2], + " taxonomic ranks]:", sep = ""), + fill = TRUE) + show(as(object, "matrix")) }) -############################################################################ +############################################################################ #' method extensions to show for phyloseq objects. #' #' See the general documentation of \code{\link[methods]{show}} method for @@ -40,38 +44,39 @@ setMethod("show", "taxonomyTable", function(object) { #' # data(GlobalPatterns) #' # show(GlobalPatterns) #' # GlobalPatterns -setMethod("show", "phyloseq", function(object) { - cat("phyloseq-class experiment-level object", fill = TRUE) - # print otu_table (always there). - cat(paste("otu_table() OTU Table: [ ", ntaxa(otu_table(object)), " taxa and ", - nsamples(otu_table(object)), " samples ]", sep = ""), fill = TRUE) - - # print Sample Data if there - if (!is.null(sample_data(object, FALSE))) { - cat(paste("sample_data() Sample Data: [ ", dim(sample_data(object))[1], - " samples by ", dim(sample_data(object))[2], " sample variables ]", sep = ""), - fill = TRUE) - } - - # print tax Tab if there - if (!is.null(tax_table(object, FALSE))) { - cat(paste("tax_table() Taxonomy Table: [ ", dim(tax_table(object))[1], - " taxa by ", dim(tax_table(object))[2], " taxonomic ranks ]", sep = ""), - fill = TRUE) - } - - # print tree if there - if (!is.null(phy_tree(object, FALSE))) { - cat(paste("phy_tree() Phylogenetic Tree: [ ", ntaxa(phy_tree(object)), - " tips and ", phy_tree(object)$Nnode, " internal nodes ]", sep = ""), - fill = TRUE) - } - - # print refseq summary if there - if (!is.null(refseq(object, FALSE))) { - cat(paste("refseq() ", class(refseq(object))[1], ": [ ", ntaxa(refseq(object)), - " reference sequences ]", sep = ""), fill = TRUE) - } - +setMethod("show", "phyloseq", function(object){ + cat("phyloseq-class experiment-level object", fill=TRUE) + # print otu_table (always there). + cat(paste("otu_table() OTU Table: [ ", ntaxa(otu_table(object)), " taxa and ", + nsamples(otu_table(object)), " samples ]", sep = ""), fill = TRUE) + + # print Sample Data if there + if(!is.null(sample_data(object, FALSE))){ + cat(paste("sample_data() Sample Data: [ ", dim(sample_data(object))[1], " samples by ", + dim(sample_data(object))[2], + " sample variables ]", sep = ""), fill = TRUE) + } + + # print tax Tab if there + if(!is.null(tax_table(object, FALSE))){ + cat(paste("tax_table() Taxonomy Table: [ ", dim(tax_table(object))[1], " taxa by ", + dim(tax_table(object))[2], + " taxonomic ranks ]", sep = ""), fill = TRUE) + } + + # print tree if there + if(!is.null(phy_tree(object, FALSE))){ + cat(paste("phy_tree() Phylogenetic Tree: [ ", ntaxa(phy_tree(object)), " tips and ", + phy_tree(object)$Nnode, + " internal nodes ]", sep = ""), + fill = TRUE + ) + } + + # print refseq summary if there + if(!is.null(refseq(object, FALSE))){ + cat(paste("refseq() ", class(refseq(object))[1], ": [ ", ntaxa(refseq(object)), " reference sequences ]", sep = ""), fill=TRUE) + } + }) -############################################################################ +############################################################################ diff --git a/R/taxonomyTable-class.R b/R/taxonomyTable-class.R index 19f0cf7f..ce77d01b 100644 --- a/R/taxonomyTable-class.R +++ b/R/taxonomyTable-class.R @@ -1,4 +1,4 @@ -################################################################################ +################################################################################ #' Build or access the taxonomyTable. #' #' This is the suggested method for both constructing and accessing a table of @@ -36,45 +36,48 @@ #' @export #' #' @examples # -#' # tax1 <- tax_table(matrix('abc', 30, 8)) +#' # tax1 <- tax_table(matrix("abc", 30, 8)) #' # data(GlobalPatterns) #' # tax_table(GlobalPatterns) -setGeneric("tax_table", function(object, errorIfNULL = TRUE) standardGeneric("tax_table")) +setGeneric("tax_table", function(object, errorIfNULL=TRUE) standardGeneric("tax_table")) #' @rdname tax_table-methods #' @aliases tax_table,ANY-method -setMethod("tax_table", "ANY", function(object, errorIfNULL = TRUE) { - access(object, "tax_table", errorIfNULL) +setMethod("tax_table", "ANY", function(object, errorIfNULL=TRUE){ + access(object, "tax_table", errorIfNULL) }) # Constructor; for creating taxonomyTable from a matrix. #' @rdname tax_table-methods #' @aliases tax_table,matrix-method -setMethod("tax_table", "matrix", function(object) { +setMethod("tax_table", "matrix", function(object){ # Want dummy species/taxa index names if missing - if (is.null(rownames(object))) { - rownames(object) <- paste("sp", 1:nrow(object), sep = "") + if(is.null(rownames(object))){ + rownames(object) <- paste("sp", 1:nrow(object), sep="") } - if (is.null(colnames(object))) { - colnames(object) <- paste("ta", 1:ncol(object), sep = "") - } - # instantiate as taxonomyTable - return(new("taxonomyTable", object)) + if(is.null(colnames(object))){ + colnames(object) <- paste("ta", 1:ncol(object), sep="") + } + # instantiate as taxonomyTable + return(new("taxonomyTable", object)) }) # Constructor; coerce to matrix, then pass on for creating taxonomyTable. #' @rdname tax_table-methods #' @aliases tax_table,data.frame-method -setMethod("tax_table", "data.frame", function(object) { - # Warn first +setMethod("tax_table", "data.frame", function(object){ + # Warn first text = "Coercing from data.frame class to character matrix \n" text = paste0(text, "prior to building taxonomyTable. \n") text = paste0(text, "This could introduce artifacts. \n") text = paste0(text, "Check your taxonomyTable, or coerce to matrix manually.") - warning(text) - # Coerce everything to a matrix, then char-vector, then back to matrix. - TT <- matrix(as(as(object, "matrix"), "character"), nrow = nrow(object), ncol = ncol(object)) - # Pass on to matrix-method. - tax_table(TT) + warning(text) + # Coerce everything to a matrix, then char-vector, then back to matrix. + TT <- matrix(as(as(object, "matrix"), "character"), + nrow=nrow(object), + ncol=ncol(object) + ) + # Pass on to matrix-method. + tax_table(TT) }) -################################################################################ +################################################################################ #' Subset species by taxonomic expression #' #' This is a convenience wrapper around the \code{\link{subset}} function. @@ -108,22 +111,22 @@ setMethod("tax_table", "data.frame", function(object) { #' @export #' #' @examples -#' ## ex3 <- subset_taxa(GlobalPatterns, Phylum=='Bacteroidetes') -subset_taxa <- function(physeq, ...) { - if (is.null(tax_table(physeq))) { - cat("Nothing subset. No taxonomyTable in physeq.\n") - return(physeq) - } else { - oldMA <- as(tax_table(physeq), "matrix") - oldDF <- data.frame(oldMA) - newDF <- subset(oldDF, ...) - newMA <- as(newDF, "matrix") - if (inherits(physeq, "taxonomyTable")) { - return(tax_table(newMA)) - } else { - tax_table(physeq) <- tax_table(newMA) - return(physeq) - } - } +#' ## ex3 <- subset_taxa(GlobalPatterns, Phylum=="Bacteroidetes") +subset_taxa <- function(physeq, ...){ + if( is.null(tax_table(physeq)) ){ + cat("Nothing subset. No taxonomyTable in physeq.\n") + return(physeq) + } else { + oldMA <- as(tax_table(physeq), "matrix") + oldDF <- data.frame(oldMA) + newDF <- subset(oldDF, ...) + newMA <- as(newDF, "matrix") + if( inherits(physeq, "taxonomyTable") ){ + return(tax_table(newMA)) + } else { + tax_table(physeq) <- tax_table(newMA) + return(physeq) + } + } } -################################################################################ +################################################################################ diff --git a/R/transform_filter-methods.R b/R/transform_filter-methods.R index b91a6757..9b26ce58 100644 --- a/R/transform_filter-methods.R +++ b/R/transform_filter-methods.R @@ -1,6 +1,9 @@ -################################################################################ Function to create subsampled dataset in which each sample has same number of -################################################################################ total observations/counts/reads Note that the subsampling is random, so some -################################################################################ noise is introduced making the relative abundances slightly different +################################################################################ +# Function to create subsampled dataset +# in which each sample has same number of total observations/counts/reads +# Note that the subsampling is random, so some noise is introduced making the +# relative abundances slightly different +################################################################################ #' Resample an OTU table such that all samples have the same library size. #' #' Please note that the authors of phyloseq do not advocate using this @@ -104,132 +107,137 @@ #' #' @examples #' # Test with esophagus dataset -#' data('esophagus') +#' data("esophagus") #' esorepT = rarefy_even_depth(esophagus, replace=TRUE) #' esorepF = rarefy_even_depth(esophagus, replace=FALSE) #' sample_sums(esophagus) #' sample_sums(esorepT) #' sample_sums(esorepF) #' ## NRun Manually: Too slow! -#' # data('GlobalPatterns') +#' # data("GlobalPatterns") #' # GPrepT = rarefy_even_depth(GlobalPatterns, 1E5, replace=TRUE) #' ## Actually just this one is slow #' # system.time(GPrepF <- rarefy_even_depth(GlobalPatterns, 1E5, replace=FALSE)) -rarefy_even_depth <- function(physeq, sample.size = min(sample_sums(physeq)), rngseed = FALSE, - replace = TRUE, trimOTUs = TRUE, verbose = TRUE) { - - if (as(rngseed, "logical")) { - # Now call the set.seed using the value expected in phyloseq - set.seed(rngseed) - if (verbose) { +rarefy_even_depth <- function(physeq, sample.size=min(sample_sums(physeq)), + rngseed=FALSE, replace=TRUE, trimOTUs=TRUE, verbose=TRUE){ + + if( as(rngseed, "logical") ){ + # Now call the set.seed using the value expected in phyloseq + set.seed(rngseed) + if(verbose){ # Print to screen this value message("`set.seed(", rngseed, ")` was used to initialize repeatable random subsampling.") message("Please record this for your records so others can reproduce.") - message("Try `set.seed(", rngseed, "); .Random.seed` for the full vector", - sep = "") - message("...") + message("Try `set.seed(", rngseed,"); .Random.seed` for the full vector", sep="") + message("...") } - } else if (verbose) { - message("You set `rngseed` to FALSE. Make sure you've set & recorded\n", - " the random seed of your session for reproducibility.\n", "See `?set.seed`\n") - message("...") - } - - # Make sure sample.size is of length 1. - if (length(sample.size) > 1) { - warning("`sample.size` had more than one value. ", "Using only the first. \n ... \n") - sample.size <- sample.size[1] - } - - if (sample.size <= 0) { - stop("sample.size less than or equal to zero. ", "Need positive sample size to work.") - } - - # Instead of warning, expected behavior now is to prune samples that have fewer - # reads than `sample.size` - if (min(sample_sums(physeq)) < sample.size) { - rmsamples = sample_names(physeq)[sample_sums(physeq) < sample.size] - if (verbose) { - message(length(rmsamples), " samples removed", "because they contained fewer reads than `sample.size`.") + } else if(verbose){ + message("You set `rngseed` to FALSE. Make sure you've set & recorded\n", + " the random seed of your session for reproducibility.\n", + "See `?set.seed`\n") + message("...") + } + + # Make sure sample.size is of length 1. + if( length(sample.size) > 1 ){ + warning("`sample.size` had more than one value. ", + "Using only the first. \n ... \n") + sample.size <- sample.size[1] + } + + if( sample.size <= 0 ){ + stop("sample.size less than or equal to zero. ", + "Need positive sample size to work.") + } + + # Instead of warning, expected behavior now is to prune samples + # that have fewer reads than `sample.size` + if( min(sample_sums(physeq)) < sample.size ){ + rmsamples = sample_names(physeq)[sample_sums(physeq) < sample.size] + if(verbose){ + message(length(rmsamples), " samples removed", + "because they contained fewer reads than `sample.size`.") message("Up to first five removed samples are: \n") - message(rmsamples[1:min(5, length(rmsamples))], sep = "\t") - message("...") + message(rmsamples[1:min(5, length(rmsamples))], sep="\t") + message("...") } - # Now done with notifying user of pruning, actually prune. - physeq = prune_samples(setdiff(sample_names(physeq), rmsamples), physeq) - } - # initialize the subsamples phyloseq instance, newsub - newsub <- physeq - # enforce orientation as species-are-rows, for assignment - if (!taxa_are_rows(newsub)) { - newsub <- t(newsub) - } - # apply through each sample, and replace - newotu <- apply(otu_table(newsub), 2, rarefaction_subsample, sample.size = sample.size, - replace = replace) - # Add OTU names to the row indices - rownames(newotu) <- taxa_names(physeq) - # replace the otu_table. - otu_table(newsub) <- otu_table(newotu, TRUE) - if (trimOTUs) { - # Check for and remove empty OTUs 1. Notify user of empty OTUs being cut. 2. Cut - # empty OTUs + # Now done with notifying user of pruning, actually prune. + physeq = prune_samples(setdiff(sample_names(physeq), rmsamples), physeq) + } + # initialize the subsamples phyloseq instance, newsub + newsub <- physeq + # enforce orientation as species-are-rows, for assignment + if(!taxa_are_rows(newsub)){newsub <- t(newsub)} + # apply through each sample, and replace + newotu <- apply(otu_table(newsub), 2, rarefaction_subsample, + sample.size=sample.size, replace=replace) + # Add OTU names to the row indices + rownames(newotu) <- taxa_names(physeq) + # replace the otu_table. + otu_table(newsub) <- otu_table(newotu, TRUE) + if(trimOTUs){ + # Check for and remove empty OTUs + # 1. Notify user of empty OTUs being cut. + # 2. Cut empty OTUs rmtaxa = taxa_names(newsub)[taxa_sums(newsub) <= 0] - if (length(rmtaxa) > 0) { - if (verbose) { - message(length(rmtaxa), "OTUs were removed because they are no longer \n", - "present in any sample after random subsampling\n") + if( length(rmtaxa) > 0 ){ + if(verbose){ + message(length(rmtaxa), "OTUs were removed because they are no longer \n", + "present in any sample after random subsampling\n") message("...") } newsub = prune_taxa(setdiff(taxa_names(newsub), rmtaxa), newsub) } } - # If the OTU table was transposed before rarefaction, transpose it back to the - # way it was in the original physeq object. - if (!taxa_are_rows(physeq)) { - newsub <- t(newsub) - } - return(newsub) + # If the OTU table was transposed before rarefaction, transpose it + # back to the way it was in the original physeq object. + if(!taxa_are_rows(physeq)){newsub <- t(newsub)} + return(newsub) } -################################################################################ rarefaction subsample function, one sample +################################################################################ +# rarefaction subsample function, one sample +################################################################################ #' @keywords internal -rarefaction_subsample <- function(x, sample.size, replace = FALSE) { - # This is a test x = sample(10, 10) x = 1:10 sample.size = 50 system.time(obsvec - # <- foreach(OTUi=1:length(x), times=x, .combine=c) %do% {rep(OTUi, times)}) - # data('GlobalPatterns') sample.size = - # sample_sums(GlobalPatterns)[which.min(sample_sums(GlobalPatterns))] x = - # get_taxa(GlobalPatterns, which.max(sample_sums(GlobalPatterns))) Create - # replacement species vector - rarvec <- numeric(length(x)) - # Perform the sub-sampling. Suppress warnings due to old R compat issue. Also, - # make sure to avoid errors from x summing to zero, and there are no observations - # to sample. The initialization of rarvec above is already sufficient. - if (sum(x) <= 0) { - # Protect against, and quickly return an empty vector, if x is already an empty - # count vector - return(rarvec) - } - if (replace) { - # resample with replacement - suppressWarnings(subsample <- sample(1:length(x), sample.size, replace = TRUE, - prob = x)) - } else { - # resample without replacement - obsvec <- apply(data.frame(OTUi = 1:length(x), times = x), 1, function(x) { - rep_len(x["OTUi"], x["times"]) - }) - obsvec <- unlist(obsvec, use.names = FALSE) - # use `sample` for subsampling. Hope that obsvec doesn't overflow. - suppressWarnings(subsample <- sample(obsvec, sample.size, replace = FALSE)) - } - # Tabulate the results (these are already named by the order in `x`) - sstab <- table(subsample) - # Assign the tabulated random subsample values to the species vector - rarvec[as(names(sstab), "integer")] <- sstab - # Return abundance vector. Let replacement happen elsewhere. - return(rarvec) +rarefaction_subsample <- function(x, sample.size, replace=FALSE){ + # This is a test + # x = sample(10, 10) + # x = 1:10 + # sample.size = 50 + #system.time(obsvec <- foreach(OTUi=1:length(x), times=x, .combine=c) %do% {rep(OTUi, times)}) + # data("GlobalPatterns") + # sample.size = sample_sums(GlobalPatterns)[which.min(sample_sums(GlobalPatterns))] + # x = get_taxa(GlobalPatterns, which.max(sample_sums(GlobalPatterns))) + # Create replacement species vector + rarvec <- numeric(length(x)) + # Perform the sub-sampling. Suppress warnings due to old R compat issue. + # Also, make sure to avoid errors from x summing to zero, + # and there are no observations to sample. + # The initialization of rarvec above is already sufficient. + if(sum(x) <= 0){ + # Protect against, and quickly return an empty vector, + # if x is already an empty count vector + return(rarvec) + } + if(replace){ + # resample with replacement + suppressWarnings(subsample <- sample(1:length(x), sample.size, replace=TRUE, prob=x)) + } else { + # resample without replacement + obsvec <- apply(data.frame(OTUi=1:length(x), times=x), 1, function(x){ + rep_len(x["OTUi"], x["times"]) + }) + obsvec <- unlist(obsvec, use.names=FALSE) + # use `sample` for subsampling. Hope that obsvec doesn't overflow. + suppressWarnings(subsample <- sample(obsvec, sample.size, replace=FALSE)) + } + # Tabulate the results (these are already named by the order in `x`) + sstab <- table(subsample) + # Assign the tabulated random subsample values to the species vector + rarvec[as(names(sstab), "integer")] <- sstab + # Return abundance vector. Let replacement happen elsewhere. + return(rarvec) } -################################################################################ +################################################################################ #' Agglomerate closely-related taxa using single-linkage clustering. #' #' All tips of the tree separated by a cophenetic distance smaller than @@ -288,24 +296,23 @@ rarefaction_subsample <- function(x, sample.size, replace = FALSE) { #' @export #' #' @examples -#' data('esophagus') +#' data("esophagus") #' # for speed #' esophagus = prune_taxa(taxa_names(esophagus)[1:25], esophagus) -#' plot_tree(esophagus, label.tips='taxa_names', size='abundance', title='Before tip_glom()') -#' plot_tree(tip_glom(esophagus, h=0.2), label.tips='taxa_names', size='abundance', title='After tip_glom()') -tip_glom = function(physeq, h = 0.2, hcfun = agnes, ...) { +#' plot_tree(esophagus, label.tips="taxa_names", size="abundance", title="Before tip_glom()") +#' plot_tree(tip_glom(esophagus, h=0.2), label.tips="taxa_names", size="abundance", title="After tip_glom()") +tip_glom = function(physeq, h=0.2, hcfun=agnes, ...){ dd = as.dist(cophenetic.phylo(phy_tree(physeq))) - psclust = cutree(as.hclust(hcfun(dd, ...)), h = h) - cliques = levels(factor(psclust))[tapply(psclust, factor(psclust), function(x) { - length(x) > 1 - })] + psclust = cutree(as.hclust(hcfun(dd, ...)), h=h) + cliques = levels(factor(psclust))[tapply(psclust, factor(psclust), function(x){length(x)>1})] # For each clique, merge taxa in it... - for (i in cliques) { - physeq = merge_taxa(physeq, eqtaxa = names(psclust)[psclust == i]) + for( i in cliques){ + physeq = merge_taxa(physeq, eqtaxa=names(psclust)[psclust == i]) } return(physeq) } -################################################################################ +################################################################################ +################################################################################ #' Agglomerate taxa of the same type. #' #' This method merges species that have the same taxonomy at a certain @@ -318,7 +325,7 @@ tip_glom = function(physeq, h = 0.2, hcfun = agnes, ...) { #' for agglomeration will be replaced with \code{NA}, #' because they should be meaningless following agglomeration. #' -#' @usage tax_glom(physeq, taxrank=rank_names(physeq)[1], NArm=TRUE, bad_empty=c(NA, '', ' ', '\t')) +#' @usage tax_glom(physeq, taxrank=rank_names(physeq)[1], NArm=TRUE, bad_empty=c(NA, "", " ", "\t")) #' #' @param physeq (Required). \code{\link{phyloseq-class}} or \code{\link{otu_table}}. #' @@ -340,7 +347,7 @@ tip_glom = function(physeq, h = 0.2, hcfun = agnes, ...) { #' analysis, think about also trying the nomenclature-agnostic \code{\link{tip_glom}} #' method if you have a phylogenetic tree available. #' -#' @param bad_empty (Optional). Character vector. Default: \code{c(NA, '', ' ', '\t')}. +#' @param bad_empty (Optional). Character vector. Default: \code{c(NA, "", " ", "\t")}. #' Defines the bad/empty values #' that should be ignored and/or considered unknown. They will be removed #' from the internal agglomeration vector derived from the argument to \code{tax}, @@ -365,65 +372,64 @@ tip_glom = function(physeq, h = 0.2, hcfun = agnes, ...) { #' # ## print the available taxonomic ranks #' # colnames(tax_table(GlobalPatterns)) #' # ## agglomerate at the Family taxonomic rank -#' # (x1 <- tax_glom(GlobalPatterns, taxrank='Family') ) +#' # (x1 <- tax_glom(GlobalPatterns, taxrank="Family") ) #' # ## How many taxa before/after agglomeration? #' # ntaxa(GlobalPatterns); ntaxa(x1) #' # ## Look at enterotype dataset... #' # data(enterotype) #' # ## print the available taxonomic ranks. Shows only 1 rank available, not useful for tax_glom #' # colnames(tax_table(enterotype)) -tax_glom <- function(physeq, taxrank = rank_names(physeq)[1], NArm = TRUE, bad_empty = c(NA, - "", " ", "\t")) { - - # Error if tax_table slot is empty - if (is.null(access(physeq, "tax_table"))) { - stop("The tax_glom() function requires that physeq contain a taxonomyTable") - } - - # Error if bad taxrank - if (!taxrank[1] %in% rank_names(physeq)) { - stop("Bad taxrank argument. Must be among the values of rank_names(physeq)") - } - - # Make a vector from the taxonomic data. - CN <- which(rank_names(physeq) %in% taxrank[1]) - tax <- as(access(physeq, "tax_table"), "matrix")[, CN] - - # if NArm is TRUE, remove the empty, white-space, NA values from - if (NArm) { - keep_species <- names(tax)[!(tax %in% bad_empty)] - physeq <- prune_taxa(keep_species, physeq) - } - - # Concatenate data up to the taxrank column, use this for agglomeration - tax <- as(access(physeq, "tax_table"), "matrix")[, 1:CN, drop = FALSE] - tax <- apply(tax, 1, function(i) { - paste(i, sep = ";_;", collapse = ";_;") - }) - - # Remove NAs and useless from the vector/factor for looping. This does not - # remove the taxa that have an unknown (NA) taxonomic designation at this - # particular taxonomic rank. - tax <- tax[!(tax %in% bad_empty)] - - # Define the OTU cliques to loop through - spCliques <- tapply(names(tax), factor(tax), list) - - # Successively merge taxa in physeq. - for (i in names(spCliques)) { - physeq <- merge_taxa(physeq, spCliques[[i]]) - } - - # 'Empty' the values to the right of the rank, using NA_character_. - if (CN < length(rank_names(physeq))) { - badcolumns <- (CN + 1):length(rank_names(physeq)) - tax_table(physeq)[, badcolumns] <- NA_character_ - } - - # Return. - return(physeq) +tax_glom <- function(physeq, taxrank=rank_names(physeq)[1], + NArm=TRUE, bad_empty=c(NA, "", " ", "\t")){ + + # Error if tax_table slot is empty + if( is.null(access(physeq, "tax_table")) ){ + stop("The tax_glom() function requires that physeq contain a taxonomyTable") + } + + # Error if bad taxrank + if( !taxrank[1] %in% rank_names(physeq) ){ + stop("Bad taxrank argument. Must be among the values of rank_names(physeq)") + } + + # Make a vector from the taxonomic data. + CN <- which( rank_names(physeq) %in% taxrank[1] ) + tax <- as(access(physeq, "tax_table"), "matrix")[, CN] + + # if NArm is TRUE, remove the empty, white-space, NA values from + if( NArm ){ + keep_species <- names(tax)[ !(tax %in% bad_empty) ] + physeq <- prune_taxa(keep_species, physeq) + } + + # Concatenate data up to the taxrank column, use this for agglomeration + tax <- as(access(physeq, "tax_table"), "matrix")[, 1:CN, drop=FALSE] + tax <- apply(tax, 1, function(i){paste(i, sep=";_;", collapse=";_;")}) + + # Remove NAs and useless from the vector/factor for looping. + # This does not remove the taxa that have an unknown (NA) + # taxonomic designation at this particular taxonomic rank. + tax <- tax[ !(tax %in% bad_empty) ] + + # Define the OTU cliques to loop through + spCliques <- tapply(names(tax), factor(tax), list) + + # Successively merge taxa in physeq. + for( i in names(spCliques)){ + physeq <- merge_taxa(physeq, spCliques[[i]]) + } + + # "Empty" the values to the right of the rank, using NA_character_. + if( CN < length(rank_names(physeq)) ){ + badcolumns <- (CN+1):length(rank_names(physeq)) + tax_table(physeq)[, badcolumns] <- NA_character_ + } + + # Return. + return(physeq) } -################################################################################ +################################################################################ +################################################################################ #' Prune unwanted OTUs / taxa from a phylogenetic object. #' #' An S4 Generic method for removing (pruning) unwanted OTUs/taxa from phylogenetic @@ -458,113 +464,115 @@ tax_glom <- function(physeq, taxrank = rank_names(physeq)[1], NArm = TRUE, bad_e #' @rdname prune_taxa-methods #' @export #' @examples -#' data('esophagus') +#' data("esophagus") #' esophagus -#' plot(sort(taxa_sums(esophagus), TRUE), type='h', ylim=c(0, 50)) +#' plot(sort(taxa_sums(esophagus), TRUE), type="h", ylim=c(0, 50)) #' x1 = prune_taxa(taxa_sums(esophagus) > 10, esophagus) #' x2 = prune_taxa(names(sort(taxa_sums(esophagus), TRUE))[1:9], esophagus) #' identical(x1, x2) setGeneric("prune_taxa", function(taxa, x) standardGeneric("prune_taxa")) #' @aliases prune_taxa,NULL,ANY-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("NULL", "ANY"), function(taxa, x) { - return(x) +setMethod("prune_taxa", signature("NULL", "ANY"), function(taxa, x){ + return(x) }) -# Any prune_taxa call w/ signature starting with a logical converts the logical -# to a character vector, and then dispatches to more specific method. +# Any prune_taxa call w/ signature starting with a logical +# converts the logical to a character vector, and then dispatches +# to more specific method. #' @aliases prune_taxa,logical,ANY-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("logical", "ANY"), function(taxa, x) { - # Check that logical has same length as ntaxa, stop if not. - if (!identical(length(taxa), ntaxa(x))) { - stop("logical argument to taxa is wrong length. Should equal ntaxa(x)") - } else { - # Pass on to names-based prune_taxa method - return(prune_taxa(taxa_names(x)[taxa], x)) - } +setMethod("prune_taxa", signature("logical", "ANY"), function(taxa, x){ + # Check that logical has same length as ntaxa, stop if not. + if( !identical(length(taxa), ntaxa(x)) ){ + stop("logical argument to taxa is wrong length. Should equal ntaxa(x)") + } else { + # Pass on to names-based prune_taxa method + return( prune_taxa(taxa_names(x)[taxa], x) ) + } }) #' @importFrom ape drop.tip #' @aliases prune_taxa,character,phylo-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "phylo"), function(taxa, x) { - if (length(taxa) <= 1) { - # Can't have a tree with 1 or fewer tips - warning("prune_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.") - return(NULL) - } else if (setequal(taxa, taxa_names(x))) { - return(x) - } else { - return(drop.tip(x, setdiff(taxa_names(x), taxa))) - } +setMethod("prune_taxa", signature("character", "phylo"), function(taxa, x){ + if( length(taxa) <= 1 ){ + # Can't have a tree with 1 or fewer tips + warning("prune_taxa attempted to reduce tree to 1 or fewer tips.\n tree replaced with NULL.") + return(NULL) + } else if( setequal(taxa, taxa_names(x)) ){ + return(x) + } else { + return( drop.tip(x, setdiff(taxa_names(x), taxa)) ) + } }) #' @aliases prune_taxa,character,otu_table-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "otu_table"), function(taxa, x) { - if (setequal(taxa, taxa_names(x))) { - return(x) - } else { - taxa = intersect(taxa, taxa_names(x)) - if (taxa_are_rows(x)) { - return(x[taxa, , drop = FALSE]) - } else { - return(x[, taxa, drop = FALSE]) - } - } +setMethod("prune_taxa", signature("character", "otu_table"), function(taxa, x){ + if( setequal(taxa, taxa_names(x)) ){ + return(x) + } else { + taxa = intersect( taxa, taxa_names(x) ) + if( taxa_are_rows(x) ){ + return(x[taxa, , drop=FALSE]) + } else { + return(x[, taxa, drop=FALSE]) + } + } }) #' @aliases prune_taxa,character,sample_data-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "sample_data"), function(taxa, x) { - return(x) +setMethod("prune_taxa", signature("character", "sample_data"), function(taxa, x){ + return(x) }) #' @aliases prune_taxa,character,phyloseq-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "phyloseq"), function(taxa, x) { - # Re-define `taxa` as the intersection of OTU names for each component AND `taxa` - taxa = intersect(intersect_taxa(x), taxa) - # Now prune them all. All phyloseq objects have an otu_table slot, no need to - # test for existence. - x@otu_table = prune_taxa(taxa, otu_table(x)) - # Test if slot is present. If so, perform the component prune. - if (!is.null(x@tax_table)) { - x@tax_table = prune_taxa(taxa, tax_table(x)) - } - if (!is.null(x@phy_tree)) { - x@phy_tree = prune_taxa(taxa, phy_tree(x)) - } - if (!is.null(x@refseq)) { - x@refseq = prune_taxa(taxa, refseq(x)) - } - # Force index order after pruning to be the same, according to the same rules as - # in the constructor, phyloseq() - x = index_reorder(x, index_type = "taxa") - return(x) +setMethod("prune_taxa", signature("character", "phyloseq"), function(taxa, x){ + # Re-define `taxa` as the intersection of OTU names for each component AND `taxa` + taxa = intersect(intersect_taxa(x), taxa) + # Now prune them all. + # All phyloseq objects have an otu_table slot, no need to test for existence. + x@otu_table = prune_taxa(taxa, otu_table(x)) + # Test if slot is present. If so, perform the component prune. + if( !is.null(x@tax_table) ){ + x@tax_table = prune_taxa(taxa, tax_table(x)) + } + if( !is.null(x@phy_tree) ){ + x@phy_tree = prune_taxa(taxa, phy_tree(x)) + } + if( !is.null(x@refseq) ){ + x@refseq = prune_taxa(taxa, refseq(x)) + } + # Force index order after pruning to be the same, + # according to the same rules as in the constructor, phyloseq() + x = index_reorder(x, index_type="taxa") + return(x) }) #' @aliases prune_taxa,character,taxonomyTable-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "taxonomyTable"), function(taxa, x) { - if (setequal(taxa, taxa_names(x))) { - return(x) - } else { - taxa = intersect(taxa, taxa_names(x)) - return(x[taxa, , drop = FALSE]) - } +setMethod("prune_taxa", signature("character", "taxonomyTable"), function(taxa, x){ + if( setequal(taxa, taxa_names(x)) ){ + return(x) + } else { + taxa = intersect( taxa, taxa_names(x) ) + return( x[taxa, , drop=FALSE] ) + } }) #' @importClassesFrom Biostrings XStringSet #' @aliases prune_taxa,character,XStringSet-method #' @rdname prune_taxa-methods -setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x) { - if (setequal(taxa, taxa_names(x))) { - # Nothing to do, return x as-is. - return(x) - } else if (length(intersect(taxa, taxa_names(x))) == 0) { - # Informative error if intersection is zero. - stop("prune_taxa,XStringSet: taxa and taxa_names(x) do not overlap.") - } else { - # Pop the OTUs that are not in `taxa`, without reordering. - return(x[-which(!taxa_names(x) %in% taxa)]) - } +setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x){ + if( setequal(taxa, taxa_names(x)) ){ + # Nothing to do, return x as-is. + return(x) + } else if( length(intersect(taxa, taxa_names(x))) == 0 ){ + # Informative error if intersection is zero. + stop("prune_taxa,XStringSet: taxa and taxa_names(x) do not overlap.") + } else { + # Pop the OTUs that are not in `taxa`, without reordering. + return(x[-which(!taxa_names(x) %in% taxa)]) + } }) -################################################################################ +################################################################################ +################################################################################ #' Define a subset of samples to keep in a phyloseq object. #' #' An S4 Generic method for pruning/filtering unwanted samples @@ -591,72 +599,67 @@ setMethod("prune_taxa", signature("character", "XStringSet"), function(taxa, x) #' @examples #' data(GlobalPatterns) #' # Subset to just the Chlamydiae phylum. -#' GP.chl <- subset_taxa(GlobalPatterns, Phylum=='Chlamydiae') +#' GP.chl <- subset_taxa(GlobalPatterns, Phylum=="Chlamydiae") #' # Remove the samples that have less than 20 total reads from Chlamydiae #' GP.chl <- prune_samples(sample_sums(GP.chl)>=20, GP.chl) -#' # (p <- plot_tree(GP.chl, color='SampleType', shape='Family', label.tips='Genus', size='abundance')) +#' # (p <- plot_tree(GP.chl, color="SampleType", shape="Family", label.tips="Genus", size="abundance")) setGeneric("prune_samples", function(samples, x) standardGeneric("prune_samples")) #' @aliases prune_samples,character,otu_table-method #' @rdname prune_samples-methods -setMethod("prune_samples", signature("character", "otu_table"), function(samples, - x) { - if (setequal(samples, sample_names(x))) { - # If the sets of `samples` and sample_names are the same, return as-is. - return(x) - } else { - samples = intersect(samples, sample_names(x)) - if (taxa_are_rows(x)) { - return(x[, samples]) - } else { - return(x[samples, ]) - } - } +setMethod("prune_samples", signature("character", "otu_table"), function(samples, x){ + if( setequal(samples, sample_names(x)) ){ + # If the sets of `samples` and sample_names are the same, return as-is. + return(x) + } else { + samples = intersect(samples, sample_names(x)) + if( taxa_are_rows(x) ){ + return( x[, samples] ) + } else { + return( x[samples, ] ) + } + } }) #' @aliases prune_samples,character,sample_data-method #' @rdname prune_samples-methods -setMethod("prune_samples", signature("character", "sample_data"), function(samples, - x) { - if (setequal(samples, sample_names(x))) { - # If the sets of `samples` and sample_names are the same, return as-is. - return(x) - } else { - samples = intersect(samples, sample_names(x)) - return(x[samples, ]) - } +setMethod("prune_samples", signature("character", "sample_data"), function(samples, x){ + if( setequal(samples, sample_names(x)) ){ + # If the sets of `samples` and sample_names are the same, return as-is. + return(x) + } else { + samples = intersect(samples, sample_names(x)) + return(x[samples, ]) + } }) #' @aliases prune_samples,character,phyloseq-method #' @rdname prune_samples-methods -setMethod("prune_samples", signature("character", "phyloseq"), function(samples, - x) { - # Re-define `samples` as the intersection of samples names for each component AND - # `samples` - samples = intersect(intersect_samples(x), samples) - # Now prune each component. All phyloseq objects have an otu_table slot, no need - # to test for existence. - x@otu_table = prune_samples(samples, otu_table(x)) - if (!is.null(x@sam_data)) { - # protect missing sample_data component. Don't need to prune if empty - x@sam_data = prune_samples(samples, sample_data(x)) - } - # Force sample index order after pruning to be the same, according to the same - # rules as in the constructor, phyloseq() - x = index_reorder(x, index_type = "samples") - return(x) +setMethod("prune_samples", signature("character", "phyloseq"), function(samples, x){ + # Re-define `samples` as the intersection of samples names for each component AND `samples` + samples = intersect(intersect_samples(x), samples) + # Now prune each component. + # All phyloseq objects have an otu_table slot, no need to test for existence. + x@otu_table = prune_samples(samples, otu_table(x)) + if( !is.null(x@sam_data) ){ + # protect missing sample_data component. Don't need to prune if empty + x@sam_data = prune_samples(samples, sample_data(x)) + } + # Force sample index order after pruning to be the same, + # according to the same rules as in the constructor, phyloseq() + x = index_reorder(x, index_type="samples") + return(x) }) -# A logical should specify the samples to keep, or not. Have same length as -# nsamples(x) +# A logical should specify the samples to keep, or not. Have same length as nsamples(x) #' @aliases prune_samples,logical,ANY-method #' @rdname prune_samples-methods -setMethod("prune_samples", signature("logical", "ANY"), function(samples, x) { - # Check that logical has same length as nsamples, stop if not. - if (!identical(length(samples), nsamples(x))) { - stop("logical argument to samples is wrong length. Should equal nsamples(x)") - } else { - # Pass on to names-based prune_samples method - return(prune_samples(sample_names(x)[samples], x)) - } +setMethod("prune_samples", signature("logical", "ANY"), function(samples, x){ + # Check that logical has same length as nsamples, stop if not. + if( !identical(length(samples), nsamples(x)) ){ + stop("logical argument to samples is wrong length. Should equal nsamples(x)") + } else { + # Pass on to names-based prune_samples method + return( prune_samples(sample_names(x)[samples], x) ) + } }) -################################################################################ +################################################################################ #' Thresholded rank transformation. #' #' The lowest \code{thresh} values in \code{x} all get the value 'thresh'. @@ -686,19 +689,15 @@ setMethod("prune_samples", signature("logical", "ANY"), function(samples, x) { #' identical(x1, x2) #' (x3 <- otu_table(apply(otu_table(GP), 2, threshrank, thresh=500), taxa_are_rows(GP)) ) #' identical(x1, x3) -threshrank <- function(x, thresh, keep0s = FALSE, ...) { - if (keep0s) { - index0 <- which(x == 0) - } - x <- rank(x, ...) - thresh <- thresh[1] - x[x < thresh] <- thresh - if (keep0s) { - x[index0] <- 0 - } - return(x) +threshrank <- function(x, thresh, keep0s=FALSE, ...){ + if( keep0s ){ index0 <- which(x == 0) } + x <- rank(x, ...) + thresh <- thresh[1] + x[x= A - }, A) +setMethod("genefilter_sample", signature("matrix"), function(X, flist, A=1){ + TFmat = apply(X, 2, flist) + apply(TFmat, 1, function(x, A){sum(x) >= A}, A) }) #' @rdname genefilter_sample-methods #' @aliases genefilter_sample,otu_table-method -setMethod("genefilter_sample", signature("otu_table"), function(X, flist, A = 1) { - if (taxa_are_rows(X)) { - genefilter_sample(as(X, "matrix"), flist, A) - } else { - genefilter_sample(t(as(X, "matrix")), flist, A) - } +setMethod("genefilter_sample", signature("otu_table"), function(X, flist, A=1){ + if( taxa_are_rows(X) ){ + genefilter_sample( as(X, "matrix"), flist, A) + } else { + genefilter_sample( t(as(X, "matrix")), flist, A) + } }) #' @rdname genefilter_sample-methods #' @aliases genefilter_sample,phyloseq-method -setMethod("genefilter_sample", signature("phyloseq"), function(X, flist, A = 1) { - genefilter_sample(otu_table(X), flist, A) +setMethod("genefilter_sample", signature("phyloseq"), function(X, flist, A=1){ + genefilter_sample(otu_table(X), flist, A) }) -################################################################################ +################################################################################ #' A sample-wise filter function builder #' analogous to \code{\link[genefilter]{filterfun}}. #' @@ -950,25 +947,23 @@ setMethod("genefilter_sample", signature("phyloseq"), function(X, flist, A = 1) #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -filterfun_sample = function(...) { - flist <- list(...) - if (length(flist) == 1 && is.list(flist[[1]])) { - flist <- flist[[1]] - } - f = function(x) { - # initialize fval (a logical vector) - fun = flist[[1]] - fval = fun(x) - # check the remaining functions. Compare & logic, element-wise, each loop. - for (fun in flist[-1]) { - fval = fval & fun(x) - } - return(fval) - } - class(f) <- "filterfun" - return(f) +filterfun_sample = function(...){ + flist <- list(...) + if( length(flist) == 1 && is.list(flist[[1]])) { flist <- flist[[1]] } + f = function(x){ + # initialize fval (a logical vector) + fun = flist[[1]] + fval = fun(x) + # check the remaining functions. Compare & logic, element-wise, each loop. + for(fun in flist[-1]){ + fval = fval & fun(x) + } + return(fval) + } + class(f) <- "filterfun" + return(f) } -################################################################################ +################################################################################ #' Filter taxa based on across-sample OTU abundance criteria #' #' This function is directly analogous to the @@ -980,7 +975,7 @@ filterfun_sample = function(...) { #' It takes as input a phyloseq object, #' and returns a logical vector #' indicating whether or not each OTU passed the criteria. -#' Alternatively, if the \code{'prune'} option is set to \code{FALSE}, +#' Alternatively, if the \code{"prune"} option is set to \code{FALSE}, #' it returns the already-trimmed version of the phyloseq object. #' #' @usage filter_taxa(physeq, flist, prune=FALSE) @@ -1008,37 +1003,37 @@ filterfun_sample = function(...) { #' \code{\link{filterfun_sample}} #' #' @examples -#' data('enterotype') -#' require('genefilter') +#' data("enterotype") +#' require("genefilter") #' flist <- filterfun(kOverA(5, 2e-05)) #' ent.logi <- filter_taxa(enterotype, flist) #' ent.trim <- filter_taxa(enterotype, flist, TRUE) #' identical(ent.trim, prune_taxa(ent.logi, enterotype)) #' identical(sum(ent.logi), ntaxa(ent.trim)) #' filter_taxa(enterotype, flist, TRUE) -filter_taxa <- function(physeq, flist, prune = FALSE) { - # access OTU table - OTU <- access(physeq, "otu_table", TRUE) - # Enforce orientation (we are filtering taxa, not samples) - if (!taxa_are_rows(OTU)) { - OTU <- t(OTU) - } - # Coerce to vanilla matrix - OTU <- as(OTU, "matrix") - # Apply filtering function(s), get logical of length ntaxa(physeq) - ans <- apply(OTU, 1, flist) - # sanity check - if (ntaxa(physeq) != length(ans)) { - stop("Logic error in applying function(s). Logical result not same length as ntaxa(physeq)") - } - # Now return logical or pruned phyloseq-class instance. - if (prune) { - return(prune_taxa(ans, physeq)) - } else { - return(ans) - } +filter_taxa <- function(physeq, flist, prune=FALSE){ + # access OTU table + OTU <- access(physeq, "otu_table", TRUE) + # Enforce orientation (we are filtering taxa, not samples) + if(!taxa_are_rows(OTU)) { + OTU <- t(OTU) + } + # Coerce to vanilla matrix + OTU <- as(OTU, "matrix") + # Apply filtering function(s), get logical of length ntaxa(physeq) + ans <- apply(OTU, 1, flist) + # sanity check + if( ntaxa(physeq) != length(ans) ){ + stop("Logic error in applying function(s). Logical result not same length as ntaxa(physeq)") + } + # Now return logical or pruned phyloseq-class instance. + if( prune ){ + return( prune_taxa(ans, physeq) ) + } else { + return( ans ) + } } -################################################################################ +################################################################################ #' Make filter fun. the most abundant \code{k} taxa #' #' @usage topk(k, na.rm=TRUE) @@ -1064,15 +1059,13 @@ filter_taxa <- function(physeq, flist, prune = FALSE) { #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -topk = function(k, na.rm = TRUE) { - function(x) { - if (na.rm) { - x = x[!is.na(x)] +topk = function(k, na.rm=TRUE){ + function(x){ + if(na.rm){x = x[!is.na(x)]} + x >= sort(x, decreasing=TRUE)[k] } - x >= sort(x, decreasing = TRUE)[k] - } } -############################################################ +############################################################ #' Make filter fun. that returns the most abundant \code{p} fraction of taxa #' #' @usage topp(p, na.rm=TRUE) @@ -1100,15 +1093,13 @@ topk = function(k, na.rm = TRUE) { #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -topp <- function(p, na.rm = TRUE) { - function(x) { - if (na.rm) { - x = x[!is.na(x)] +topp <- function(p, na.rm=TRUE){ + function(x){ + if(na.rm){x = x[!is.na(x)]} + x >= sort(x, decreasing=TRUE)[ceiling(length(x)*p)] } - x >= sort(x, decreasing = TRUE)[ceiling(length(x) * p)] - } } -################################################################################ +################################################################################ #' Make filter fun. that returns the top f fraction of taxa in a sample. #' #' As opposed to \code{\link{topp}}, which gives the @@ -1131,7 +1122,7 @@ topp <- function(p, na.rm = TRUE) { #' @export #' #' @examples -#' t1 <- 1:10; names(t1)<-paste('t', 1:10, sep='') +#' t1 <- 1:10; names(t1)<-paste("t", 1:10, sep="") #' topf(0.6)(t1) #' ## Use simulated abundance matrix #' set.seed(711) @@ -1141,17 +1132,17 @@ topp <- function(p, na.rm = TRUE) { #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -topf <- function(f, na.rm = TRUE) { - function(x) { - if (na.rm) { - x = x[!is.na(x)] +topf <- function(f, na.rm=TRUE){ + function(x){ + if (na.rm){ + x = x[!is.na(x)] + } + y <- sort(x, decreasing = TRUE) + y <- cumsum(y)/sum(x) + return( (y <= f)[names(x)] ) } - y <- sort(x, decreasing = TRUE) - y <- cumsum(y)/sum(x) - return((y <= f)[names(x)]) - } } -################################################################################ +################################################################################ #' Set to FALSE any outlier species greater than f fractional abundance. #' #' This is for removing overly-abundant outlier taxa, not for trimming low-abundance @@ -1172,7 +1163,7 @@ topf <- function(f, na.rm = TRUE) { #' #' @export #' @examples -#' t1 <- 1:10; names(t1)<-paste('t', 1:10, sep='') +#' t1 <- 1:10; names(t1)<-paste("t", 1:10, sep="") #' rm_outlierf(0.15)(t1) #' ## Use simulated abundance matrix #' set.seed(711) @@ -1183,13 +1174,13 @@ topf <- function(f, na.rm = TRUE) { #' wh2 <- c(TRUE, TRUE, TRUE, FALSE, FALSE) #' prune_taxa(wh1, testOTU) #' prune_taxa(wh2, testOTU) -rm_outlierf <- function(f, na.rm = TRUE) { - function(x) { - if (na.rm) { - x = x[!is.na(x)] +rm_outlierf <- function(f, na.rm=TRUE){ + function(x){ + if(na.rm){ + x = x[!is.na(x)] + } + y <- x / sum(x) + return( y < f ) } - y <- x/sum(x) - return(y < f) - } } -################################################################################ +################################################################################ diff --git a/R/validity-methods.R b/R/validity-methods.R index 26dbb12b..a29bdc90 100644 --- a/R/validity-methods.R +++ b/R/validity-methods.R @@ -1,89 +1,117 @@ -################################################################################ Validity methods: These are delicate, because they are effectively at the S4 -################################################################################ infrastructure level, in between 'new' and the constructor. Some of the issues -################################################################################ that might otherwise go here for a check are handled by the constructors. In -################################################################################ many cases it desirable to let the constructor handle this, because it allows -################################################################################ greater flexibility and transparency. These tests should be limited to -################################################################################ conditions that are not fixed automatically by the constructors, and/or could -################################################################################ not be because the deficiency/error is too fundamental. By design, we expect -################################################################################ the validity errors to cause a fault before (nearly) any action by the -################################################################################ constructor. This is a special case where the accessors are not-used, in favor -################################################################################ of the S4 @tags. E.g. object@otu_table instead of otu_table(object). This is to -################################################################################ avoid any complications with the accessors interacting with objects early on. -################################################################################ Perhaps this is a mistake, but its a very limited case and won't be difficult -################################################################################ to change. Also, for now these are not documented at all at the user-level, -################################################################################ and are not expected to ever be at the 'user-level', so formal documentation -################################################################################ probably unnecessary. Lots of comments throughout this code will need to -################################################################################ compensate. otu_table: # # * all values must be numeric -################################################################################ (otu_table()-constructor should probably round values by default)) # # * all -################################################################################ values must be >= 0 (no negative abundances) -validotu_table <- function(object) { - # Both dimensions must have non-zero length. - if (any(dim(object) == 0)) { - return("\n OTU abundance data must have non-zero dimensions.") - } - # Verify that it is numeric matrix - if (!is.numeric(object@.Data[, 1])) { - text = "\n Non-numeric matrix provided as OTU table.\n" +################################################################################ +# Validity methods: +# +# These are delicate, because they are effectively at the S4 infrastructure +# level, in between "new" and the constructor. Some of the issues that might +# otherwise go here for a check are handled by the constructors. In many +# cases it desirable to let the constructor handle this, because it allows +# greater flexibility and transparency. These tests should be limited to +# conditions that are not fixed automatically by the constructors, and/or +# could not be because the deficiency/error is too fundamental. By design, +# we expect the validity errors to cause a fault before (nearly) any action +# by the constructor. +# +# This is a special case where the accessors are not-used, in favor of the +# S4 @tags. E.g. object@otu_table instead of otu_table(object). This is to avoid +# any complications with the accessors interacting with objects early on. +# Perhaps this is a mistake, but its a very limited case and won't be difficult +# to change. +# +# Also, for now these are not documented at all at the user-level, +# and are not expected to ever +# be at the "user-level", so formal documentation probably unnecessary. Lots +# of comments throughout this code will need to compensate. +################################################################################ +######################################## +# otu_table: +# # # * all values must be numeric (otu_table()-constructor should probably round values by default)) +# # # * all values must be >= 0 (no negative abundances) +######################################## +validotu_table <- function(object){ + # Both dimensions must have non-zero length. + if( any(dim(object)==0) ){ + return("\n OTU abundance data must have non-zero dimensions.") + } + # Verify that it is numeric matrix + if( !is.numeric(object@.Data[, 1]) ){ + text = "\n Non-numeric matrix provided as OTU table.\n" text = paste0(text, "Abundance is expected to be numeric.") - return(text) - } - return(TRUE) + return(text) + } + return(TRUE) } ## assign the function as the validity method for the otu_table class setValidity("otu_table", validotu_table) -######################################## sample_data: -validsample_data <- function(object) { - if (any(dim(object) == 0)) { - return("Sample Data must have non-zero dimensions.") - } - return(TRUE) +######################################## +######################################## +# sample_data: +######################################## +validsample_data <- function(object){ + if( any(dim(object)==0) ){ + return("Sample Data must have non-zero dimensions.") + } + return(TRUE) } ## assign the function as the validity method for the sample_data class setValidity("sample_data", validsample_data) -######################################## taxonomyTable: # # * all values must be a character # # * at least some -######################################## non-NULL (or equiv) values taxonomyTable validity function -validTaxonomyTable <- function(object) { - # Both dimensions must have non-zero length. - if (any(dim(object) == 0)) { - return("\n Taxonomy Table must have non-zero dimensions.") - } - # Verify that it is character matrix - if (!is.character(object@.Data[, 1])) { +######################################## +######################################## +# taxonomyTable: +######################################## +# # # * all values must be a character +# # # * at least some non-NULL (or equiv) values +# taxonomyTable validity function +######################################## +validTaxonomyTable <- function(object){ + # Both dimensions must have non-zero length. + if( any(dim(object)==0) ){ + return("\n Taxonomy Table must have non-zero dimensions.") + } + # Verify that it is character matrix + if( !is.character(object@.Data[, 1]) ){ text = "\n Non-character matrix provided as Taxonomy Table.\n" text = paste0(text, "Taxonomy is expected to be characters.") - return(text) - } - return(TRUE) + return(text) + } + return(TRUE) } ## assign the function as the validity method for the sample_data class setValidity("taxonomyTable", validTaxonomyTable) -######################################## tree: # (Any rules about trees appropriate in this context?) +######################################## +######################################## +# tree: +######################################## +# # (Any rules about trees appropriate in this context?) -######################################## phyloseq-class: Because data-index complete-matching is checked/enforced by the -######################################## phyloseq() constructor, it should not be checked here, or the constructor will -######################################## fail validity tests before it gets the chance to groom the objects. Instead, -######################################## the validity test can check if there is any intersection of the species names -######################################## and/or sample names, prior to any attempt by the constructor to prune (which -######################################## would end) in a mysterious index error, anyway -validphyloseq <- function(object) { - # There must be an otu_table - if (is.null(object@otu_table)) { - return("\n An otu_table is required for most analysis / graphics in the phyloseq-package") - } - # intersection of species-names must have non-zero length - if (length(intersect_taxa(object)) <= 0) { - return(paste("\n Component taxa/OTU names do not match.\n", " Taxa indices are critical to analysis.\n Try taxa_names()", - sep = "")) - } - # If there is sample data, check that sample-names overlap - if (!is.null(object@sam_data)) { - if (length(intersect(sample_names(object@sam_data), sample_names(object@otu_table))) <= - 0) { - return("\n Component sample names do not match.\n Try sample_names()") - } - } - return(TRUE) +######################################## +######################################## +# phyloseq-class: +######################################## +# Because data-index complete-matching is checked/enforced by the phyloseq() constructor, +# it should not be checked here, or the constructor will fail validity tests before +# it gets the chance to groom the objects. +# Instead, the validity test can check if there is any intersection of the species names +# and/or sample names, prior to any attempt by the constructor to prune (which would end) +# in a mysterious index error, anyway +######################################## +validphyloseq <- function(object){ + # There must be an otu_table + if( is.null(object@otu_table) ){ + return("\n An otu_table is required for most analysis / graphics in the phyloseq-package") + } + # intersection of species-names must have non-zero length + if( length(intersect_taxa(object)) <= 0 ){ + return(paste("\n Component taxa/OTU names do not match.\n", + " Taxa indices are critical to analysis.\n Try taxa_names()", sep="")) + } + # If there is sample data, check that sample-names overlap + if( !is.null(object@sam_data) ){ + if( length(intersect(sample_names(object@sam_data), sample_names(object@otu_table))) <= 0 ){ + return("\n Component sample names do not match.\n Try sample_names()") + } + } + return(TRUE) } ## assign the function as the validity method for the otu_table class setValidity("phyloseq", validphyloseq) -######################################## +########################################