Merge pull request #36 from microbiomeDB/treeSE

importing user data
microbiomeDB · Jun 5, 2024 · 0e8b7e8 · 0e8b7e8
2 parents 2e8071c + 8d5a4c4
commit 0e8b7e8
Show file tree

Hide file tree

Showing 15 changed files with 902 additions and 7 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -16,8 +16,8 @@ Imports:
     Maaslin2,
     methods,
     microbiomeComputations,
-    phyloseq,
-    purrr
+    purrr,
+    TreeSummarizedExperiment
 Remotes:
     microbiomeDB/veupathUtils,
     microbiomeDB/corGraph,
@@ -36,6 +36,10 @@ Suggests:
     microbiomeData,
     knitr,
     rmarkdown,
-    tidyverse
+    tidyverse,
+    mia,
+    dada2,
+    biomformat,
+    phyloseq
 Config/testthat/edition: 3
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,6 +12,14 @@ export(getCollection)
 export(getComputeResult)
 export(getComputeResultWithMetadata)
 export(getVariables)
+export(importBIOM)
+export(importDADA2)
+export(importHUMAnN)
+export(importMetaPhlAn)
+export(importMothur)
+export(importPhyloseq)
+export(importQIIME2)
+export(importTreeSummarizedExperiment)
 export(rankedAbundance)
 export(selfCorrelation)
 export(updateCollectionName)
@@ -23,9 +31,18 @@ exportMethods(getSampleMetadata)
 import(data.table)
 importFrom(DESeq2,DESeqDataSetFromMatrix)
 importFrom(Maaslin2,Maaslin2)
+importFrom(SummarizedExperiment,rowData)
 importFrom(corGraph,bipartiteNetwork)
 importFrom(corGraph,unipartiteNetwork)
+importFrom(data.table,data.table)
 importFrom(igraph,graph_from_data_frame)
+importFrom(mia,importHUMAnN)
+importFrom(mia,importMetaPhlAn)
+importFrom(mia,importMothur)
+importFrom(mia,importQIIME2)
+importFrom(mia,makeTreeSEFromBiom)
+importFrom(mia,makeTreeSEFromDADA2)
+importFrom(mia,makeTreeSEFromPhyloseq)
 importFrom(microbiomeComputations,AbsoluteAbundanceData)
 importFrom(microbiomeComputations,AbundanceData)
 importFrom(microbiomeComputations,Comparator)

diff --git a/R/importers-MbioDataset.R b/R/importers-MbioDataset.R
@@ -0,0 +1,332 @@
+
+# Build one veupathUtils Collection from a single assay of a (Tree)SummarizedExperiment.
+#
+# Assay values are summed across all features (assay rows) that share a value in the
+# chosen rowData column, then transposed so that each row of the result is one assay
+# column (a record) and each column is one distinct value of the rowData column.
+#
+# collectionName: named list with elements `assayDataName` and `rowDataColumnName`;
+#   both are required.
+# rowData: data.frame of row data; joined to `assayData` on row names.
+# assayData: the assay values, features in rows and records in columns.
+# normalizationMethod: "TSS" divides each record's values by that record's total;
+#   "none" keeps the summed values.
+# verbose: resolved with matchArg but not otherwise used here yet.
+# Returns a veupathUtils Collection named "<assayDataName>: <rowDataColumnName>",
+#   with " (<normalizationMethod> normalized)" appended unless the method is "none".
+buildCollectionFromTreeSE <- function(
+    collectionName = list(assayDataName = NULL, rowDataColumnName = NULL), 
+    rowData, # this is a data.frame representing the row data/ tree
+    assayData,
+    normalizationMethod = c("TSS", "none"),
+    verbose = c(TRUE, FALSE)
+) {
+    # resolve the choice vectors to scalars; without this the default
+    # c("TSS", "none") would reach the `if` conditions below as a length-2 vector
+    normalizationMethod <- veupathUtils::matchArg(normalizationMethod)
+    verbose <- veupathUtils::matchArg(verbose)
+
+    assayDataName <- collectionName$assayDataName
+    rowDataColumnName <- collectionName$rowDataColumnName
+
+    if (is.null(assayDataName) || is.null(rowDataColumnName)) {
+        stop("Must specify both assayDataName and rowDataColumnName as named elements of the collectionName list argument")
+    }
+
+    # join the assay values to the one rowData column of interest by row names (by = 0);
+    # merge() adds a Row.names column for the join key, which is not needed afterwards
+    assayDT <- as.data.frame.matrix(assayData, col.names = colnames(assayData), row.names = row.names(assayData))
+    dt <- data.table::as.data.table(merge(assayDT, rowData[rowDataColumnName], by = 0))
+    dt$Row.names <- NULL
+
+    # every remaining column except the grouping column is one record
+    recordIDs <- names(dt)[names(dt) != rowDataColumnName]
+    # sum features sharing a rowData value, then flip so records are rows
+    dt <- dt[, lapply(.SD, sum, na.rm=TRUE), by=rowDataColumnName]
+    dt <- data.table::transpose(dt, make.names=rowDataColumnName)
+
+    # if this does grow into other methods, the normalization step could be factored out probably
+    if (normalizationMethod == "TSS") {
+        # total-sum scaling; a record whose total is 0 yields NaN values
+        dt <- dt / rowSums(dt)
+    }
+
+    # transpose() kept the record order, so the ids can be re-attached positionally
+    dt$recordIDs <- recordIDs
+
+    recordIdColumn <- 'recordIDs'
+    ancestorIdColumns <- character(0)
+    collectionName <- paste0(assayDataName, ": ", rowDataColumnName)
+    if (normalizationMethod != "none") {
+        collectionName <- paste0(collectionName, " (", normalizationMethod, " normalized)")
+    }
+
+    collection <- veupathUtils::Collection(
+        data = dt,
+        recordIdColumn = recordIdColumn,
+        ancestorIdColumns = ancestorIdColumns,
+        name = collectionName
+    )
+
+    return(collection)
+}
+
+#' Import TreeSummarizedExperiment
+#' 
+#' Import data from TreeSummarizedExperiment to MbioDataset.
+#' There is some loss of granularity in this process. It results
+#' in a simpler and more performant object which is compliant
+#' with the MicrobiomeDB infrastructure.
+#' 
+#' One collection is built for every combination of assay and rowData column.
+#' The colData becomes the sample metadata.
+#' 
+#' @param data A TreeSummarizedExperiment (any object extending SummarizedExperiment is accepted)
+#' @param normalizationMethod Normalization method to use on the assay data. Options are "none" and "TSS".
+#' Applying TSS normalization to absolute taxonomic abundances produces relative taxonomic abundances. Default is "TSS".
+#' @param keepRawValues Keep the raw assay values as well as the normalized values.
+#' Must be TRUE when normalizationMethod is "none".
+#' @param verbose Print messages
+#' @return A MbioDataset
+#' @importFrom purrr reduce
+#' @importFrom data.table data.table
+#' @importFrom SummarizedExperiment rowData
+#' @rdname importTreeSummarizedExperiment
+#' @export
+importTreeSummarizedExperiment <- function(data, normalizationMethod = c("TSS", "none"), keepRawValues = c(TRUE, FALSE), verbose = c(TRUE, FALSE)) {
+
+    normalizationMethod <- veupathUtils::matchArg(normalizationMethod)
+    keepRawValues <- veupathUtils::matchArg(keepRawValues)
+    verbose <- veupathUtils::matchArg(verbose)
+
+    if (!inherits(data, "SummarizedExperiment")) {
+        stop("data must be or extend a SummarizedExperiment")
+    }
+
+    if (keepRawValues == FALSE && normalizationMethod == "none") {
+        stop("keepRawValues must be TRUE when normalizationMethod is 'none'")
+    }
+
+    # figure out what all assays we have, and what data is available per assay
+    # these will become collections in the MbioDataset
+    # TODO it looks like rowData is expected to be same across all assays,
+    # which is odd to me, but probably means we can simplify this logic
+    assayNames <- SummarizedExperiment::assayNames(data)
+    rowDataColumns <- colnames(SummarizedExperiment::rowData(data))
+    # loop-invariant, so convert once rather than once per collection
+    rowDataDF <- as.data.frame(SummarizedExperiment::rowData(data))
+
+    # one (assay, rowData column) pair per collection, ordered assay by assay;
+    # both vectors are empty when there are no assays or no rowData columns
+    assayDataNames <- rep(assayNames, each = length(rowDataColumns))
+    rowDataColumnNames <- rep(rowDataColumns, times = length(assayNames))
+
+    # build every collection with the given normalization method and combine them
+    buildCollections <- function(method) {
+        collectionsByAssayList <- lapply(seq_along(assayDataNames), function(i) {
+            buildCollectionFromTreeSE(
+                collectionName = list(
+                    assayDataName = assayDataNames[[i]],
+                    rowDataColumnName = rowDataColumnNames[[i]]
+                ),
+                rowData = rowDataDF,
+                # the accessor applies the experiment's dimnames to the assay, which
+                # the row-name join in buildCollectionFromTreeSE relies on
+                assayData = SummarizedExperiment::assay(data, assayDataNames[[i]]),
+                normalizationMethod = method,
+                verbose = verbose
+            )
+        })
+        purrr::reduce(collectionsByAssayList, c)
+    }
+
+    if (length(assayDataNames) > 0) {
+        rawCollectionsList <- list()
+        if (keepRawValues) {
+            rawCollectionsList <- buildCollections("none")
+        }
+
+        normalizedCollectionsList <- list()
+        if (normalizationMethod != "none") {
+            normalizedCollectionsList <- buildCollections(normalizationMethod)
+        }
+
+        collectionsList <- c(rawCollectionsList, normalizedCollectionsList)
+    } else {
+        collectionsList <- veupathUtils::Collections()
+    }
+
+
+    # build and validate MbioDataset, colData becomes sampleMetadata
+    colData <- SummarizedExperiment::colData(data)
+    metadataDT <- data.table::data.table()
+    if (length(colData) > 0) {
+        metadataDT <- data.table::as.data.table(colData)
+        metadataDT$recordIDs <- rownames(colData)
+    }
+    # a table holding nothing but the id column carries no metadata
+    if (length(metadataDT) == 1) metadataDT <- data.table::data.table()
+
+    mbioDataset <- MbioDataset(
+        collections = collectionsList, 
+        metadata = SampleMetadata(data = metadataDT, recordIdColumn = "recordIDs")
+    )
+
+    # return a MbioDataset
+    return(mbioDataset)
+}
+
+#' @rdname importTreeSummarizedExperiment
+importTreeSE <- importTreeSummarizedExperiment
+
+## lean on miaverse to import biom, phyloseq, csv, etc
+## TODO: do these also need args about relative abundances? I'd think so.
+
+#' Import HUMAnN data
+#' 
+#' Import data from HUMAnN results to MbioDataset. There is
+#' some loss of granularity in this process. It results
+#' in a simpler and more performant object which is compliant
+#' with the MicrobiomeDB infrastructure. See \code{mia::importHUMAnN}
+#' for documentation.
+#' 
+#' @param normalizationMethod Normalization method to use on the assay data. Options are "none" and "TSS".
+#' Applying TSS normalization to absolute taxonomic abundances produces relative taxonomic abundances. Default is "TSS".
+#' @param keepRawValues Keep the raw assay values as well as the normalized values.
+#' @param verbose Print messages
+#' @param ... Arguments to pass to mia::importHUMAnN. Pass them by name, since unnamed
+#' arguments are matched to normalizationMethod, keepRawValues and verbose first.
+#' @return A MbioDataset
+#' @export
+importHUMAnN <- function(normalizationMethod = c("TSS", "none"), keepRawValues = c(TRUE, FALSE), verbose = c(TRUE, FALSE), ...) {
+    # mia is only a suggested package, so it is checked for here and called
+    # through `mia::` rather than being imported into the namespace
+    .require_package("mia")
+
+    treeSE <- mia::importHUMAnN(...)
+
+    # the option arguments are validated by importTreeSummarizedExperiment
+    mbioDataset <- importTreeSummarizedExperiment(treeSE, normalizationMethod = normalizationMethod, keepRawValues = keepRawValues, verbose = verbose)
+    return(mbioDataset)
+}
+
+#' Import MetaPhlAn data
+#' 
+#' Import data from MetaPhlAn results to MbioDataset. There is
+#' some loss of granularity in this process. It results
+#' in a simpler and more performant object which is compliant
+#' with the MicrobiomeDB infrastructure. See \code{mia::importMetaPhlAn}
+#' for documentation.
+#' 
+#' @param normalizationMethod Normalization method to use on the assay data. Options are "none" and "TSS".
+#' Applying TSS normalization to absolute taxonomic abundances produces relative taxonomic abundances. Default is "TSS".
+#' @param keepRawValues Keep the raw assay values as well as the normalized values.
+#' @param verbose Print messages
+#' @param ... Arguments to pass to mia::importMetaPhlAn. Pass them by name, since unnamed
+#' arguments are matched to normalizationMethod, keepRawValues and verbose first.
+#' @return A MbioDataset
+#' @export
+importMetaPhlAn <- function(normalizationMethod = c("TSS", "none"), keepRawValues = c(TRUE, FALSE), verbose = c(TRUE, FALSE), ...) {
+    # mia is only a suggested package, so it is checked for here and called
+    # through `mia::` rather than being imported into the namespace
+    .require_package("mia")
+
+    treeSE <- mia::importMetaPhlAn(...)
+
+    # the option arguments are validated by importTreeSummarizedExperiment
+    mbioDataset <- importTreeSummarizedExperiment(treeSE, normalizationMethod = normalizationMethod, keepRawValues = keepRawValues, verbose = verbose)
+    return(mbioDataset)
+}
+
+#' Import MOTHUR data
+#' 
+#' Import data from MOTHUR results to MbioDataset. There is
+#' some loss of granularity in this process. It results
+#' in a simpler and more performant object which is compliant
+#' with the MicrobiomeDB infrastructure. See \code{mia::importMothur}
+#' for documentation.
+#' 
+#' @param normalizationMethod Normalization method to use on the assay data. Options are "none" and "TSS".
+#' Applying TSS normalization to absolute taxonomic abundances produces relative taxonomic abundances. Default is "TSS".
+#' @param keepRawValues Keep the raw assay values as well as the normalized values.
+#' @param verbose Print messages
+#' @param ... Arguments to pass to mia::importMothur. Pass them by name, since unnamed
+#' arguments are matched to normalizationMethod, keepRawValues and verbose first.
+#' @return A MbioDataset
+#' @export
+importMothur <- function(normalizationMethod = c("TSS", "none"), keepRawValues = c(TRUE, FALSE), verbose = c(TRUE, FALSE), ...) {
+    # mia is only a suggested package, so it is checked for here and called
+    # through `mia::` rather than being imported into the namespace
+    .require_package("mia")
+
+    treeSE <- mia::importMothur(...)
+
+    # the option arguments are validated by importTreeSummarizedExperiment
+    mbioDataset <- importTreeSummarizedExperiment(treeSE, normalizationMethod = normalizationMethod, keepRawValues = keepRawValues, verbose = verbose)
+    return(mbioDataset)
+}
+
+#' Import QIIME2 data
+#' 
+#' Import data from QIIME2 results to MbioDataset. There is
+#' some loss of granularity in this process. It results
+#' in a simpler and more performant object which is compliant
+#' with the MicrobiomeDB infrastructure. See \code{mia::importQIIME2}
+#' for documentation.
+#' 
+#' @param normalizationMethod Normalization method to use on the assay data. Options are "none" and "TSS".
+#' Applying TSS normalization to absolute taxonomic abundances produces relative taxonomic abundances. Default is "TSS".
+#' @param keepRawValues Keep the raw assay values as well as the normalized values.
+#' @param verbose Print messages
+#' @param ... Arguments to pass to mia::importQIIME2. Pass them by name, since unnamed
+#' arguments are matched to normalizationMethod, keepRawValues and verbose first.
+#' @return A MbioDataset
+#' @export
+importQIIME2 <- function(normalizationMethod = c("TSS", "none"), keepRawValues = c(TRUE, FALSE), verbose = c(TRUE, FALSE), ...) {
+    # mia is only a suggested package, so it is checked for here and called
+    # through `mia::` rather than being imported into the namespace
+    .require_package("mia")
+
+    treeSE <- mia::importQIIME2(...)
+
+    # the option arguments are validated by importTreeSummarizedExperiment
+    mbioDataset <- importTreeSummarizedExperiment(treeSE, normalizationMethod = normalizationMethod, keepRawValues = keepRawValues, verbose = verbose)
+    return(mbioDataset)
+}
+
+#' Import BIOM data
+#' 
+#' Import data from BIOM results to MbioDataset. There is
+#' some loss of granularity in this process. It results
+#' in a simpler and more performant object which is compliant
+#' with the MicrobiomeDB infrastructure. The BIOM file is read with
+#' \code{biomformat::read_biom} and converted with \code{mia::makeTreeSEFromBiom};
+#' see those functions for documentation.
+#' 
+#' @param normalizationMethod Normalization method to use on the assay data. Options are "none" and "TSS".
+#' Applying TSS normalization to absolute taxonomic abundances produces relative taxonomic abundances. Default is "TSS".
+#' @param keepRawValues Keep the raw assay values as well as the normalized values.
+#' @param verbose Print messages
+#' @param ... Arguments to pass to biomformat::read_biom. Pass them by name, since unnamed
+#' arguments are matched to normalizationMethod, keepRawValues and verbose first.
+#' @return A MbioDataset
+#' @export
+importBIOM <- function(normalizationMethod = c("TSS", "none"), keepRawValues = c(TRUE, FALSE), verbose = c(TRUE, FALSE), ...) {
+    # mia and biomformat are only suggested packages, so they are checked for here
+    # and called through `::` rather than being imported into the namespace
+    .require_package("mia")
+    .require_package("biomformat")
+
+    # `...` goes to the reader; the conversion step runs with its default options
+    biom <- biomformat::read_biom(...)
+    treeSE <- mia::makeTreeSEFromBiom(obj=biom)
+
+    # the option arguments are validated by importTreeSummarizedExperiment
+    mbioDataset <- importTreeSummarizedExperiment(treeSE, normalizationMethod = normalizationMethod, keepRawValues = keepRawValues, verbose = verbose)
+    return(mbioDataset)
+}
+
+#' Import DADA2 data
+#' 
+#' Import data from DADA2 results to MbioDataset. There is
+#' some loss of granularity in this process. It results
+#' in a simpler and more performant object which is compliant
+#' with the MicrobiomeDB infrastructure. See \code{mia::makeTreeSEFromDADA2}
+#' for documentation.
+#' 
+#' @param normalizationMethod Normalization method to use on the assay data. Options are "none" and "TSS".
+#' Applying TSS normalization to absolute taxonomic abundances produces relative taxonomic abundances. Default is "TSS".
+#' @param keepRawValues Keep the raw assay values as well as the normalized values.
+#' @param verbose Print messages
+#' @param ... Arguments to pass to mia::makeTreeSEFromDADA2. Pass them by name, since unnamed
+#' arguments are matched to normalizationMethod, keepRawValues and verbose first.
+#' @return A MbioDataset
+#' @export
+importDADA2 <- function(normalizationMethod = c("TSS", "none"), keepRawValues = c(TRUE, FALSE), verbose = c(TRUE, FALSE), ...) {
+    # mia and dada2 are only suggested packages, so they are checked for here
+    # and mia is called through `mia::` rather than imported into the namespace
+    .require_package("mia")
+    .require_package("dada2")
+
+    treeSE <- mia::makeTreeSEFromDADA2(...)
+
+    # the option arguments are validated by importTreeSummarizedExperiment
+    mbioDataset <- importTreeSummarizedExperiment(treeSE, normalizationMethod = normalizationMethod, keepRawValues = keepRawValues, verbose = verbose)
+    return(mbioDataset)
+}
+
+#' Import Phyloseq data
+#' 
+#' Import data from Phyloseq results to MbioDataset. There is
+#' some loss of granularity in this process. It results
+#' in a simpler and more performant object which is compliant
+#' with the MicrobiomeDB infrastructure. See \code{mia::makeTreeSEFromPhyloseq}
+#' for documentation.
+#' 
+#' @param normalizationMethod Normalization method to use on the assay data. Options are "none" and "TSS".
+#' Applying TSS normalization to absolute taxonomic abundances produces relative taxonomic abundances. Default is "TSS".
+#' @param keepRawValues Keep the raw assay values as well as the normalized values.
+#' @param verbose Print messages
+#' @param ... Arguments to pass to mia::makeTreeSEFromPhyloseq. Pass them by name, since unnamed
+#' arguments are matched to normalizationMethod, keepRawValues and verbose first.
+#' @return A MbioDataset
+#' @export
+importPhyloseq <- function(normalizationMethod = c("TSS", "none"), keepRawValues = c(TRUE, FALSE), verbose = c(TRUE, FALSE), ...) {
+    # mia and phyloseq are only suggested packages, so they are checked for here
+    # and mia is called through `mia::` rather than imported into the namespace
+    .require_package("mia")
+    .require_package("phyloseq")
+
+    treeSE <- mia::makeTreeSEFromPhyloseq(...)
+
+    # the option arguments are validated by importTreeSummarizedExperiment
+    mbioDataset <- importTreeSummarizedExperiment(treeSE, normalizationMethod = normalizationMethod, keepRawValues = keepRawValues, verbose = verbose)
+    return(mbioDataset)
+}
diff --git a/R/methods-MbioDataset.R b/R/methods-MbioDataset.R
@@ -259,6 +259,7 @@ setMethod("getCollection", "MbioDataset", function(object, collectionName = char
             )
         }
     } else if (format == "phyloseq") {
+        .require_package("phyloseq")
 
         sampleNames <- collectionDT[[collection@recordIdColumn]]
         keepCols <- names(collectionDT)[! names(collectionDT) %in% collectionIdColumns]