diff --git a/NAMESPACE b/NAMESPACE index e9eeebc..e837294 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,10 +11,12 @@ export(differentialAbundance) export(getCollection) export(getComputeResult) export(getComputeResultWithMetadata) +export(getVariables) export(rankedAbundance) export(selfCorrelation) export(updateCollectionName) exportMethods(getCollectionNames) +exportMethods(getCollectionVariableNames) exportMethods(getMetadataVariableNames) exportMethods(getMetadataVariableSummary) exportMethods(getSampleMetadata) @@ -45,6 +47,7 @@ importFrom(veupathUtils,correlation) importFrom(veupathUtils,findAncestorIdColumns) importFrom(veupathUtils,findRecordIdColumn) importFrom(veupathUtils,getCollectionNames) +importFrom(veupathUtils,getCollectionVariableNames) importFrom(veupathUtils,getDataFromSource) importFrom(veupathUtils,getIdColumns) importFrom(veupathUtils,getMetadataVariableNames) diff --git a/R/methods-MbioDataset.R b/R/methods-MbioDataset.R index 6ec6611..c0e5200 100644 --- a/R/methods-MbioDataset.R +++ b/R/methods-MbioDataset.R @@ -283,4 +283,133 @@ setMethod("getCollection", "MbioDataset", function(object, collectionName = char } return(abundanceData) -}) \ No newline at end of file +}) + +collectionVarNamesGeneric <- getGeneric("getCollectionVariableNames", "veupathUtils") +#' Get Microbiome Dataset Collection Variable Names +#' +#' Get the variable names in a collection in the Microbiome Dataset. +#' +#' @examples +#' variableNames <- getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") +#' @param object A Microbiome Dataset +#' @param collectionName The name of the collection to return the variable names for +#' @return a character vector of the variable names in the requested collection +#' @export +#' @importFrom veupathUtils getCollectionVariableNames +setMethod(collectionVarNamesGeneric, "MbioDataset", function(object, collectionName) { + return(veupathUtils::getCollectionVariableNames(getCollection(object, collectionName))) +}) + +#' Get Microbiome Dataset Variables +#' +#' Get the variables in the Microbiome Dataset by their names. +#' The requested variables could belong to any collection or +#' to the metadata. The returned data.table will contain the +#' requested variables as columns and any appropriate identifiers. +#' If one of the requested variables cannot be returned, a warning +#' will be printed. +#' +#' @examples +#' getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") +#' getMetadataVariableNames(microbiomeData::DiabImmune) +#' variablesDT <- getVariables( +#' microbiomeData::DiabImmune, +#' list("metadata" = c("age_months", "sex"), +#' "16S (V4) Genus" = "Bacteroides", +#' "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)" +#' ) +#' ) +#' @param object A Microbiome Dataset +#' @param variables The names of the variables to return. This should be a named list +#' where the names are collection names and the values are variable names for that collection. +#' For the case of metadata variables, the name should be "metadata". +#' @return a data.table of the requested variables +#' @rdname getVariables +#' @export +setGeneric("getVariables", function(object, variables) standardGeneric("getVariables"), signature = "object") + +#' @rdname getVariables +#' @aliases getVariables,MbioDataset,character-method +setMethod("getVariables", "MbioDataset", function(object, variables) { + + if (!is.list(variables)) { + stop("variables argument must be a list") + } + if (is.null(names(variables))) { + stop("variables argument must be a named list") + } + + ## identify variables w identical names early + flattenedVars <- unlist(variables) + dups <- unname(flattenedVars[duplicated(flattenedVars)]) + collectionsWithDups <- lapply(variables, function(x) {dups %in% x}) + collectionsWithDupsIndexes <- unname(which(collectionsWithDups == TRUE)) + + fetchCollectionVariables <- function(collectionIndex) { + variableNames <- variables[[collectionIndex]] + collectionName <- names(variables)[collectionIndex] + + if (collectionName == "metadata") { + return(getSampleMetadata(object, metadataVariables = variableNames)) + } + + if (!collectionName %in% getCollectionNames(object)) { + stop(sprintf("Collection '%s' does not exist", collectionName)) + } + + if (any(variableNames %in% getCollectionVariableNames(object, collectionName))) { + collection <- getCollection(object, collectionName) + presentVars <- variableNames[variableNames %in% getCollectionVariableNames(collection)] + if (veupathUtils::isOneToManyWithAncestor(collection)) { + warning("Unable to return the following variables: ", presentVars) + return(data.table::data.table()) + } + dt <- veupathUtils::getCollectionData(collection, presentVars) + if (collectionIndex %in% collectionsWithDupsIndexes) { + ## rename variables to prepend the collection name + names(dt)[names(dt) %in% presentVars] <- paste(collectionName, presentVars) + } + return(dt) + } else { + stop(sprintf("Collection '%s' does not contain the following variables: %s", collectionName, paste(variableNames, collapse = ", "))) + } + } + + ## this kind of assumes that metadata are always on ancestor entities of assays + ## this will break for user data, when we get there + mergeCols <- getSampleMetadataIdColumns(object) + + if (length(variables) == 0) { + return(data.table::data.table()) + } + + collectionVarDTs <- lapply(1:length(variables), fetchCollectionVariables) + names(collectionVarDTs) <- names(variables) + collectionVarDT <- purrr::reduce(collectionVarDTs, customMerge, mergeCols = mergeCols) + + return(collectionVarDT) +}) + +## a helper that merges two collections of variables +## if either input is empty, returns the other +## use this w some caution. It is barely a general +## purpose function, and isnt really tested. +customMerge <- function(x, y, mergeCols = NULL) { + if (!inherits(x, "data.table")) { + stop("Argument 'x' must be a data.table") + } else if (!inherits(y, "data.table")) { + stop("Argument 'y' must be a data.table") + } + + if (!length(x)) { + return(y) + } else if (!length(y)) { + return(x) + } else { + if (is.null(mergeCols)) { + return(merge(x, y)) + } + return(merge(x, y, by = mergeCols)) + } +} \ No newline at end of file diff --git a/man/getCollectionVariableNames-MbioDataset-method.Rd b/man/getCollectionVariableNames-MbioDataset-method.Rd new file mode 100644 index 0000000..3a7c6e8 --- /dev/null +++ b/man/getCollectionVariableNames-MbioDataset-method.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-MbioDataset.R +\name{getCollectionVariableNames,MbioDataset-method} +\alias{getCollectionVariableNames,MbioDataset-method} +\title{Get Microbiome Dataset Collection Variable Names} +\usage{ +\S4method{getCollectionVariableNames}{MbioDataset}(object, collectionName) +} +\arguments{ +\item{object}{A Microbiome Dataset} + +\item{collectionName}{The name of the collection to return the variable names for} +} +\value{ +a character vector of the variable names in the requested collection +} +\description{ +Get the variable names in a collection in the Microbiome Dataset. +} +\examples{ +variableNames <- getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") +} diff --git a/man/getVariables.Rd b/man/getVariables.Rd new file mode 100644 index 0000000..d95693e --- /dev/null +++ b/man/getVariables.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-MbioDataset.R +\name{getVariables} +\alias{getVariables} +\alias{getVariables,MbioDataset-method} +\alias{getVariables,MbioDataset,character-method} +\title{Get Microbiome Dataset Variables} +\usage{ +getVariables(object, variables) + +\S4method{getVariables}{MbioDataset}(object, variables) +} +\arguments{ +\item{object}{A Microbiome Dataset} + +\item{variables}{The names of the variables to return. This should be a named list +where the names are collection names and the values are variable names for that collection. +For the case of metadata variables, the name should be "metadata".} +} +\value{ +a data.table of the requested variables +} +\description{ +Get the variables in the Microbiome Dataset by their names. +The requested variables could belong to any collection or +to the metadata. The returned data.table will contain the +requested variables as columns and any appropriate identifiers. +If one of the requested variables cannot be returned, a warning +will be printed. +} +\examples{ +getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") +getMetadataVariableNames(microbiomeData::DiabImmune) +variablesDT <- getVariables( + microbiomeData::DiabImmune, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)" + ) +) +} diff --git a/tests/testthat/test-MbioDataset.R b/tests/testthat/test-MbioDataset.R index c5d0479..d20448c 100644 --- a/tests/testthat/test-MbioDataset.R +++ b/tests/testthat/test-MbioDataset.R @@ -77,4 +77,72 @@ test_that("we can update collection names and get collections", { testCollection <- getCollection(testDataset, "My Collection", "phyloseq") expect_s4_class(testCollection, "phyloseq") +}) + +test_that("we can get arbitrary variables", { + dataFile1 <- test_path('testdata','DiabImmune/DiabImmune_entity_16SRRNAV4Assay.txt') + metadataFile1 <- test_path('testdata','DiabImmune/DiabImmune_ParticipantRepeatedMeasure.txt') + dataFile2 <- test_path('testdata','DiabImmune/DiabImmune_MetagenomicSequencingAssay.txt') + metadataFile2 <- test_path('testdata','DiabImmune/DiabImmune_Participant.txt') + metadataFile3 <- test_path('testdata','DiabImmune/DiabImmune_Sample.txt') + ontologyFile <- test_path('testdata','DiabImmune/DiabImmune_OntologyMetadata.txt') + mbioDataset <- MbioDataset(list(dataFile1, dataFile2), list(metadataFile2, metadataFile1, metadataFile3), ontologyFile) + + # try a sensible thing w vars on different 1:1 entities + variablesDT <- getVariables( + mbioDataset, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)" + ) + ) + # expect a data.table w four columns + expect_s3_class(variablesDT, "data.table") + expect_equal(length(variablesDT), 9) # 4 vars + 5 ids + expect_equal(all(c("age_months", "sex", "Bacteroides", "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)") %in% names(variablesDT)), TRUE) + expect_equal(nrow(variablesDT) > 0, TRUE) + + # try a var that doesnt exist + expect_error( + variablesDT <- getVariables( + mbioDataset, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)", + "WGS Genus" = "doesntexist" + ) + ) + ) + + + # try a collection that doesnt exist + expect_error( + variablesDT <- getVariables( + mbioDataset, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "doesntexist" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)" + ) + ) + ) + + # try the same named variable on two different collections + variablesDT <- getVariables( + mbioDataset, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "WGS Genus" = "Bacteroides" + ) + ) + + expect_s3_class(variablesDT, "data.table") + expect_equal(length(variablesDT), 9) # 4 vars + 5 ids + expect_equal(all(c("age_months", "sex", "16S (V4) Genus Bacteroides", "WGS Genus Bacteroides") %in% names(variablesDT)), TRUE) + expect_equal(nrow(variablesDT) > 0, TRUE) + + # pass something other than a named list + expect_error(variablesDT <- getVariables(mbioDataset, "16S (V4) Genus")) + expect_error(variablesDT <- getVariables(mbioDataset, list("16S (V4) Genus)"))) + + # find an ex where assays arent 1:1 w ancestors }) \ No newline at end of file