From b94495132ff2e0c77cd982832ce74fa6b9ce427f Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Sun, 21 Apr 2024 22:47:38 -0400 Subject: [PATCH 1/6] draft getVariables method --- R/methods-MbioDataset.R | 115 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/R/methods-MbioDataset.R b/R/methods-MbioDataset.R index 6ec6611..b8a6b7a 100644 --- a/R/methods-MbioDataset.R +++ b/R/methods-MbioDataset.R @@ -283,4 +283,119 @@ setMethod("getCollection", "MbioDataset", function(object, collectionName = char } return(abundanceData) +}) + +collectionVarNamesGeneric <- getGeneric("getCollectionVariableNames", "veupathUtils") +#' Get Microbiome Dataset Collection Variable Names +#' +#' Get the variable names in a collection in the Microbiome Dataset. +#' +#' @examples +#' variableNames <- getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") +#' @param object A Microbiome Dataset +#' @param collectionName The name of the collection to return the variable names for +#' @return a character vector of the variable names in the requested collection +#' @export +#' @rdname getCollectionVariableNames +#' @aliases getCollectionVariableNames,MbioDataset,character-method +setMethod(collectionVarNamesGeneric, "MbioDataset", function(object, collectionName) { + return(veupathUtils::getCollectionVariableNames(getCollection(object, collectionName))) +}) + +#' Get Microbiome Dataset Variables +#' +#' Get the variables in the Microbiome Dataset by their names. +#' The requested variables could belong to any collection or +#' to the metadata. The returned data.table will contain the +#' requested variables as columns. If one of the requested +#' variables cannot be returned, a warning will be printed. +#' +#' @examples +#' getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") +#' getMetadataVariableNames(microbiomeData::DiabImmune) +#' variablesDT <- getVariables( +#' microbiomeData::DiabImmune, +#' list("metadata" = c("age_months", "sex"), +#' "16S (V4) Genus" = "Bacteroides", +#' "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)" +#' ) +#' ) +#' @param object A Microbiome Dataset +#' @param variables The names of the variables to return. this should be a named list +#' where the names are collection names and the values are variable names for that collection. +#' For the case of metadata variables, the name should be "metadata". +#' @return a data.table of the requested variables +#' @rdname getVariables +#' @export +setGeneric("getVariables", function(object, variables) standardGeneric("getVariables"), signature = "object") + +#' @rdname getVariables +#' @aliases getVariables,MbioDataset,character-method +setMethod("getVariables", "MbioDataset", function(object, variables) { + + if (!is.list(variables)) { + stop("variables argument must be a list") + } + if (is.null(names(variables))) { + stop("variables argument must be a named list") + } + + ## identify variables w identical names early + flattenedVars <- unlist(variables) + dups <- unname(flattenedVars[duplicated(flattenedVars)]) + collectionsWithDups <- lapply(variables, function(x) {dups %in% x}) + collectionsWithDupsIndexes <- unname(which(collectionsWithDups == TRUE)) + + fetchCollectionVariables <- function(collectionIndex) { + variableNames <- variables[[collectionIndex]] + collectionName <- names(variables)[collectionIndex] + + if (collectionName == "metadata") { + return(getSampleMetadata(object, metadataVariables = variableNames)) + } + + if (!collectionName %in% getCollectionNames(object)) { + stop(sprintf("Collection '%s' does not exist", collectionName)) + } + + if (any(variableNames %in% getCollectionVariableNames(object, collectionName))) { + collection <- getCollection(object, collectionName) + presentVars <- variableNames[variableNames %in% getCollectionVariableNames(collection)] + if (veupathUtils::isOneToManyWithAncestor(collection)) { + warning("Unable to return the following variables: ", presentVars) + return(data.table::data.table()) + } + dt <- veupathUtils::getCollectionData(collection, presentVars) + if (collectionIndex %in% collectionsWithDupsIndexes) { + ## rename variables to prepend the collection name + names(dt)[names(dt) %in% presentVars] <- paste(collectionName, presentVars) + } + return(dt) + } else { + stop(sprintf("Collection '%s' does not contain the following variables: %s", collectionName, paste(variableNames, collapse = ", "))) + } + } + + ## this kind of assumes that metadata are always on ancestor entities of assays + ## this will break for user data, when we get there + mergeCols <- getSampleMetadataIdColumns(object) + mergeCollectionVariables <- function(x, y) { + if (!length(x)) { + return(y) + } else if (!length(y)) { + return(x) + } else { + return(merge(x, y, by = mergeCols)) + } + } + + if (length(variables) == 0) { + return(data.table::data.table()) + } + + collectionVarDTs <- lapply(1:length(variables), fetchCollectionVariables) + names(collectionVarDTs) <- names(variables) + collectionVarDT <- purrr::reduce(collectionVarDTs, mergeCollectionVariables) + + return(collectionVarDT) }) \ No newline at end of file From 30324e229fb3541828eb5850ede128b2ae02deaf Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Sun, 21 Apr 2024 22:47:58 -0400 Subject: [PATCH 2/6] add tests for getVariables --- tests/testthat/test-MbioDataset.R | 68 +++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/testthat/test-MbioDataset.R b/tests/testthat/test-MbioDataset.R index c5d0479..d20448c 100644 --- a/tests/testthat/test-MbioDataset.R +++ b/tests/testthat/test-MbioDataset.R @@ -77,4 +77,72 @@ test_that("we can update collection names and get collections", { testCollection <- getCollection(testDataset, "My Collection", "phyloseq") expect_s4_class(testCollection, "phyloseq") +}) + +test_that("we can get arbitrary variables", { + dataFile1 <- test_path('testdata','DiabImmune/DiabImmune_entity_16SRRNAV4Assay.txt') + metadataFile1 <- test_path('testdata','DiabImmune/DiabImmune_ParticipantRepeatedMeasure.txt') + dataFile2 <- test_path('testdata','DiabImmune/DiabImmune_MetagenomicSequencingAssay.txt') + metadataFile2 <- test_path('testdata','DiabImmune/DiabImmune_Participant.txt') + metadataFile3 <- test_path('testdata','DiabImmune/DiabImmune_Sample.txt') + ontologyFile <- test_path('testdata','DiabImmune/DiabImmune_OntologyMetadata.txt') + mbioDataset <- MbioDataset(list(dataFile1, dataFile2), list(metadataFile2, metadataFile1, metadataFile3), ontologyFile) + + # try a sensible thing w vars on different 1:1 entities + variablesDT <- getVariables( + mbioDataset, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)" + ) + ) + # expect a data.table w four columns + expect_s3_class(variablesDT, "data.table") + expect_equal(length(variablesDT), 9) # 4 vars + 5 ids + expect_equal(all(c("age_months", "sex", "Bacteroides", "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)") %in% names(variablesDT)), TRUE) + expect_equal(nrow(variablesDT) > 0, TRUE) + + # try a var that doesnt exist + expect_error( + variablesDT <- getVariables( + mbioDataset, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)", + "WGS Genus" = "doesntexist" + ) + ) + ) + + + # try a collection that doesnt exist + expect_error( + variablesDT <- getVariables( + mbioDataset, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "doesntexist" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)" + ) + ) + ) + + # try the same named variable on two different collections + variablesDT <- getVariables( + mbioDataset, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "WGS Genus" = "Bacteroides" + ) + ) + + expect_s3_class(variablesDT, "data.table") + expect_equal(length(variablesDT), 9) # 4 vars + 5 ids + expect_equal(all(c("age_months", "sex", "16S (V4) Genus Bacteroides", "WGS Genus Bacteroides") %in% names(variablesDT)), TRUE) + expect_equal(nrow(variablesDT) > 0, TRUE) + + # pass something other than a named list + expect_error(variablesDT <- getVariables(mbioDataset, "16S (V4) Genus")) + expect_error(variablesDT <- getVariables(mbioDataset, list("16S (V4) Genus)"))) + + # find an ex where assays arent 1:1 w ancestors }) \ No newline at end of file From 1269f5951ee87ec8949abc33d7915ca1a8047166 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Sun, 21 Apr 2024 22:48:11 -0400 Subject: [PATCH 3/6] update generated docs --- NAMESPACE | 2 ++ man/getCollectionVariableNames.Rd | 23 ++++++++++++++++++ man/getVariables.Rd | 40 +++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 man/getCollectionVariableNames.Rd create mode 100644 man/getVariables.Rd diff --git a/NAMESPACE b/NAMESPACE index e9eeebc..8ba8c94 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,10 +11,12 @@ export(differentialAbundance) export(getCollection) export(getComputeResult) export(getComputeResultWithMetadata) +export(getVariables) export(rankedAbundance) export(selfCorrelation) export(updateCollectionName) exportMethods(getCollectionNames) +exportMethods(getCollectionVariableNames) exportMethods(getMetadataVariableNames) exportMethods(getMetadataVariableSummary) exportMethods(getSampleMetadata) diff --git a/man/getCollectionVariableNames.Rd b/man/getCollectionVariableNames.Rd new file mode 100644 index 0000000..f315c49 --- /dev/null +++ b/man/getCollectionVariableNames.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-MbioDataset.R +\name{getCollectionVariableNames,MbioDataset-method} +\alias{getCollectionVariableNames,MbioDataset-method} +\alias{getCollectionVariableNames,MbioDataset,character-method} +\title{Get Microbiome Dataset Collection Variable Names} +\usage{ +\S4method{getCollectionVariableNames}{MbioDataset}(object, collectionName) +} +\arguments{ +\item{object}{A Microbiome Dataset} + +\item{collectionName}{The name of the collection to return the variable names for} +} +\value{ +a character vector of the variable names in the requested collection +} +\description{ +Get the variable names in a collection in the Microbiome Dataset. +} +\examples{ +variableNames <- getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") +} diff --git a/man/getVariables.Rd b/man/getVariables.Rd new file mode 100644 index 0000000..a6ee65f --- /dev/null +++ b/man/getVariables.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/methods-MbioDataset.R +\name{getVariables} +\alias{getVariables} +\alias{getVariables,MbioDataset-method} +\alias{getVariables,MbioDataset,character-method} +\title{Get Microbiome Dataset Variables} +\usage{ +getVariables(object, variables) + +\S4method{getVariables}{MbioDataset}(object, variables) +} +\arguments{ +\item{object}{A Microbiome Dataset} + +\item{variables}{The names of the variables to return. this should be a named list +where the names are collection names and the values are variable names for that collection. +For the case of metadata variables, the name should be "metadata".} +} +\value{ +a data.table of the requested variables +} +\description{ +Get the variables in the Microbiome Dataset by their names. +The requested variables could belong to any collection or +to the metadata. The returned data.table will contain the +requested variables as columns. If one of the requested +variables cannot be returned, a warning will be printed. +} +\examples{ +getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") +getMetadataVariableNames(microbiomeData::DiabImmune) +variablesDT <- getVariables( + microbiomeData::DiabImmune, + list("metadata" = c("age_months", "sex"), + "16S (V4) Genus" = "Bacteroides", + "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)" + ) +) +} From d32c243a1c12a3be006f24ddb514d5e65ae7c1a9 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 24 Apr 2024 12:07:46 -0400 Subject: [PATCH 4/6] review feedback --- R/methods-MbioDataset.R | 43 +++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/R/methods-MbioDataset.R b/R/methods-MbioDataset.R index b8a6b7a..82bdc05 100644 --- a/R/methods-MbioDataset.R +++ b/R/methods-MbioDataset.R @@ -307,8 +307,9 @@ setMethod(collectionVarNamesGeneric, "MbioDataset", function(object, collectionN #' Get the variables in the Microbiome Dataset by their names. #' The requested variables could belong to any collection or #' to the metadata. The returned data.table will contain the -#' requested variables as columns. If one of the requested -#' variables cannot be returned, a warning will be printed. +#' requested variables as columns and any appropriate identifiers. +#' If one of the requested variables cannot be returned, a warning +#' will be printed. #' #' @examples #' getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") @@ -321,7 +322,7 @@ setMethod(collectionVarNamesGeneric, "MbioDataset", function(object, collectionN #' ) #' ) #' @param object A Microbiome Dataset -#' @param variables The names of the variables to return. this should be a named list +#' @param variables The names of the variables to return. This should be a named list #' where the names are collection names and the values are variable names for that collection. #' For the case of metadata variables, the name should be "metadata". #' @return a data.table of the requested variables @@ -379,15 +380,6 @@ setMethod("getVariables", "MbioDataset", function(object, variables) { ## this kind of assumes that metadata are always on ancestor entities of assays ## this will break for user data, when we get there mergeCols <- getSampleMetadataIdColumns(object) - mergeCollectionVariables <- function(x, y) { - if (!length(x)) { - return(y) - } else if (!length(y)) { - return(x) - } else { - return(merge(x, y, by = mergeCols)) - } - } if (length(variables) == 0) { return(data.table::data.table()) @@ -395,7 +387,30 @@ setMethod("getVariables", "MbioDataset", function(object, variables) { collectionVarDTs <- lapply(1:length(variables), fetchCollectionVariables) names(collectionVarDTs) <- names(variables) - collectionVarDT <- purrr::reduce(collectionVarDTs, mergeCollectionVariables) + collectionVarDT <- purrr::reduce(collectionVarDTs, customMerge, mergeCols = mergeCols) return(collectionVarDT) -}) \ No newline at end of file +}) + +## a helper that merges two collections of variables +## if either input is empty, returns the other +## use this w some caution. It is barely a general +## purpose function, and isnt really tested. +customMerge <- function(x, y, mergeCols = NULL) { + if (!inherits(x, "data.table")) { + stop("Argument 'x' must be a data.table") + } else if (!inherits(y, "data.table")) { + stop("Argument 'y' must be a data.table") + } + + if (!length(x)) { + return(y) + } else if (!length(y)) { + return(x) + } else { + if (is.null(mergeCols)) { + return(merge(x, y)) + } + return(merge(x, y, by = mergeCols)) + } +} \ No newline at end of file From b2934d646e8214907e5a785b3b9c5bcdafdf389e Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 24 Apr 2024 12:11:55 -0400 Subject: [PATCH 5/6] update generated docs --- R/methods-MbioDataset.R | 2 -- ...Rd => getCollectionVariableNames-MbioDataset-method.Rd} | 1 - man/getVariables.Rd | 7 ++++--- 3 files changed, 4 insertions(+), 6 deletions(-) rename man/{getCollectionVariableNames.Rd => getCollectionVariableNames-MbioDataset-method.Rd} (92%) diff --git a/R/methods-MbioDataset.R b/R/methods-MbioDataset.R index 82bdc05..b814025 100644 --- a/R/methods-MbioDataset.R +++ b/R/methods-MbioDataset.R @@ -296,8 +296,6 @@ collectionVarNamesGeneric <- getGeneric("getCollectionVariableNames", "veupathUt #' @param collectionName The name of the collection to return the variable names for #' @return a character vector of the variable names in the requested collection #' @export -#' @rdname getCollectionVariableNames -#' @aliases getCollectionVariableNames,MbioDataset,character-method setMethod(collectionVarNamesGeneric, "MbioDataset", function(object, collectionName) { return(veupathUtils::getCollectionVariableNames(getCollection(object, collectionName))) }) diff --git a/man/getCollectionVariableNames.Rd b/man/getCollectionVariableNames-MbioDataset-method.Rd similarity index 92% rename from man/getCollectionVariableNames.Rd rename to man/getCollectionVariableNames-MbioDataset-method.Rd index f315c49..3a7c6e8 100644 --- a/man/getCollectionVariableNames.Rd +++ b/man/getCollectionVariableNames-MbioDataset-method.Rd @@ -2,7 +2,6 @@ % Please edit documentation in R/methods-MbioDataset.R \name{getCollectionVariableNames,MbioDataset-method} \alias{getCollectionVariableNames,MbioDataset-method} -\alias{getCollectionVariableNames,MbioDataset,character-method} \title{Get Microbiome Dataset Collection Variable Names} \usage{ \S4method{getCollectionVariableNames}{MbioDataset}(object, collectionName) diff --git a/man/getVariables.Rd b/man/getVariables.Rd index a6ee65f..d95693e 100644 --- a/man/getVariables.Rd +++ b/man/getVariables.Rd @@ -13,7 +13,7 @@ getVariables(object, variables) \arguments{ \item{object}{A Microbiome Dataset} -\item{variables}{The names of the variables to return. this should be a named list +\item{variables}{The names of the variables to return. This should be a named list where the names are collection names and the values are variable names for that collection. For the case of metadata variables, the name should be "metadata".} } @@ -24,8 +24,9 @@ a data.table of the requested variables Get the variables in the Microbiome Dataset by their names. The requested variables could belong to any collection or to the metadata. The returned data.table will contain the -requested variables as columns. If one of the requested -variables cannot be returned, a warning will be printed. +requested variables as columns and any appropriate identifiers. +If one of the requested variables cannot be returned, a warning +will be printed. } \examples{ getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus") From adec94c4281479798bc48a96a7b9106c59a1cd47 Mon Sep 17 00:00:00 2001 From: Danielle Callan Date: Wed, 24 Apr 2024 12:13:43 -0400 Subject: [PATCH 6/6] explicitly namespace a veupathutils fxn --- NAMESPACE | 1 + R/methods-MbioDataset.R | 1 + 2 files changed, 2 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 8ba8c94..e837294 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -47,6 +47,7 @@ importFrom(veupathUtils,correlation) importFrom(veupathUtils,findAncestorIdColumns) importFrom(veupathUtils,findRecordIdColumn) importFrom(veupathUtils,getCollectionNames) +importFrom(veupathUtils,getCollectionVariableNames) importFrom(veupathUtils,getDataFromSource) importFrom(veupathUtils,getIdColumns) importFrom(veupathUtils,getMetadataVariableNames) diff --git a/R/methods-MbioDataset.R b/R/methods-MbioDataset.R index b814025..c0e5200 100644 --- a/R/methods-MbioDataset.R +++ b/R/methods-MbioDataset.R @@ -296,6 +296,7 @@ collectionVarNamesGeneric <- getGeneric("getCollectionVariableNames", "veupathUt #' @param collectionName The name of the collection to return the variable names for #' @return a character vector of the variable names in the requested collection #' @export +#' @importFrom veupathUtils getCollectionVariableNames setMethod(collectionVarNamesGeneric, "MbioDataset", function(object, collectionName) { return(veupathUtils::getCollectionVariableNames(getCollection(object, collectionName))) })