Skip to content

Commit

Permalink
Merge pull request #26 from microbiomeDB/get-arbitrary-vars
Browse files Browse the repository at this point in the history
Get arbitrary vars
  • Loading branch information
d-callan authored Apr 24, 2024
2 parents 5ae0547 + adec94c commit 606e5dc
Show file tree
Hide file tree
Showing 5 changed files with 264 additions and 1 deletion.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ export(differentialAbundance)
export(getCollection)
export(getComputeResult)
export(getComputeResultWithMetadata)
export(getVariables)
export(rankedAbundance)
export(selfCorrelation)
export(updateCollectionName)
exportMethods(getCollectionNames)
exportMethods(getCollectionVariableNames)
exportMethods(getMetadataVariableNames)
exportMethods(getMetadataVariableSummary)
exportMethods(getSampleMetadata)
Expand Down Expand Up @@ -45,6 +47,7 @@ importFrom(veupathUtils,correlation)
importFrom(veupathUtils,findAncestorIdColumns)
importFrom(veupathUtils,findRecordIdColumn)
importFrom(veupathUtils,getCollectionNames)
importFrom(veupathUtils,getCollectionVariableNames)
importFrom(veupathUtils,getDataFromSource)
importFrom(veupathUtils,getIdColumns)
importFrom(veupathUtils,getMetadataVariableNames)
Expand Down
131 changes: 130 additions & 1 deletion R/methods-MbioDataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -283,4 +283,133 @@ setMethod("getCollection", "MbioDataset", function(object, collectionName = char
}

return(abundanceData)
})
})

collectionVarNamesGeneric <- getGeneric("getCollectionVariableNames", "veupathUtils")
#' Get Microbiome Dataset Collection Variable Names
#'
#' Get the variable names in a collection in the Microbiome Dataset.
#'
#' @examples
#' variableNames <- getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus")
#' @param object A Microbiome Dataset
#' @param collectionName The name of the collection to return the variable names for
#' @return a character vector of the variable names in the requested collection
#' @export
#' @importFrom veupathUtils getCollectionVariableNames
setMethod(collectionVarNamesGeneric, "MbioDataset", function(object, collectionName) {
return(veupathUtils::getCollectionVariableNames(getCollection(object, collectionName)))
})

#' Get Microbiome Dataset Variables
#'
#' Get the variables in the Microbiome Dataset by their names.
#' The requested variables could belong to any collection or
#' to the metadata. The returned data.table will contain the
#' requested variables as columns and any appropriate identifiers.
#' If one of the requested variables cannot be returned, a warning
#' will be printed.
#'
#' @examples
#' getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus")
#' getMetadataVariableNames(microbiomeData::DiabImmune)
#' variablesDT <- getVariables(
#' microbiomeData::DiabImmune,
#' list("metadata" = c("age_months", "sex"),
#' "16S (V4) Genus" = "Bacteroides",
#' "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)"
#' )
#' )
#' @param object A Microbiome Dataset
#' @param variables The names of the variables to return. This should be a named list
#' where the names are collection names and the values are variable names for that collection.
#' For the case of metadata variables, the name should be "metadata".
#' @return a data.table of the requested variables
#' @rdname getVariables
#' @export
setGeneric("getVariables", function(object, variables) standardGeneric("getVariables"), signature = "object")

#' @rdname getVariables
#' @aliases getVariables,MbioDataset,character-method
setMethod("getVariables", "MbioDataset", function(object, variables) {

if (!is.list(variables)) {
stop("variables argument must be a list")
}
if (is.null(names(variables))) {
stop("variables argument must be a named list")
}

## identify variables w identical names early
flattenedVars <- unlist(variables)
dups <- unname(flattenedVars[duplicated(flattenedVars)])
collectionsWithDups <- lapply(variables, function(x) {dups %in% x})
collectionsWithDupsIndexes <- unname(which(collectionsWithDups == TRUE))

fetchCollectionVariables <- function(collectionIndex) {
variableNames <- variables[[collectionIndex]]
collectionName <- names(variables)[collectionIndex]

if (collectionName == "metadata") {
return(getSampleMetadata(object, metadataVariables = variableNames))
}

if (!collectionName %in% getCollectionNames(object)) {
stop(sprintf("Collection '%s' does not exist", collectionName))
}

if (any(variableNames %in% getCollectionVariableNames(object, collectionName))) {
collection <- getCollection(object, collectionName)
presentVars <- variableNames[variableNames %in% getCollectionVariableNames(collection)]
if (veupathUtils::isOneToManyWithAncestor(collection)) {
warning("Unable to return the following variables: ", presentVars)
return(data.table::data.table())
}
dt <- veupathUtils::getCollectionData(collection, presentVars)
if (collectionIndex %in% collectionsWithDupsIndexes) {
## rename variables to prepend the collection name
names(dt)[names(dt) %in% presentVars] <- paste(collectionName, presentVars)
}
return(dt)
} else {
stop(sprintf("Collection '%s' does not contain the following variables: %s", collectionName, paste(variableNames, collapse = ", ")))
}
}

## this kind of assumes that metadata are always on ancestor entities of assays
## this will break for user data, when we get there
mergeCols <- getSampleMetadataIdColumns(object)

if (length(variables) == 0) {
return(data.table::data.table())
}

collectionVarDTs <- lapply(1:length(variables), fetchCollectionVariables)
names(collectionVarDTs) <- names(variables)
collectionVarDT <- purrr::reduce(collectionVarDTs, customMerge, mergeCols = mergeCols)

return(collectionVarDT)
})

## a helper that merges two collections of variables
## if either input is empty, returns the other
## use this w some caution. It is barely a general
## purpose function, and isnt really tested.
customMerge <- function(x, y, mergeCols = NULL) {
if (!inherits(x, "data.table")) {
stop("Argument 'x' must be a data.table")
} else if (!inherits(y, "data.table")) {
stop("Argument 'y' must be a data.table")
}

if (!length(x)) {
return(y)
} else if (!length(y)) {
return(x)
} else {
if (is.null(mergeCols)) {
return(merge(x, y))
}
return(merge(x, y, by = mergeCols))
}
}
22 changes: 22 additions & 0 deletions man/getCollectionVariableNames-MbioDataset-method.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 41 additions & 0 deletions man/getVariables.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

68 changes: 68 additions & 0 deletions tests/testthat/test-MbioDataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,72 @@ test_that("we can update collection names and get collections", {

testCollection <- getCollection(testDataset, "My Collection", "phyloseq")
expect_s4_class(testCollection, "phyloseq")
})

test_that("we can get arbitrary variables", {
dataFile1 <- test_path('testdata','DiabImmune/DiabImmune_entity_16SRRNAV4Assay.txt')
metadataFile1 <- test_path('testdata','DiabImmune/DiabImmune_ParticipantRepeatedMeasure.txt')
dataFile2 <- test_path('testdata','DiabImmune/DiabImmune_MetagenomicSequencingAssay.txt')
metadataFile2 <- test_path('testdata','DiabImmune/DiabImmune_Participant.txt')
metadataFile3 <- test_path('testdata','DiabImmune/DiabImmune_Sample.txt')
ontologyFile <- test_path('testdata','DiabImmune/DiabImmune_OntologyMetadata.txt')
mbioDataset <- MbioDataset(list(dataFile1, dataFile2), list(metadataFile2, metadataFile1, metadataFile3), ontologyFile)

# try a sensible thing w vars on different 1:1 entities
variablesDT <- getVariables(
mbioDataset,
list("metadata" = c("age_months", "sex"),
"16S (V4) Genus" = "Bacteroides",
"WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)"
)
)
# expect a data.table w four columns
expect_s3_class(variablesDT, "data.table")
expect_equal(length(variablesDT), 9) # 4 vars + 5 ids
expect_equal(all(c("age_months", "sex", "Bacteroides", "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)") %in% names(variablesDT)), TRUE)
expect_equal(nrow(variablesDT) > 0, TRUE)

# try a var that doesnt exist
expect_error(
variablesDT <- getVariables(
mbioDataset,
list("metadata" = c("age_months", "sex"),
"16S (V4) Genus" = "Bacteroides",
"WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)",
"WGS Genus" = "doesntexist"
)
)
)


# try a collection that doesnt exist
expect_error(
variablesDT <- getVariables(
mbioDataset,
list("metadata" = c("age_months", "sex"),
"16S (V4) Genus" = "Bacteroides",
"doesntexist" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)"
)
)
)

# try the same named variable on two different collections
variablesDT <- getVariables(
mbioDataset,
list("metadata" = c("age_months", "sex"),
"16S (V4) Genus" = "Bacteroides",
"WGS Genus" = "Bacteroides"
)
)

expect_s3_class(variablesDT, "data.table")
expect_equal(length(variablesDT), 9) # 4 vars + 5 ids
expect_equal(all(c("age_months", "sex", "16S (V4) Genus Bacteroides", "WGS Genus Bacteroides") %in% names(variablesDT)), TRUE)
expect_equal(nrow(variablesDT) > 0, TRUE)

# pass something other than a named list
expect_error(variablesDT <- getVariables(mbioDataset, "16S (V4) Genus"))
expect_error(variablesDT <- getVariables(mbioDataset, list("16S (V4) Genus)")))

# find an ex where assays arent 1:1 w ancestors
})

0 comments on commit 606e5dc

Please sign in to comment.