Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Get arbitrary vars #26

Merged
merged 6 commits into from
Apr 24, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ export(differentialAbundance)
export(getCollection)
export(getComputeResult)
export(getComputeResultWithMetadata)
export(getVariables)
export(rankedAbundance)
export(selfCorrelation)
export(updateCollectionName)
exportMethods(getCollectionNames)
exportMethods(getCollectionVariableNames)
exportMethods(getMetadataVariableNames)
exportMethods(getMetadataVariableSummary)
exportMethods(getSampleMetadata)
Expand Down
115 changes: 115 additions & 0 deletions R/methods-MbioDataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -283,4 +283,119 @@ setMethod("getCollection", "MbioDataset", function(object, collectionName = char
}

return(abundanceData)
})

collectionVarNamesGeneric <- getGeneric("getCollectionVariableNames", "veupathUtils")
#' Get Microbiome Dataset Collection Variable Names
#'
#' Get the variable names in a collection in the Microbiome Dataset.
#'
#' @examples
#' variableNames <- getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus")
#' @param object A Microbiome Dataset
#' @param collectionName The name of the collection to return the variable names for
#' @return a character vector of the variable names in the requested collection
#' @export
#' @rdname getCollectionVariableNames
#' @aliases getCollectionVariableNames,MbioDataset,character-method
setMethod(collectionVarNamesGeneric, "MbioDataset", function(object, collectionName) {
return(veupathUtils::getCollectionVariableNames(getCollection(object, collectionName)))
})

#' Get Microbiome Dataset Variables
#'
#' Get the variables in the Microbiome Dataset by their names.
#' The requested variables could belong to any collection or
#' to the metadata. The returned data.table will contain the
#' requested variables as columns. If one of the requested
d-callan marked this conversation as resolved.
Show resolved Hide resolved
#' variables cannot be returned, a warning will be printed.
#'
#' @examples
#' getCollectionVariableNames(microbiomeData::DiabImmune, "16S (V4) Genus")
#' getMetadataVariableNames(microbiomeData::DiabImmune)
#' variablesDT <- getVariables(
#' microbiomeData::DiabImmune,
#' list("metadata" = c("age_months", "sex"),
d-callan marked this conversation as resolved.
Show resolved Hide resolved
#' "16S (V4) Genus" = "Bacteroides",
#' "WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)"
#' )
#' )
#' @param object A Microbiome Dataset
#' @param variables The names of the variables to return. this should be a named list
d-callan marked this conversation as resolved.
Show resolved Hide resolved
#' where the names are collection names and the values are variable names for that collection.
#' For the case of metadata variables, the name should be "metadata".
#' @return a data.table of the requested variables
#' @rdname getVariables
#' @export
setGeneric("getVariables", function(object, variables) standardGeneric("getVariables"), signature = "object")

#' @rdname getVariables
#' @aliases getVariables,MbioDataset,character-method
setMethod("getVariables", "MbioDataset", function(object, variables) {

if (!is.list(variables)) {
stop("variables argument must be a list")
}
if (is.null(names(variables))) {
stop("variables argument must be a named list")
}

## identify variables w identical names early
flattenedVars <- unlist(variables)
dups <- unname(flattenedVars[duplicated(flattenedVars)])
collectionsWithDups <- lapply(variables, function(x) {dups %in% x})
collectionsWithDupsIndexes <- unname(which(collectionsWithDups == TRUE))

fetchCollectionVariables <- function(collectionIndex) {
d-callan marked this conversation as resolved.
Show resolved Hide resolved
variableNames <- variables[[collectionIndex]]
collectionName <- names(variables)[collectionIndex]

if (collectionName == "metadata") {
return(getSampleMetadata(object, metadataVariables = variableNames))
}

if (!collectionName %in% getCollectionNames(object)) {
stop(sprintf("Collection '%s' does not exist", collectionName))
}

if (any(variableNames %in% getCollectionVariableNames(object, collectionName))) {
collection <- getCollection(object, collectionName)
presentVars <- variableNames[variableNames %in% getCollectionVariableNames(collection)]
if (veupathUtils::isOneToManyWithAncestor(collection)) {
warning("Unable to return the following variables: ", presentVars)
return(data.table::data.table())
}
dt <- veupathUtils::getCollectionData(collection, presentVars)
if (collectionIndex %in% collectionsWithDupsIndexes) {
## rename variables to prepend the collection name
names(dt)[names(dt) %in% presentVars] <- paste(collectionName, presentVars)
}
return(dt)
} else {
stop(sprintf("Collection '%s' does not contain the following variables: %s", collectionName, paste(variableNames, collapse = ", ")))
d-callan marked this conversation as resolved.
Show resolved Hide resolved
}
}

## this kind of assumes that metadata are always on ancestor entities of assays
## this will break for user data, when we get there
mergeCols <- getSampleMetadataIdColumns(object)
mergeCollectionVariables <- function(x, y) {
d-callan marked this conversation as resolved.
Show resolved Hide resolved
if (!length(x)) {
return(y)
} else if (!length(y)) {
return(x)
} else {
return(merge(x, y, by = mergeCols))
}
}

if (length(variables) == 0) {
return(data.table::data.table())
}

collectionVarDTs <- lapply(1:length(variables), fetchCollectionVariables)
names(collectionVarDTs) <- names(variables)
collectionVarDT <- purrr::reduce(collectionVarDTs, mergeCollectionVariables)

return(collectionVarDT)
})
23 changes: 23 additions & 0 deletions man/getCollectionVariableNames.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

40 changes: 40 additions & 0 deletions man/getVariables.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

68 changes: 68 additions & 0 deletions tests/testthat/test-MbioDataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,72 @@ test_that("we can update collection names and get collections", {

testCollection <- getCollection(testDataset, "My Collection", "phyloseq")
expect_s4_class(testCollection, "phyloseq")
})

test_that("we can get arbitrary variables", {
dataFile1 <- test_path('testdata','DiabImmune/DiabImmune_entity_16SRRNAV4Assay.txt')
metadataFile1 <- test_path('testdata','DiabImmune/DiabImmune_ParticipantRepeatedMeasure.txt')
dataFile2 <- test_path('testdata','DiabImmune/DiabImmune_MetagenomicSequencingAssay.txt')
metadataFile2 <- test_path('testdata','DiabImmune/DiabImmune_Participant.txt')
metadataFile3 <- test_path('testdata','DiabImmune/DiabImmune_Sample.txt')
ontologyFile <- test_path('testdata','DiabImmune/DiabImmune_OntologyMetadata.txt')
mbioDataset <- MbioDataset(list(dataFile1, dataFile2), list(metadataFile2, metadataFile1, metadataFile3), ontologyFile)

# try a sensible thing w vars on different 1:1 entities
variablesDT <- getVariables(
mbioDataset,
list("metadata" = c("age_months", "sex"),
"16S (V4) Genus" = "Bacteroides",
"WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)"
)
)
# expect a data.table w four columns
expect_s3_class(variablesDT, "data.table")
expect_equal(length(variablesDT), 9) # 4 vars + 5 ids
expect_equal(all(c("age_months", "sex", "Bacteroides", "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)") %in% names(variablesDT)), TRUE)
expect_equal(nrow(variablesDT) > 0, TRUE)

# try a var that doesnt exist
expect_error(
variablesDT <- getVariables(
mbioDataset,
list("metadata" = c("age_months", "sex"),
"16S (V4) Genus" = "Bacteroides",
"WGS Metagenome enzyme pathway abundance data" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)",
"WGS Genus" = "doesntexist"
)
)
)


# try a collection that doesnt exist
expect_error(
variablesDT <- getVariables(
mbioDataset,
list("metadata" = c("age_months", "sex"),
"16S (V4) Genus" = "Bacteroides",
"doesntexist" = "ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)"
)
)
)

# try the same named variable on two different collections
variablesDT <- getVariables(
mbioDataset,
list("metadata" = c("age_months", "sex"),
"16S (V4) Genus" = "Bacteroides",
"WGS Genus" = "Bacteroides"
)
)

expect_s3_class(variablesDT, "data.table")
expect_equal(length(variablesDT), 9) # 4 vars + 5 ids
expect_equal(all(c("age_months", "sex", "16S (V4) Genus Bacteroides", "WGS Genus Bacteroides") %in% names(variablesDT)), TRUE)
expect_equal(nrow(variablesDT) > 0, TRUE)

# pass something other than a named list
expect_error(variablesDT <- getVariables(mbioDataset, "16S (V4) Genus"))
expect_error(variablesDT <- getVariables(mbioDataset, list("16S (V4) Genus)")))

# find an ex where assays arent 1:1 w ancestors
})
Loading