Skip to content

Commit

Permalink
Merge pull request #55 from ncats/pw_enrich_dev
Browse files Browse the repository at this point in the history
Merge pw_enrich_dev branch into main (pkg version now 2.3.1)
  • Loading branch information
johnbraisted authored Jun 14, 2023
2 parents e24ce93 + 5335926 commit 7197301
Show file tree
Hide file tree
Showing 12 changed files with 541 additions and 291 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: RaMP
Title: RaMP (Relational Database of Metabolomic Pathways)
Type: Package
Version: 2.3.0
Version: 2.3.1
License: GPL-2
Depends: R (>= 3.6.0)
Authors@R: c(
Expand Down
14 changes: 9 additions & 5 deletions R/ReturnAnalytes_InputPathways.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#' @export
getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", max_pathway_size = Inf, names_or_ids="names") {
now <- proc.time()
print("fired")
print("fired!")
if(is.character(pathway)){
if(grepl("\n",pathway)[1]){
list_pathway <- strsplit(pathway,"\n")
Expand Down Expand Up @@ -69,7 +69,9 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m
df <- RMariaDB::dbGetQuery(con,sql)
RMariaDB::dbDisconnect(con)
} else if(match == 'fuzzy') {
df = data.frame(matrix(nrow=0, ncol=6))
df = data.frame(matrix(nrow=0, ncol=7))
colnames(df) <- c('analyteName', 'sourceAnalyteIDs', 'geneOrCompound',
'pathwayName', 'pathwayId', 'pathwayCategory', 'pathwayType')
sql = paste0("select
group_concat(distinct s.commonName order by s.commonName asc separator '; ') as analyteName,
group_concat(distinct s.sourceId order by s.sourceId asc separator '; ') as sourceAnalyteIDs,
Expand All @@ -87,9 +89,11 @@ getAnalyteFromPathway <- function(pathway, match="exact", analyte_type="both", m

con <- connectToRaMP()
for(p in pathway) {
currSQL = gsub(pattern = '[SOME_PW_NAME]', replacement = p, x= sql, fixed = T )
subdf <- RMariaDB::dbGetQuery(con,currSQL)
df <- rbind(df, subdf)
if(nchar(p)>2) {
currSQL = gsub(pattern = '[SOME_PW_NAME]', replacement = p, x= sql, fixed = T )
subdf <- RMariaDB::dbGetQuery(con,currSQL)
df <- rbind(df, subdf)
}
}
RMariaDB::dbDisconnect(con)
}
Expand Down
19 changes: 17 additions & 2 deletions R/ReturnPathwaysEnrich_InputAnalytes.R
Original file line number Diff line number Diff line change
Expand Up @@ -956,7 +956,7 @@ getCustomPathwayFromAnalyte <- function(analytes, pathways, analyte_type) {
#' fisher.results <- runCombinedFisherTest(pathwaydf = pathwaydf)
#' clustered.fisher.results <- findCluster(fisher.results)
#' }
#' @export
#' @exportcluster_list

findCluster <- function(fishers_df, perc_analyte_overlap = 0.5,
min_pathway_tocluster = 2, perc_pathway_overlap = 0.5) {
Expand Down Expand Up @@ -1078,7 +1078,22 @@ findCluster <- function(fishers_df, perc_analyte_overlap = 0.5,
clusters_to_merge <- unique(t(apply(clusters_to_merge, 1, sort)))

for (i in 1:nrow(clusters_to_merge)) {
if (!is.na(cluster_list[[clusters_to_merge[i, 1]]]) && !is.na(cluster_list[[clusters_to_merge[i, 2]]])) {
# print(" ")
# if(length(!is.na(cluster_list[[clusters_to_merge[i, 1]]])) > 1) {
# print(paste0("cluster list 1 is greater than 1 ", length(!is.na(cluster_list[[clusters_to_merge[i, 1]]]))))
# print(cluster_list[[clusters_to_merge[i, 1]]])
# } else {
# print("cluster list 1 length is 1")
# }
#
# if(length(!is.na(cluster_list[[clusters_to_merge[i, 1]]])) > 1) {
# print(paste0("cluster list 2 is greater than 1 ", length(!is.na(cluster_list[[clusters_to_merge[i, 2]]]))))
# print(cluster_list[[clusters_to_merge[i, 2]]])
# } else {
# print("cluster list 2 length is 1")
# }

if (all(!is.na(cluster_list[[clusters_to_merge[i, 1]]])) && all(!is.na(cluster_list[[clusters_to_merge[i, 2]]]))) {
cluster_list[[clusters_to_merge[i, 1]]] <- unique(unlist(cluster_list[c(clusters_to_merge[i, 1], clusters_to_merge[i, 2])]))
cluster_list[[clusters_to_merge[i, 2]]] <- NA
}
Expand Down
77 changes: 52 additions & 25 deletions R/rampChemClassQueries.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

# queries to retrieve and analyze chemical classes

#' Returns chemical class information comparing a metabolite subset to a larger metabolite population.
Expand All @@ -11,7 +12,10 @@
#' or 'file' in which case the background parameter will be a file path, or 'biospecimen' where the specified background parameter is
#' a RaMP HMDB metabolite ontology term (see background parameter, above, for the most common biospecimen background values).
#' @param includeRaMPids include internal RaMP identifiers (default is "FALSE")
#' @return Returns chemcial class information data including class count tallies and comparisons between metabolites of interest and the metabolite population,
#' @param inferIdMapping if FALSE, the survey only reports on class annotations made directly on the input ids.
#' If inferIdMapping is set to TRUE, the ids are cross-referenced or mapped to related ids that contain metabolite class annotations.
#' The default is TRUE.
#' @return Returns chemical class information data including class count tallies and comparisons between metabolites of interest and the metabolite population,
#' metabolite mappings to classes, and query summary report indicating the number of input metabolites that were resolved and listing those metabolite ids
#' that are not found in the database.
#'
Expand Down Expand Up @@ -66,9 +70,8 @@
#' metClassResult$query_report
#'}
#' @export
chemicalClassSurvey <- function(mets, background = "database", background_type="database", includeRaMPids = FALSE){
chemicalClassSurvey <- function(mets, background = "database", background_type="database", includeRaMPids = FALSE, inferIdMapping = TRUE){


conn <- connectToRaMP()
print("Starting Chemical Class Survey")

Expand Down Expand Up @@ -109,8 +112,16 @@ chemicalClassSurvey <- function(mets, background = "database", background_type="

print(paste0("Biospecimen background specified: ", background))

query <- paste0("select s.rampId, s.sourceId from source s, analytehasontology ao, ontology o
where o.commonName in ('", background, "') and o.rampOntologyId=ao.rampOntologyId and s.rampId = ao.rampCompoundId")
# if we id map, then we go through and extend the soruce ids via ramp ids
# else, we just pick up hmdb ids from the ao table.
if(inferIdMapping) {
query <- paste0("select distinct s.rampId, s.sourceId from source s, analytehasontology ao, ontology o
where o.commonName in ('", background, "') and o.rampOntologyId=ao.rampOntologyId and s.rampId = ao.rampCompoundId")
} else {
# how do we keep the direct mapping on the source HMDB ids? We don't capture that.
query <- paste0("select distinct s.rampId, s.sourceId from source s, analytehasontology ao, ontology o
where o.commonName in ('", background, "') and o.rampOntologyId=ao.rampOntologyId and s.rampId = ao.rampCompoundId")
}
con <- connectToRaMP()
bg <- RMariaDB::dbGetQuery(con, query)
RMariaDB::dbDisconnect(con)
Expand All @@ -134,7 +145,8 @@ chemicalClassSurvey <- function(mets, background = "database", background_type="
}

# try to make a set of source ids assoicated with unique rampids
bg <- bg[!duplicated(bg$rampId),]
# drop this, as it limits the source ids such that there is only one random soruce id per ramp id
# bg <- bg[!duplicated(bg$rampId),]

# use unique list of source ids for the biospecimen type
# note that rampId counts will be used downstream for statistics, reducing redundancy
Expand All @@ -149,11 +161,14 @@ chemicalClassSurvey <- function(mets, background = "database", background_type="
stop("background_type was not specified correctly. Please specify one of the following options: 'database', 'file', 'list', or 'biospecimen'.")
}

# note that for enrichment analysis the inferIdMapping for the class survey is set to FALSE
# This means that only ids that have direct id-to-class annotations will contribute to results.
if(background_type == "database"){
res <- chemicalClassSurveyRampIdsFullPopConn(mets, conn)
res <- chemicalClassSurveyRampIdsFullPopConn(mets=mets, conn=conn, inferIdMapping=inferIdMapping)
} else {
res <- chemicalClassSurveyRampIdsConn(mets, bkgrnd, conn)
res <- chemicalClassSurveyRampIdsConn(mets=mets, pop=bkgrnd, conn=conn, inferIdMapping=inferIdMapping)
}

RMariaDB::dbDisconnect(conn)

print("Finished Chemical Class Survey")
Expand Down Expand Up @@ -192,6 +207,10 @@ chemicalClassSurvey <- function(mets, background = "database", background_type="
#' @param background_type one of 'database' (all analytes in the RaMP Database), 'list' (a list of input ids),
#' or 'file' in which case the background parameter will be a file path, or 'biospecimen' where the specified background parameter is
#' a RaMP HMDB metabolite ontology term (see background parameter, above. for the most common biospecimen background values).
#' @param inferIdMapping if FALSE, the method only reports on class annotations made directly on the input ids.
#' If inferIdMapping is set to TRUE, the input ids are cross-referenced or mapped to other existing ids that contain metabolite class annotations.
#' Following id cross references can expand coverage if the input type is other than HMDB ids or LIPIDMAPS ids.
#' The default value is FALSE.
#' @return a list of dataframes, each holding chemical classs enrichment statistics for specific chemical classification systems,
#' such as HMDB Classyfire class categories and LIPIDMAPS class categories. The results list chemical classes, metabolite hits counts,
#' Fisher Exact p-values and Benjamini-Hochberg corrected p-values (FDR estimates)
Expand Down Expand Up @@ -220,13 +239,16 @@ chemicalClassSurvey <- function(mets, background = "database", background_type="
#' enrichedClassStats <- chemicalClassEnrichment(mets = metList)
#'}
#' @export
chemicalClassEnrichment <- function(mets, background = "database", background_type = "database") {
chemicalClassEnrichment <- function(mets, background = "database", background_type = "database", inferIdMapping=F) {
print("Starting Chemical Class Enrichment")

# note that inferIdMapping is set to FALSE
# enrichment will only be reported for input ids that have
# a direct association with the chemical class.
classData <- chemicalClassSurvey(mets = mets,
background = background,
background_type = background_type,
includeRaMPids = TRUE)
includeRaMPids = TRUE, inferIdMapping = inferIdMapping)

# Quit if empty survey result
if(is.null(classData) || length(classData) < 1) {
Expand All @@ -236,29 +258,35 @@ chemicalClassEnrichment <- function(mets, background = "database", background_ty

enrichmentStat <- list()

totalCountInfo <- getTotalFoundInCategories(classData)

breakPoint <- TRUE
# helper method to summarize information on classes based on chemical class survey
totalCountInfo <- getTotalFoundInCategories(classData, inferIdMapping = inferIdMapping)

for (category in names(classData$count_summary)) {

categoryData <- classData$count_summary[[category]]

# only run if we have data for that class category
# ANNNND we have that category represented in $mets
if(nrow(categoryData) > 0 && category %in% names(totalCountInfo$mets)) {
if(nrow(categoryData) > 0 && category %in% totalCountInfo$mets$Var1) {
resultRow <- 1

category <- category
totMetCnt <- totalCountInfo$mets[[category]]
totPopCnt <- as.integer(totalCountInfo$pop[[category]])
metsInfo <- totalCountInfo$mets
popInfo <- totalCountInfo$pop

# some summary methods returned Integer64, cast as integer
# these values are the counts for the annotation category for the metabolite set and the population
totMetCnt <- as.integer(metsInfo[metsInfo$Var1 == category,2])
totPopCnt <- as.integer(popInfo[popInfo$Var1 == category,2])

contingencyMat <- matrix(nrow=2, ncol=2)
resultMat <- data.frame(matrix(ncol=7))
colnames(resultMat) <- c("category", "class_name", "met_hits", "pop_hits",
"met_size", "pop_size", "p-value")

for (i in 1:nrow(categoryData)) {
if(categoryData[i,'mets_count'] >= 1) {
contingencyMat[1,1] <- categoryData[i,'mets_count']

contingencyMat[1,1] <- as.integer(categoryData[i,'mets_count'])
contingencyMat[1,2] <- totMetCnt - contingencyMat[1,1]
contingencyMat[2,1] <- as.integer(categoryData[i,'pop_count']) - contingencyMat[1,1]
contingencyMat[2,2] <- totPopCnt - contingencyMat[2,1] - contingencyMat[1,2]
Expand All @@ -275,19 +303,18 @@ chemicalClassEnrichment <- function(mets, background = "database", background_ty
resultRow <- resultRow + 1
}
}
resultMat

resultMat <- bhCorrect(resultMat)
enrichmentStat[[as.character(category)]] <- resultMat
} else {
# just append an empty data frame place holder for that category type.
resultMat <- data.frame(matrix(ncol=7, nrow=0))
colnames(resultMat) <- c("category", "class_name", "met_hits", "pop_hits",
"met_size", "pop_size", "p-value")
enrichmentStat[[as.character(category)]] <- resultMat

}
}
enrichmentStat[['result_type']] <- 'chemical_class_enrichment'
print("Finished Chemical Class Enrichment")
return(enrichmentStat)
}





Loading

0 comments on commit 7197301

Please sign in to comment.